/rgbdslam/external/siftgpu/src/SiftGPU/ProgramCU.cu - Annotate - Quadrotor - Robotics Club

9240aaa3

Alex

////////////////////////////////////////////////////////////////////////////

2

//        File:                ProgramCU.cu

3

//        Author:                Changchang Wu

4

//        Description : implementation of ProgramCU and all CUDA kernels

5

//

6

//        Copyright (c) 2007 University of North Carolina at Chapel Hill

7

//        All Rights Reserved

8

//

9

//        Permission to use, copy, modify and distribute this software and its

10

//        documentation for educational, research and non-profit purposes, without

11

//        fee, and without a written agreement is hereby granted, provided that the

12

//        above copyright notice and the following paragraph appear in all copies.

13

//

14

//        The University of North Carolina at Chapel Hill make no representations

15

//        about the suitability of this software for any purpose. It is provided

16

//        'as is' without express or implied warranty.

17

//

18

//        Please send BUG REPORTS to ccwu@cs.unc.edu

19

//

20

////////////////////////////////////////////////////////////////////////////

21

22

#if defined(CUDA_SIFTGPU_ENABLED)

23

24

#include "GL/glew.h"

25

#include "stdio.h"

26

27

#include "CuTexImage.h"

28

#include "ProgramCU.h"

29

#include "GlobalUtil.h"

30

31

//----------------------------------------------------------------

32

//Begin SiftGPU setting section.

33

//////////////////////////////////////////////////////////

34

#define IMUL(X,Y) __mul24(X,Y)

35

//#define FDIV(X,Y) ((X)/(Y))

36

#define FDIV(X,Y) __fdividef(X,Y)

37

38

/////////////////////////////////////////////////////////

39

//filter kernel width range (don't change this)

40

#define KERNEL_MAX_WIDTH 33

41

#define KERNEL_MIN_WIDTH 5

42

43

//////////////////////////////////////////////////////////

44

//horizontal filter block size (32, 64, 128, 256, 512)

45

#define FILTERH_TILE_WIDTH 128

46

//thread block for vertical filter. FILTERV_BLOCK_WIDTH can be (4, 8 or 16)

47

#define FILTERV_BLOCK_WIDTH 16

48

#define FILTERV_BLOCK_HEIGHT 32

49

//The corresponding image patch for a thread block

50

#define FILTERV_PIXEL_PER_THREAD 4

51

#define FILTERV_TILE_WIDTH FILTERV_BLOCK_WIDTH

52

#define FILTERV_TILE_HEIGHT (FILTERV_PIXEL_PER_THREAD * FILTERV_BLOCK_HEIGHT)

53

54

55

//////////////////////////////////////////////////////////

56

//thread block size for computing Difference of Gaussian

57

#define DOG_BLOCK_LOG_DIMX 7

58

#define DOG_BLOCK_LOG_DIMY 0

59

#define DOG_BLOCK_DIMX (1 << DOG_BLOCK_LOG_DIMX)

60

#define DOG_BLOCK_DIMY (1 << DOG_BLOCK_LOG_DIMY)

61

62

//////////////////////////////////////////////////////////

63

//thread block size for keypoint detection

64

#define KEY_BLOCK_LOG_DIMX 3

65

#define KEY_BLOCK_LOG_DIMY 3

66

#define KEY_BLOCK_DIMX (1<<KEY_BLOCK_LOG_DIMX)

67

#define KEY_BLOCK_DIMY (1<<KEY_BLOCK_LOG_DIMY)

68

//#define KEY_OFFSET_ONE

69

//make KEY_BLOCK_LOG_DIMX 4 will make the write coalesced..

70

//but it seems uncoalesced writes don't affect the speed

71

72

//////////////////////////////////////////////////////////

73

//thread block size for initializing list generation (64, 128, 256, 512 ...)

74

#define HIST_INIT_WIDTH 128

75

//thread block size for generating feature list (32, 64, 128, 256, 512, ...)

76

#define LISTGEN_BLOCK_DIM 128

77

78

79

/////////////////////////////////////////////////////////

80

//how many keypoint orientations to compute in a block

81

#define ORIENTATION_COMPUTE_PER_BLOCK 64

82

//how many keypoint descriptor to compute in a block (2, 4, 8, 16, 32)

83

#define DESCRIPTOR_COMPUTE_PER_BLOCK        4

84

#define DESCRIPTOR_COMPUTE_BLOCK_SIZE        (16 * DESCRIPTOR_COMPUTE_PER_BLOCK)

85

//how many keypoint descriptor to normalized in a block (32, ...)

86

#define DESCRIPTOR_NORMALIZ_PER_BLOCK        32

///////////////////////////////////////////

91

//Thread block size for visualization

92

//(This doesn't affect the speed of computation)

93

#define BLOCK_LOG_DIM 4

94

#define BLOCK_DIM (1 << BLOCK_LOG_DIM)

95

96

//End SiftGPU setting section.

97

//----------------------------------------------------------------

98

99

100

__device__ __constant__ float d_kernel[KERNEL_MAX_WIDTH];

101

texture<float, 1, cudaReadModeElementType> texData;

102

texture<unsigned char, 1, cudaReadModeNormalizedFloat> texDataB;

103

texture<float2, 2, cudaReadModeElementType> texDataF2;

104

texture<float4, 1, cudaReadModeElementType> texDataF4;

105

texture<int4, 1, cudaReadModeElementType> texDataI4;

106

texture<int4, 1, cudaReadModeElementType> texDataList;

107

108

//template<int i>         __device__ float Conv(float *data)                {    return Conv<i-1>(data) + data[i]*d_kernel[i];}

109

//template<>                __device__ float Conv<0>(float *data)        {    return data[0] * d_kernel[0];                                        }

110

111

112

//////////////////////////////////////////////////////////////

113

template<int FW> __global__ void FilterH( float* d_result, int width)

114

115

116

        const int HALF_WIDTH = FW >> 1;

117

        const int CACHE_WIDTH = FILTERH_TILE_WIDTH + FW -1;

118

        const int CACHE_COUNT = 2 + (CACHE_WIDTH - 2)/ FILTERH_TILE_WIDTH;

119

        __shared__ float data[CACHE_WIDTH];

120

        const int bcol = IMUL(blockIdx.x, FILTERH_TILE_WIDTH);

121

        const int col =  bcol + threadIdx.x;

122

        const int index_min = IMUL(blockIdx.y, width);

123

        const int index_max = index_min + width - 1;

124

        int src_index = index_min + bcol - HALF_WIDTH + threadIdx.x;

125

        int cache_index = threadIdx.x;

126

        float value = 0;

127

#pragma unroll

128

        for(int j = 0; j < CACHE_COUNT; ++j)

129

130

                if(cache_index < CACHE_WIDTH)

131

132

                        int fetch_index = src_index < index_min? index_min : (src_index > index_max ? index_max : src_index);

133

                        data[cache_index] = tex1Dfetch(texData,fetch_index);

134

                        src_index += FILTERH_TILE_WIDTH;

135

                        cache_index += FILTERH_TILE_WIDTH;

136

137

138

        __syncthreads();

139

        if(col >= width) return;

140

#pragma unroll

141

        for(int i = 0; i < FW; ++i)

142

143

                value += (data[threadIdx.x + i]* d_kernel[i]);

144

145

//        value = Conv<FW-1>(data + threadIdx.x);

146

        d_result[index_min + col] = value;

////////////////////////////////////////////////////////////////////

152

template<int  FW>  __global__ void FilterV(float* d_result, int width, int height)

153

154

        const int HALF_WIDTH = FW >> 1;

155

        const int CACHE_WIDTH = FW + FILTERV_TILE_HEIGHT - 1;

156

        const int TEMP = CACHE_WIDTH & 0xf;

157

//add some extra space to avoid bank conflict

158

#if FILTERV_TILE_WIDTH == 16

159

        //make the stride 16 * n +/- 1

160

        const int EXTRA = (TEMP == 1 || TEMP == 0) ? 1 - TEMP : 15 - TEMP;

161

#elif FILTERV_TILE_WIDTH == 8

162

        //make the stride 16 * n +/- 2

163

        const int EXTRA = (TEMP == 2 || TEMP == 1 || TEMP == 0) ? 2 - TEMP : (TEMP == 15? 3 : 14 - TEMP);

164

#elif FILTERV_TILE_WIDTH == 4

165

        //make the stride 16 * n +/- 4

166

        const int EXTRA = (TEMP >=0 && TEMP <=4) ? 4 - TEMP : (TEMP > 12? 20 - TEMP : 12 - TEMP);

167

#else

168

#error

169

#endif

170

        const int CACHE_TRUE_WIDTH = CACHE_WIDTH + EXTRA;

171

        const int CACHE_COUNT = (CACHE_WIDTH + FILTERV_BLOCK_HEIGHT - 1) / FILTERV_BLOCK_HEIGHT;

172

        const int WRITE_COUNT = (FILTERV_TILE_HEIGHT + FILTERV_BLOCK_HEIGHT -1) / FILTERV_BLOCK_HEIGHT;

173

        __shared__ float data[CACHE_TRUE_WIDTH * FILTERV_TILE_WIDTH];

174

        const int row_block_first = IMUL(blockIdx.y, FILTERV_TILE_HEIGHT);

175

        const int col = IMUL(blockIdx.x, FILTERV_TILE_WIDTH) + threadIdx.x;

176

        const int row_first = row_block_first - HALF_WIDTH;

177

        const int data_index_max = IMUL(height - 1, width) + col;

178

        const int cache_col_start = threadIdx.y;

179

        const int cache_row_start = IMUL(threadIdx.x, CACHE_TRUE_WIDTH);

180

        int cache_index = cache_col_start + cache_row_start;

181

        int data_index = IMUL(row_first + cache_col_start, width) + col;

182

183

        if(col < width)

184

185

#pragma unroll

186

                for(int i = 0; i < CACHE_COUNT; ++i)

187

188

                        if(cache_col_start < CACHE_WIDTH - i * FILTERV_BLOCK_HEIGHT)

189

190

                                int fetch_index = data_index < col ? col : (data_index > data_index_max? data_index_max : data_index);

191

                                data[cache_index + i * FILTERV_BLOCK_HEIGHT] = tex1Dfetch(texData,fetch_index);

192

                                data_index += IMUL(FILTERV_BLOCK_HEIGHT, width);

        __syncthreads();

197

198

        if(col >= width) return;

199

200

        int row = row_block_first + threadIdx.y;

201

        int index_start = cache_row_start + threadIdx.y;

202

#pragma unroll

203

        for(int i = 0; i < WRITE_COUNT;                ++i,

204

                        row += FILTERV_BLOCK_HEIGHT, index_start += FILTERV_BLOCK_HEIGHT)

205

206

                if(row < height)

207

208

                        int index_dest = IMUL(row, width) + col;

209

                        float value = 0;

210

#pragma unroll

211

                        for(int i = 0; i < FW; ++i)

212

213

                                value += (data[index_start + i] * d_kernel[i]);

214

215

                        d_result[index_dest] = value;

template<int LOG_SCALE> __global__ void UpsampleKernel(float* d_result, int width)

222

223

        const int SCALE = (1 << LOG_SCALE), SCALE_MASK = (SCALE - 1);

224

        const float INV_SCALE = 1.0f / (float(SCALE));

225

        int col = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;

226

        if(col >= width) return;

227

228

        int row = blockIdx.y >> LOG_SCALE;

229

        int index = row * width + col;

230

        int dst_row = blockIdx.y;

231

        int dst_idx= (width * dst_row + col) * SCALE;

232

        int helper = blockIdx.y & SCALE_MASK;

233

        if (helper)

234

235

                float v11 = tex1Dfetch(texData, index);

236

                float v12 = tex1Dfetch(texData, index + 1);

237

                index += width;

238

                float v21 = tex1Dfetch(texData, index);

239

                float v22 = tex1Dfetch(texData, index + 1);

240

                float w1 = INV_SCALE * helper, w2 = 1.0 - w1;

241

                float v1 = (v21 * w1  + w2 * v11);

242

                float v2 = (v22 * w1  + w2 * v12);

243

                d_result[dst_idx] = v1;

244

#pragma unroll

245

                for(int i = 1; i < SCALE; ++i)

246

247

                        const float r2 = i * INV_SCALE;

248

                        const float r1 = 1.0f - r2;

249

                        d_result[dst_idx +i] = v1 * r1 + v2 * r2;

250

251

        }else

252

253

                float v1 = tex1Dfetch(texData, index);

254

                float v2 = tex1Dfetch(texData, index + 1);

255

                d_result[dst_idx] = v1;

256

#pragma unroll

257

                for(int i = 1; i < SCALE; ++i)

258

259

                        const float r2 = i * INV_SCALE;

260

                        const float r1 = 1.0f - r2;

261

                        d_result[dst_idx +i] = v1 * r1 + v2 * r2;

////////////////////////////////////////////////////////////////////////////////////////

268

void ProgramCU::SampleImageU(CuTexImage *dst, CuTexImage *src, int log_scale)

269

270

        int width = src->GetImgWidth(), height = src->GetImgHeight();

271

        src->BindTexture(texData);

272

        dim3 grid((width +  FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH, height << log_scale);

273

        dim3 block(FILTERH_TILE_WIDTH);

274

        switch(log_scale)

275

276

        case 1 :         UpsampleKernel<1> <<< grid, block>>> ((float*) dst->_cuData, width);        break;

277

        case 2 :         UpsampleKernel<2> <<< grid, block>>> ((float*) dst->_cuData, width);        break;

278

        case 3 :         UpsampleKernel<3> <<< grid, block>>> ((float*) dst->_cuData, width);        break;

279

        default:        break;

template<int LOG_SCALE> __global__ void DownsampleKernel(float* d_result, int src_width, int dst_width)

284

285

        const int dst_col = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;

286

        if(dst_col >= dst_width) return;

287

        const int src_col = min((dst_col << LOG_SCALE), (src_width - 1));

288

        const int dst_row = blockIdx.y;

289

        const int src_row = blockIdx.y << LOG_SCALE;

290

        const int src_idx = IMUL(src_row, src_width) + src_col;

291

        const int dst_idx = IMUL(dst_width, dst_row) + dst_col;

292

        d_result[dst_idx] = tex1Dfetch(texData, src_idx);

__global__ void DownsampleKernel(float* d_result, int src_width, int dst_width, const int log_scale)

297

298

        const int dst_col = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;

299

        if(dst_col >= dst_width) return;

300

        const int src_col = min((dst_col << log_scale), (src_width - 1));

301

        const int dst_row = blockIdx.y;

302

        const int src_row = blockIdx.y << log_scale;

303

        const int src_idx = IMUL(src_row, src_width) + src_col;

304

        const int dst_idx = IMUL(dst_width, dst_row) + dst_col;

305

        d_result[dst_idx] = tex1Dfetch(texData, src_idx);

void ProgramCU::SampleImageD(CuTexImage *dst, CuTexImage *src, int log_scale)

310

311

        int src_width = src->GetImgWidth(), dst_width = dst->GetImgWidth() ;

312

313

        src->BindTexture(texData);

314

        dim3 grid((dst_width +  FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH, dst->GetImgHeight());

315

        dim3 block(FILTERH_TILE_WIDTH);

316

        switch(log_scale)

317

318

        case 1 :         DownsampleKernel<1> <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width);        break;

319

        case 2 :        DownsampleKernel<2> <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width);        break;

320

        case 3 :         DownsampleKernel<3> <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width);        break;

321

        default:        DownsampleKernel    <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width, log_scale);

__global__ void ChannelReduce_Kernel(float* d_result)

326

327

        int index = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;

328

        d_result[index] = tex1Dfetch(texData, index*4);

329

330

331

__global__ void ChannelReduce_Convert_Kernel(float* d_result)

332

333

        int index = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;

334

        float4 rgba = tex1Dfetch(texDataF4, index);

335

        d_result[index] = 0.299f * rgba.x + 0.587f* rgba.y + 0.114f * rgba.z;

336

337

338

void ProgramCU::ReduceToSingleChannel(CuTexImage* dst, CuTexImage* src, int convert_rgb)

339

340

        int width = src->GetImgWidth(), height = dst->GetImgHeight() ;

341

342

        dim3 grid((width * height +  FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH);

343

        dim3 block(FILTERH_TILE_WIDTH);

344

        if(convert_rgb)

345

346

                src->BindTexture(texDataF4);

347

                ChannelReduce_Convert_Kernel<<<grid, block>>>((float*)dst->_cuData);

348

        }else

349

350

                src->BindTexture(texData);

351

                ChannelReduce_Kernel<<<grid, block>>>((float*)dst->_cuData);

__global__ void ConvertByteToFloat_Kernel(float* d_result)

356

357

        int index = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;

358

        d_result[index] = tex1Dfetch(texDataB, index);

359

360

361

void ProgramCU::ConvertByteToFloat(CuTexImage*src, CuTexImage* dst)

362

363

        int width = src->GetImgWidth(), height = dst->GetImgHeight() ;

364

        dim3 grid((width * height +  FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH);

365

        dim3 block(FILTERH_TILE_WIDTH);

366

        src->BindTexture(texDataB);

367

        ConvertByteToFloat_Kernel<<<grid, block>>>((float*)dst->_cuData);

368

369

370

void ProgramCU::CreateFilterKernel(float sigma, float* kernel, int& width)

371

372

        int i, sz = int( ceil( GlobalUtil::_FilterWidthFactor * sigma -0.5) ) ;//

373

        width = 2*sz + 1;

374

375

        if(width > KERNEL_MAX_WIDTH)

376

377

                //filter size truncation

378

                sz = KERNEL_MAX_WIDTH >> 1;

379

                width =KERNEL_MAX_WIDTH;

380

        }else if(width < KERNEL_MIN_WIDTH)

381

382

                sz = KERNEL_MIN_WIDTH >> 1;

383

                width =KERNEL_MIN_WIDTH;

384

385

386

        float   rv = 1.0f/(sigma*sigma), v, ksum =0;

387

388

        // pre-compute filter

389

        for( i = -sz ; i <= sz ; ++i)

390

391

                kernel[i+sz] =  v = exp(-0.5f * i * i *rv) ;

392

                ksum += v;

393

394

395

        //normalize the kernel

396

        rv = 1.0f/ksum;

397

        for(i = 0; i< width ;i++) kernel[i]*=rv;

template<int FW> void ProgramCU::FilterImage(CuTexImage *dst, CuTexImage *src, CuTexImage* buf)

402

403

        int width = src->GetImgWidth(), height = src->GetImgHeight();

404

405

        //horizontal filtering

406

        src->BindTexture(texData);

407

        dim3 gridh((width +  FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH, height);

408

        dim3 blockh(FILTERH_TILE_WIDTH);

409

        FilterH<FW><<<gridh, blockh>>>((float*)buf->_cuData, width);

410

        CheckErrorCUDA("FilterH");

411

412

        ///vertical filtering

413

        buf->BindTexture(texData);

414

        dim3 gridv((width + FILTERV_TILE_WIDTH - 1)/ FILTERV_TILE_WIDTH,  (height + FILTERV_TILE_HEIGHT - 1)/FILTERV_TILE_HEIGHT);

415

        dim3 blockv(FILTERV_TILE_WIDTH, FILTERV_BLOCK_HEIGHT);

416

        FilterV<FW><<<gridv, blockv>>>((float*)dst->_cuData, width, height);

417

        CheckErrorCUDA("FilterV");

418

419

420

//////////////////////////////////////////////////////////////////////

421

// tested on 2048x1500 image, the time on pyramid construction is

422

// OpenGL version : 18ms

423

// CUDA version: 28 ms

424

void ProgramCU::FilterImage(CuTexImage *dst, CuTexImage *src, CuTexImage* buf, float sigma)

425

426

        float filter_kernel[KERNEL_MAX_WIDTH]; int width;

427

        CreateFilterKernel(sigma, filter_kernel, width);

428

        cudaMemcpyToSymbol(d_kernel, filter_kernel, width * sizeof(float), 0, cudaMemcpyHostToDevice);

429

430

        switch(width)

431

432

                case 5:                FilterImage< 5>(dst, src, buf);        break;

433

                case 7:                FilterImage< 7>(dst, src, buf);        break;

434

                case 9:                FilterImage< 9>(dst, src, buf);        break;

435

                case 11:        FilterImage<11>(dst, src, buf);        break;

436

                case 13:        FilterImage<13>(dst, src, buf);        break;

437

                case 15:        FilterImage<15>(dst, src, buf);        break;

438

                case 17:        FilterImage<17>(dst, src, buf);        break;

439

                case 19:        FilterImage<19>(dst, src, buf);        break;

440

                case 21:        FilterImage<21>(dst, src, buf);        break;

441

                case 23:        FilterImage<23>(dst, src, buf);        break;

442

                case 25:        FilterImage<25>(dst, src, buf);        break;

443

                case 27:        FilterImage<27>(dst, src, buf);        break;

444

                case 29:        FilterImage<29>(dst, src, buf);        break;

445

                case 31:        FilterImage<31>(dst, src, buf);        break;

446

                case 33:        FilterImage<33>(dst, src, buf);        break;

447

                default:        break;

texture<float, 1, cudaReadModeElementType> texC;

454

texture<float, 1, cudaReadModeElementType> texP;

455

texture<float, 1, cudaReadModeElementType> texN;

456

457

void __global__ ComputeDOG_Kernel(float* d_dog, float2* d_got, int width, int height)

458

459

        int row = (blockIdx.y << DOG_BLOCK_LOG_DIMY) + threadIdx.y;

460

        int col = (blockIdx.x << DOG_BLOCK_LOG_DIMX) + threadIdx.x;

461

        if(col < width && row < height)

462

463

                int index = IMUL(row, width) + col;

464

                float vp = tex1Dfetch(texP, index);

465

                float v = tex1Dfetch(texC, index);

466

                d_dog[index] = v - vp;

467

                float vxn = tex1Dfetch(texC, index + 1);

468

                float vxp = tex1Dfetch(texC, index - 1);

469

                float vyp = tex1Dfetch(texC, index - width);

470

                float vyn = tex1Dfetch(texC, index + width);

471

                float dx = vxn - vxp, dy = vyn - vyp;

472

                float grd = 0.5f * sqrt(dx * dx  + dy * dy);

473

                float rot = (grd == 0.0f? 0.0f : atan2(dy, dx));

474

                d_got[index] = make_float2(grd, rot);

void __global__ ComputeDOG_Kernel(float* d_dog, int width, int height)

479

480

        int row = (blockIdx.y << DOG_BLOCK_LOG_DIMY) + threadIdx.y;

481

        int col = (blockIdx.x << DOG_BLOCK_LOG_DIMX) + threadIdx.x;

482

        if(col < width && row < height)

483

484

                int index = IMUL(row, width) + col;

485

                float vp = tex1Dfetch(texP, index);

486

                float v = tex1Dfetch(texC, index);

487

                d_dog[index] = v - vp;

void ProgramCU::ComputeDOG(CuTexImage* gus, CuTexImage* dog, CuTexImage* got)

492

493

        int width = gus->GetImgWidth(), height = gus->GetImgHeight();

494

        dim3 grid((width + DOG_BLOCK_DIMX - 1)/ DOG_BLOCK_DIMX,  (height + DOG_BLOCK_DIMY - 1)/DOG_BLOCK_DIMY);

495

        dim3 block(DOG_BLOCK_DIMX, DOG_BLOCK_DIMY);

496

        gus->BindTexture(texC);

497

        (gus -1)->BindTexture(texP);

498

        if(got->_cuData)

499

                ComputeDOG_Kernel<<<grid, block>>>((float*) dog->_cuData, (float2*) got->_cuData, width, height);

500

        else

501

                ComputeDOG_Kernel<<<grid, block>>>((float*) dog->_cuData, width, height);

#define READ_CMP_DOG_DATA(datai, tex, idx) \

506

                datai[0] = tex1Dfetch(tex, idx - 1);\

507

                datai[1] = tex1Dfetch(tex, idx);\

508

                datai[2] = tex1Dfetch(tex, idx + 1);\

509

                if(v > nmax)\

510

{\

511

                           nmax = max(nmax, datai[0]);\

512

                           nmax = max(nmax, datai[1]);\

513

                           nmax = max(nmax, datai[2]);\

514

                           if(v < nmax) goto key_finish;\

515

                }else\

516

{\

517

                           nmin = min(nmin, datai[0]);\

518

                           nmin = min(nmin, datai[1]);\

519

                           nmin = min(nmin, datai[2]);\

520

                           if(v > nmin) goto key_finish;\

void __global__ ComputeKEY_Kernel(float4* d_key, int width, int colmax, int rowmax,

525

                                        float dog_threshold0,  float dog_threshold, float edge_threshold, int subpixel_localization)

526

527

       float data[3][3], v;

528

       float datap[3][3], datan[3][3];

529

#ifdef KEY_OFFSET_ONE

530

       int row = (blockIdx.y << KEY_BLOCK_LOG_DIMY) + threadIdx.y + 1;

531

       int col = (blockIdx.x << KEY_BLOCK_LOG_DIMX) + threadIdx.x + 1;

532

#else

533

       int row = (blockIdx.y << KEY_BLOCK_LOG_DIMY) + threadIdx.y;

534

       int col = (blockIdx.x << KEY_BLOCK_LOG_DIMX) + threadIdx.x;

535

#endif

536

       int index = IMUL(row, width) + col;

537

           int idx[3] ={index - width, index, index + width};

538

       int in_image =0;

539

       float nmax, nmin, result = 0.0f;

540

           float dx = 0, dy = 0, ds = 0;

541

           bool offset_test_passed = true;

542

#ifdef KEY_OFFSET_ONE

543

       if(row < rowmax && col < colmax)

544

#else

545

       if(row > 0 && col > 0 && row < rowmax && col < colmax)

546

#endif

547

548

                        in_image = 1;

549

                        data[1][1] = v = tex1Dfetch(texC, idx[1]);

550

                        if(fabs(v) <= dog_threshold0) goto key_finish;

551

552

                        data[1][0] = tex1Dfetch(texC, idx[1] - 1);

553

                        data[1][2] = tex1Dfetch(texC, idx[1] + 1);

554

                        nmax = max(data[1][0], data[1][2]);

555

                        nmin = min(data[1][0], data[1][2]);

556

557

                        if(v <=nmax && v >= nmin) goto key_finish;

558

                        //if((v > nmax && v < 0 )|| (v < nmin && v > 0)) goto key_finish;

559

                        READ_CMP_DOG_DATA(data[0], texC, idx[0]);

560

                        READ_CMP_DOG_DATA(data[2], texC, idx[2]);

561

562

                        //edge supression

563

                        float vx2 = v * 2.0f;

564

                        float fxx = data[1][0] + data[1][2] - vx2;

565

                        float fyy = data[0][1] + data[2][1] - vx2;

566

                        float fxy = 0.25f * (data[2][2] + data[0][0] - data[2][0] - data[0][2]);

567

                        float temp1 = fxx * fyy - fxy * fxy;

568

                        float temp2 = (fxx + fyy) * (fxx + fyy);

569

                        if(temp1 <=0 || temp2 > edge_threshold * temp1) goto key_finish;

570

571

572

                        //read the previous level

573

                        READ_CMP_DOG_DATA(datap[0], texP, idx[0]);

574

                        READ_CMP_DOG_DATA(datap[1], texP, idx[1]);

575

                        READ_CMP_DOG_DATA(datap[2], texP, idx[2]);

576

577

578

                        //read the next level

579

                        READ_CMP_DOG_DATA(datan[0], texN, idx[0]);

580

                        READ_CMP_DOG_DATA(datan[1], texN, idx[1]);

581

                        READ_CMP_DOG_DATA(datan[2], texN, idx[2]);

582

583

                        if(subpixel_localization)

584

585

                                //subpixel localization

586

                                float fx = 0.5f * (data[1][2] - data[1][0]);

587

                                float fy = 0.5f * (data[2][1] - data[0][1]);

588

                                float fs = 0.5f * (datan[1][1] - datap[1][1]);

589

590

                                float fss = (datan[1][1] + datap[1][1] - vx2);

591

                                float fxs = 0.25f* (datan[1][2] + datap[1][0] - datan[1][0] - datap[1][2]);

592

                                float fys = 0.25f* (datan[2][1] + datap[0][1] - datan[0][1] - datap[2][1]);

593

594

                                //need to solve dx, dy, ds;

595

                                // |-fx|     | fxx fxy fxs |   |dx|

596

                                // |-fy|  =  | fxy fyy fys | * |dy|

597

                                // |-fs|     | fxs fys fss |   |ds|

598

                                float4 A0 = fxx > 0? make_float4(fxx, fxy, fxs, -fx) : make_float4(-fxx, -fxy, -fxs, fx);

599

                                float4 A1 = fxy > 0? make_float4(fxy, fyy, fys, -fy) : make_float4(-fxy, -fyy, -fys, fy);

600

                                float4 A2 = fxs > 0? make_float4(fxs, fys, fss, -fs) : make_float4(-fxs, -fys, -fss, fs);

601

                                float maxa = max(max(A0.x, A1.x), A2.x);

602

                                if(maxa >= 1e-10)

603

604

                                        if(maxa == A1.x)

605

606

                                                float4 TEMP = A1; A1 = A0; A0 = TEMP;

607

                                        }else if(maxa == A2.x)

608

609

                                                float4 TEMP = A2; A2 = A0; A0 = TEMP;

610

611

                                        A0.y /= A0.x;        A0.z /= A0.x;        A0.w/= A0.x;

612

                                        A1.y -= A1.x * A0.y;        A1.z -= A1.x * A0.z;        A1.w -= A1.x * A0.w;

613

                                        A2.y -= A2.x * A0.y;        A2.z -= A2.x * A0.z;        A2.w -= A2.x * A0.w;

614

                                        if(abs(A2.y) > abs(A1.y))

615

616

                                                float4 TEMP = A2;        A2 = A1; A1 = TEMP;

617

618

                                        if(abs(A1.y) >= 1e-10)

619

620

                                                A1.z /= A1.y;        A1.w /= A1.y;

621

                                                A2.z -= A2.y * A1.z;        A2.w -= A2.y * A1.w;

622

                                                if(abs(A2.z) >= 1e-10)

623

624

                                                        ds = A2.w / A2.z;

625

                                                        dy = A1.w - ds * A1.z;

626

                                                        dx = A0.w - ds * A0.z - dy * A0.y;

627

628

                                                        offset_test_passed =

629

                                                                fabs(data[1][1] + 0.5f * (dx * fx + dy * fy + ds * fs)) > dog_threshold

630

                                                                &&fabs(ds) < 1.0f && fabs(dx) < 1.0f && fabs(dy) < 1.0f;

                        if(offset_test_passed) result = v > nmax ? 1.0 : -1.0;

636

637

key_finish:

638

       if(in_image) d_key[index] = make_float4(result, dx, dy, ds);

void ProgramCU::ComputeKEY(CuTexImage* dog, CuTexImage* key, float Tdog, float Tedge)

643

644

        int width = dog->GetImgWidth(), height = dog->GetImgHeight();

645

        float Tdog1 = (GlobalUtil::_SubpixelLocalization? 0.8f : 1.0f) * Tdog;

646

        CuTexImage* dogp = dog - 1;

647

        CuTexImage* dogn = dog + 1;

648

#ifdef KEY_OFFSET_ONE

649

        dim3 grid((width - 1 + KEY_BLOCK_DIMX - 1)/ KEY_BLOCK_DIMX,  (height - 1 + KEY_BLOCK_DIMY - 1)/KEY_BLOCK_DIMY);

650

#else

651

        dim3 grid((width + KEY_BLOCK_DIMX - 1)/ KEY_BLOCK_DIMX,  (height + KEY_BLOCK_DIMY - 1)/KEY_BLOCK_DIMY);

652

#endif

653

        dim3 block(KEY_BLOCK_DIMX, KEY_BLOCK_DIMY);

654

        dogp->BindTexture(texP);

655

        dog ->BindTexture(texC);

656

        dogn->BindTexture(texN);

657

        Tedge = (Tedge+1)*(Tedge+1)/Tedge;

658

        ComputeKEY_Kernel<<<grid, block>>>((float4*) key->_cuData, width,

659

        width -1, height -1, Tdog1, Tdog, Tedge, GlobalUtil::_SubpixelLocalization);

void __global__ InitHist_Kernel(int4* hist, int ws, int wd, int height)

666

667

       int row = IMUL(blockIdx.y, blockDim.y) + threadIdx.y;

668

       int col = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;

669

           if(row < height && col < wd)

670

671

                        int hidx = IMUL(row, wd) + col;

672

                        int scol = col << 2;

673

                        int sidx = IMUL(row, ws) + scol;

674

                        int v[4] = {0, 0, 0, 0};

675

                        if(row > 0 && row < height -1)

676

677

#pragma unroll

678

                                for(int i = 0; i < 4 ; ++i, ++scol)

679

680

                                        float4 temp = tex1Dfetch(texDataF4, sidx +i);

681

                                        v[i] = (scol < ws -1 && scol > 0 && temp.x!=0) ? 1 : 0;

682

683

684

                        hist[hidx] = make_int4(v[0], v[1], v[2], v[3]);

void ProgramCU::InitHistogram(CuTexImage* key, CuTexImage* hist)

692

693

        int ws = key->GetImgWidth(), hs = key->GetImgHeight();

694

        int wd = hist->GetImgWidth(), hd = hist->GetImgHeight();

695

        dim3 grid((wd  + HIST_INIT_WIDTH - 1)/ HIST_INIT_WIDTH,  hd);

696

        dim3 block(HIST_INIT_WIDTH, 1);

697

        key->BindTexture(texDataF4);

698

        InitHist_Kernel<<<grid, block>>>((int4*) hist->_cuData, ws, wd, hd);

void __global__ ReduceHist_Kernel(int4* d_hist, int ws, int wd, int height)

704

705

       int row = IMUL(blockIdx.y, blockDim.y) + threadIdx.y;

706

       int col = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;

707

           if(row < height && col < wd)

708

709

                        int hidx = IMUL(row, wd) + col;

710

                        int scol = col << 2;

711

                        int sidx = IMUL(row, ws) + scol;

712

                        int v[4] = {0, 0, 0, 0};

713

#pragma unroll

714

                        for(int i = 0; i < 4 && scol < ws; ++i, ++scol)

715

716

                                int4 temp = tex1Dfetch(texDataI4, sidx + i);

717

                                v[i] = temp.x + temp.y + temp.z + temp.w;

718

719

                        d_hist[hidx] = make_int4(v[0], v[1], v[2], v[3]);

void ProgramCU::ReduceHistogram(CuTexImage*hist1, CuTexImage* hist2)

724

725

        int ws = hist1->GetImgWidth(), hs = hist1->GetImgHeight();

726

        int wd = hist2->GetImgWidth(), hd = hist2->GetImgHeight();

727

        int temp = (int)floor(logf(float(wd * 2/ 3)) / logf(2.0f));

728

        const int wi = min(7, max(temp , 0));

729

        hist1->BindTexture(texDataI4);

730

731

        const int BW = 1 << wi, BH =  1 << (7 - wi);

732

        dim3 grid((wd  + BW - 1)/ BW,  (hd + BH -1) / BH);

733

        dim3 block(BW, BH);

734

        ReduceHist_Kernel<<<grid, block>>>((int4*)hist2->_cuData, ws, wd, hd);

void __global__ ListGen_Kernel(int4* d_list, int width)

739

740

        int idx1 = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;

741

    int4 pos = tex1Dfetch(texDataList, idx1);

742

        int idx2 = IMUL(pos.y, width) + pos.x;

743

        int4 temp = tex1Dfetch(texDataI4, idx2);

744

        int  sum1 = temp.x + temp.y;

745

        int  sum2 = sum1 + temp.z;

746

        pos.x <<= 2;

747

        if(pos.z >= sum2)

748

749

                pos.x += 3;

750

                pos.z -= sum2;

751

        }else if(pos.z >= sum1)

752

753

                pos.x += 2;

754

                pos.z -= sum1;

755

        }else if(pos.z >= temp.x)

756

757

                pos.x += 1;

758

                pos.z -= temp.x;

759

760

        d_list[idx1] = pos;

761

762

763

//input list (x, y) (x, y) ....

764

void ProgramCU::GenerateList(CuTexImage* list, CuTexImage* hist)

765

766

        int len = list->GetImgWidth();

767

        list->BindTexture(texDataList);

768

        hist->BindTexture(texDataI4);

769

        dim3  grid((len + LISTGEN_BLOCK_DIM -1) /LISTGEN_BLOCK_DIM);

770

        dim3  block(LISTGEN_BLOCK_DIM);

771

        ListGen_Kernel<<<grid, block>>>((int4*) list->_cuData, hist->GetImgWidth());

772

773

774

void __global__ ComputeOrientation_Kernel(float4* d_list,

775

                                                                                  int list_len,

776

                                                                                  int width, int height,

777

                                                                                  float sigma, float sigma_step,

778

                                                                                  float gaussian_factor, float sample_factor,

779

                                                                                  int num_orientation,

780

                                                                                  int existing_keypoint,

781

                                                                                  int subpixel,

782

                                                                                  int keepsign)

783

784

        const float ten_degree_per_radius = 5.7295779513082320876798154814105;

785

        const float radius_per_ten_degrees = 1.0 / 5.7295779513082320876798154814105;

786

        int idx = IMUL(blockDim.x, blockIdx.x) + threadIdx.x;

787

        if(idx >= list_len) return;

788

        float4 key;

789

        if(existing_keypoint)

790

791

                key = tex1Dfetch(texDataF4, idx);

792

        }else

793

794

                int4 ikey = tex1Dfetch(texDataList, idx);

795

                key.x = ikey.x + 0.5f;

796

                key.y = ikey.y + 0.5f;

797

                key.z = sigma;

798

                if(subpixel || keepsign)

799

800

                        float4 offset = tex1Dfetch(texDataF4, IMUL(width, ikey.y) + ikey.x);

801

                        if(subpixel)

802

803

                                key.x += offset.y;

804

                                key.y += offset.z;

805

                                key.z *= pow(sigma_step, offset.w);

806

807

                        if(keepsign) key.z *= offset.x;

808

809

810

        if(num_orientation == 0)

811

812

                key.w = 0;

813

                d_list[idx] = key;

814

                return;

815

816

        float vote[37];

817

        float gsigma = key.z * gaussian_factor;

818

        float win = fabs(key.z) * sample_factor;

819

        float dist_threshold = win * win + 0.5;

820

        float factor = -0.5f / (gsigma * gsigma);

821

        float xmin = max(1.5f, floor(key.x - win) + 0.5f);

822

        float ymin = max(1.5f, floor(key.y - win) + 0.5f);

823

        float xmax = min(width - 1.5f, floor(key.x + win) + 0.5f);

824

        float ymax = min(height -1.5f, floor(key.y + win) + 0.5f);

825

#pragma unroll

826

        for(int i = 0; i < 36; ++i) vote[i] = 0.0f;

827

        for(float y = ymin; y <= ymax; y += 1.0f)

828

829

                for(float x = xmin; x <= xmax; x += 1.0f)

830

831

                        float dx = x - key.x;

832

                        float dy = y - key.y;

833

                        float sq_dist  = dx * dx + dy * dy;

834

                        if(sq_dist >= dist_threshold) continue;

835

                        float2 got = tex2D(texDataF2, x, y);

836

                        float weight = got.x * exp(sq_dist * factor);

837

                        float fidx = floor(got.y * ten_degree_per_radius);

838

                        int oidx = fidx;

839

                        if(oidx < 0) oidx += 36;

840

                        vote[oidx] += weight;

        //filter the vote

845

846

        const float one_third = 1.0 /3.0;

847

#pragma unroll

848

        for(int i = 0; i < 6; ++i)

849

850

                vote[36] = vote[0];

851

                float pre = vote[35];

852

#pragma unroll

853

                for(int j = 0; j < 36; ++j)

854

855

                        float temp = one_third * (pre + vote[j] + vote[j + 1]);

856

                        pre = vote[j];                        vote[j] = temp;

        vote[36] = vote[0];

861

        if(num_orientation == 1 || existing_keypoint)

862

863

                int index_max = 0;

864

                float max_vote = vote[0];

865

#pragma unroll

866

                for(int i = 1; i < 36; ++i)

867

868

                        index_max =  vote[i] > max_vote? i : index_max;

869

                        max_vote = max(max_vote, vote[i]);

870

871

                float pre = vote[index_max == 0? 35 : index_max -1];

872

                float next = vote[index_max + 1];

873

                float weight = max_vote;

874

                float off =  0.5f * FDIV(next - pre, weight + weight - next - pre);

875

                key.w = radius_per_ten_degrees * (index_max + 0.5f + off);

876

                d_list[idx] = key;

877

878

        }else

879

880

                float max_vote = vote[0];

881

#pragma unroll

882

                for(int i = 1; i < 36; ++i)                max_vote = max(max_vote, vote[i]);

883

884

                float vote_threshold = max_vote * 0.8f;

885

                float pre = vote[35];

886

                float max_rot[2], max_vot[2] = {0, 0};

887

                int  ocount = 0;

888

#pragma unroll

889

                for(int i =0; i < 36; ++i)

890

891

                        float next = vote[i + 1];

892

                        if(vote[i] > vote_threshold && vote[i] > pre && vote[i] > next)

893

894

                                float di = 0.5f * FDIV(next - pre, vote[i] + vote[i] - next - pre);

895

                                float rot = i + di + 0.5f;

896

                                float weight = vote[i];

897

///

898

                                if(weight > max_vot[1])

899

900

                                        if(weight > max_vot[0])

901

902

                                                max_vot[1] = max_vot[0];

903

                                                max_rot[1] = max_rot[0];

904

                                                max_vot[0] = weight;

905

                                                max_rot[0] = rot;

906

907

                                        else

908

909

                                                max_vot[1] = weight;

910

                                                max_rot[1] = rot;

911

912

                                        ocount ++;

913

914

915

                        pre = vote[i];

916

917

                float fr1 = max_rot[0] / 36.0f;

918

                if(fr1 < 0) fr1 += 1.0f;

919

                unsigned short us1 = ocount == 0? 65535 : ((unsigned short )floor(fr1 * 65535.0f));

920

                unsigned short us2 = 65535;

921

                if(ocount > 1)

922

923

                        float fr2 = max_rot[1] / 36.0f;

924

                        if(fr2 < 0) fr2 += 1.0f;

925

                        us2 = (unsigned short ) floor(fr2 * 65535.0f);

926

927

                unsigned int uspack = (us2 << 16) | us1;

928

                key.w = __int_as_float(uspack);

929

                d_list[idx] = key;

void ProgramCU::ComputeOrientation(CuTexImage* list, CuTexImage* got, CuTexImage*key,

938

                                                                   float sigma, float sigma_step, int existing_keypoint)

939

940

        int len = list->GetImgWidth();

941

        if(len <= 0) return;

942

        int width = got->GetImgWidth(), height = got->GetImgHeight();

943

        if(existing_keypoint)

944

945

                list->BindTexture(texDataF4);

946

        }else

947

948

                list->BindTexture(texDataList);

949

                if(GlobalUtil::_SubpixelLocalization) key->BindTexture(texDataF4);

950

951

        got->BindTexture2D(texDataF2);

952

953

        const int block_width = len < ORIENTATION_COMPUTE_PER_BLOCK ? 16 : ORIENTATION_COMPUTE_PER_BLOCK;

954

        dim3 grid((len + block_width -1) / block_width);

955

        dim3 block(block_width);

956

957

        ComputeOrientation_Kernel<<<grid, block>>>((float4*) list->_cuData,

958

                len, width, height, sigma, sigma_step,

959

                GlobalUtil::_OrientationGaussianFactor,

960

                GlobalUtil::_OrientationGaussianFactor * GlobalUtil::_OrientationWindowFactor,

961

                GlobalUtil::_FixedOrientation? 0 : GlobalUtil::_MaxOrientation,

962

                existing_keypoint, GlobalUtil::_SubpixelLocalization, GlobalUtil::_KeepExtremumSign);

963

964

        ProgramCU::CheckErrorCUDA("ComputeOrientation");

965

966

967

template <bool DYNAMIC_INDEXING> void __global__ ComputeDescriptor_Kernel(float4* d_des, int num,

968

                                                                                         int width, int height, float window_factor)

969

970

        const float rpi = 4.0/ 3.14159265358979323846;

971

        int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;

972

        int fidx = idx >> 4;

973

        if(fidx >= num) return;

974

        float4 key = tex1Dfetch(texDataF4, fidx);

975

        int bidx = idx& 0xf, ix = bidx & 0x3, iy = bidx >> 2;

976

        float spt = fabs(key.z * window_factor);

977

        float s, c; __sincosf(key.w, &s, &c);

978

        float anglef = key.w > 3.14159265358979323846? key.w - (2.0 * 3.14159265358979323846) : key.w ;

979

        float cspt = c * spt, sspt = s * spt;

980

        float crspt = c / spt, srspt = s / spt;

981

        float2 offsetpt, pt;

982

        float xmin, ymin, xmax, ymax, bsz;

983

        offsetpt.x = ix - 1.5f;

984

        offsetpt.y = iy - 1.5f;

985

        pt.x = cspt * offsetpt.x - sspt * offsetpt.y + key.x;

986

        pt.y = cspt * offsetpt.y + sspt * offsetpt.x + key.y;

987

        bsz =  fabs(cspt) + fabs(sspt);

988

        xmin = max(1.5f, floor(pt.x - bsz) + 0.5f);

989

        ymin = max(1.5f, floor(pt.y - bsz) + 0.5f);

990

        xmax = min(width - 1.5f, floor(pt.x + bsz) + 0.5f);

991

        ymax = min(height - 1.5f, floor(pt.y + bsz) + 0.5f);

992

        float des[9];

993

#pragma unroll

994

        for(int i =0; i < 9; ++i) des[i] = 0.0f;

995

        for(float y = ymin; y <= ymax; y += 1.0f)

996

997

                for(float x = xmin; x <= xmax; x += 1.0f)

998

999

                        float dx = x - pt.x;

1000

                        float dy = y - pt.y;

1001

                        float nx = crspt * dx + srspt * dy;

1002

                        float ny = crspt * dy - srspt * dx;

1003

                        float nxn = fabs(nx);

1004

                        float nyn = fabs(ny);

1005

                        if(nxn < 1.0f && nyn < 1.0f)

1006

1007

                                float2 cc = tex2D(texDataF2, x, y);

1008

                                float dnx = nx + offsetpt.x;

1009

                                float dny = ny + offsetpt.y;

1010

                                float ww = exp(-0.125f * (dnx * dnx + dny * dny));

1011

                                float wx = 1.0 - nxn;

1012

                                float wy = 1.0 - nyn;

1013

                                float weight = ww * wx * wy * cc.x;

1014

                                float theta = (anglef - cc.y) * rpi;

1015

                                if(theta < 0) theta += 8.0f;

1016

                                float fo = floor(theta);

1017

                                int fidx = fo;

1018

                                float weight1 = fo + 1.0f  - theta;

1019

                                float weight2 = theta - fo;

1020

                                if(DYNAMIC_INDEXING)

1021

1022

                                        des[fidx] += (weight1 * weight);

1023

                                        des[fidx + 1] += (weight2 * weight);

1024

                                        //this dynamic indexing part might be slow

1025

                                }else

1026

1027

                                        #pragma unroll

1028

                                        for(int k = 0; k < 8; ++k)

1029

1030

                                                if(k == fidx)

1031

1032

                                                        des[k] += (weight1 * weight);

1033

                                                        des[k+1] += (weight2 * weight);

        des[0] += des[8];

1041

1042

        int didx = idx << 1;

1043

        d_des[didx] = make_float4(des[0], des[1], des[2], des[3]);

1044

        d_des[didx+1] = make_float4(des[4], des[5], des[6], des[7]);

template <bool DYNAMIC_INDEXING> void __global__ ComputeDescriptorRECT_Kernel(float4* d_des, int num,

1049

                                                                                         int width, int height, float window_factor)

1050

1051

        const float rpi = 4.0/ 3.14159265358979323846;

1052

        int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;

1053

        int fidx = idx >> 4;

1054

        if(fidx >= num) return;

1055

        float4 key = tex1Dfetch(texDataF4, fidx);

1056

        int bidx = idx& 0xf, ix = bidx & 0x3, iy = bidx >> 2;

1057

    //float aspect_ratio = key.w / key.z;

1058

    //float aspect_sq = aspect_ratio * aspect_ratio;

1059

        float sptx = key.z * 0.25, spty = key.w * 0.25;

1060

        float xmin, ymin, xmax, ymax; float2 pt;

1061

        pt.x = sptx * (ix + 0.5f)  + key.x;

1062

        pt.y = spty * (iy + 0.5f)  + key.y;

1063

        xmin = max(1.5f, floor(pt.x - sptx) + 0.5f);

1064

        ymin = max(1.5f, floor(pt.y - spty) + 0.5f);

1065

        xmax = min(width - 1.5f, floor(pt.x + sptx) + 0.5f);

1066

        ymax = min(height - 1.5f, floor(pt.y + spty) + 0.5f);

1067

        float des[9];

1068

#pragma unroll

1069

        for(int i =0; i < 9; ++i) des[i] = 0.0f;

1070

        for(float y = ymin; y <= ymax; y += 1.0f)

1071

1072

                for(float x = xmin; x <= xmax; x += 1.0f)

1073

1074

                        float nx = (x - pt.x) / sptx;

1075

                        float ny = (y - pt.y) / spty;

1076

                        float nxn = fabs(nx);

1077

                        float nyn = fabs(ny);

1078

                        if(nxn < 1.0f && nyn < 1.0f)

1079

1080

                                float2 cc = tex2D(texDataF2, x, y);

1081

                                float wx = 1.0 - nxn;

1082

                                float wy = 1.0 - nyn;

1083

                                float weight =  wx * wy * cc.x;

1084

                                float theta = (- cc.y) * rpi;

1085

                                if(theta < 0) theta += 8.0f;

1086

                                float fo = floor(theta);

1087

                                int fidx = fo;

1088

                                float weight1 = fo + 1.0f  - theta;

1089

                                float weight2 = theta - fo;

1090

                                if(DYNAMIC_INDEXING)

1091

1092

                                        des[fidx] += (weight1 * weight);

1093

                                        des[fidx + 1] += (weight2 * weight);

1094

                                        //this dynamic indexing part might be slow

1095

                                }else

1096

1097

                                        #pragma unroll

1098

                                        for(int k = 0; k < 8; ++k)

1099

1100

                                                if(k == fidx)

1101

1102

                                                        des[k] += (weight1 * weight);

1103

                                                        des[k+1] += (weight2 * weight);

        des[0] += des[8];

1111

1112

        int didx = idx << 1;

1113

        d_des[didx] = make_float4(des[0], des[1], des[2], des[3]);

1114

        d_des[didx+1] = make_float4(des[4], des[5], des[6], des[7]);

1115

1116

1117

void __global__ NormalizeDescriptor_Kernel(float4* d_des, int num)

1118

1119

        float4 temp[32];

1120

        int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;

1121

        if(idx >= num) return;

1122

        int sidx = idx << 5;

1123

        float norm1 = 0, norm2 = 0;

1124

#pragma unroll

1125

        for(int i = 0; i < 32; ++i)

1126

1127

                temp[i] = tex1Dfetch(texDataF4, sidx +i);

1128

                norm1 += (temp[i].x * temp[i].x + temp[i].y * temp[i].y +

1129

                                 temp[i].z * temp[i].z + temp[i].w * temp[i].w);

1130

1131

        norm1 = rsqrt(norm1);

1132

1133

#pragma unroll

1134

        for(int i = 0; i < 32; ++i)

1135

1136

                temp[i].x = min(0.2f, temp[i].x * norm1);

1137

                temp[i].y = min(0.2f, temp[i].y * norm1);

1138

                temp[i].z = min(0.2f, temp[i].z * norm1);

1139

                temp[i].w = min(0.2f, temp[i].w * norm1);

1140

                norm2 += (temp[i].x * temp[i].x + temp[i].y * temp[i].y +

1141

                                 temp[i].z * temp[i].z + temp[i].w * temp[i].w);

1142

1143

1144

        norm2 = rsqrt(norm2);

1145

#pragma unroll

1146

        for(int i = 0; i < 32; ++i)

1147

1148

                temp[i].x *= norm2;                temp[i].y *= norm2;

1149

                temp[i].z *= norm2;                temp[i].w *= norm2;

1150

                d_des[sidx + i] = temp[i];

void ProgramCU::ComputeDescriptor(CuTexImage*list, CuTexImage* got, CuTexImage* dtex, int rect, int stream)

1155

1156

        int num = list->GetImgWidth();

1157

        int width = got->GetImgWidth();

1158

        int height = got->GetImgHeight();

1159

1160

    dtex->InitTexture(num * 128, 1, 1);

1161

        got->BindTexture2D(texDataF2);

1162

        list->BindTexture(texDataF4);

1163

        int block_width = DESCRIPTOR_COMPUTE_BLOCK_SIZE;

1164

        dim3 grid((num * 16 + block_width -1) / block_width);

1165

        dim3 block(block_width);

1166

1167

    if(rect)

1168

1169

            if(GlobalUtil::_UseDynamicIndexing)

1170

                    ComputeDescriptorRECT_Kernel<true><<<grid, block>>>((float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor);

1171

            else

1172

                    ComputeDescriptorRECT_Kernel<false><<<grid, block>>>((float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor);

1173

1174

    }else

1175

1176

            if(GlobalUtil::_UseDynamicIndexing)

1177

                    ComputeDescriptor_Kernel<true><<<grid, block>>>((float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor);

1178

            else

1179

                    ComputeDescriptor_Kernel<false><<<grid, block>>>((float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor);

1180

1181

        if(GlobalUtil::_NormalizedSIFT)

1182

1183

                dtex->BindTexture(texDataF4);

1184

                const int block_width = DESCRIPTOR_NORMALIZ_PER_BLOCK;

1185

                dim3 grid((num + block_width -1) / block_width);

1186

                dim3 block(block_width);

1187

                NormalizeDescriptor_Kernel<<<grid, block>>>((float4*) dtex->_cuData, num);

1188

1189

        CheckErrorCUDA("ComputeDescriptor");

1190

1191

1192

//////////////////////////////////////////////////////

1193

void ProgramCU::FinishCUDA()

1194

1195

        cudaThreadSynchronize();

1196

1197

1198

int ProgramCU::CheckErrorCUDA(const char* location)

1199

1200

        cudaError_t e = cudaGetLastError();

1201

        if(e)

1202

1203

        if(location) fprintf(stderr, "%s:\t",  location);

1204

                fprintf(stderr, "%s\n",  cudaGetErrorString(e));

1205

                //assert(0);

1206

        return 1;

1207

        }else

1208

1209

        return 0;

void __global__ ConvertDOG_Kernel(float* d_result, int width, int height)

1214

1215

        int row = (blockIdx.y << BLOCK_LOG_DIM) + threadIdx.y;

1216

        int col = (blockIdx.x << BLOCK_LOG_DIM) + threadIdx.x;

1217

        if(col < width && row < height)

1218

1219

                int index = row * width  + col;

1220

                float v = tex1Dfetch(texData, index);

1221

                d_result[index] = (col == 0 || row == 0 || col == width -1 || row == height -1)?

1222

                        0.5 : saturate(0.5+20.0*v);

1223

1224

1225

///

1226

void ProgramCU::DisplayConvertDOG(CuTexImage* dog, CuTexImage* out)

1227

1228

        if(out->_cuData == NULL) return;

1229

        int width = dog->GetImgWidth(), height = dog ->GetImgHeight();

1230

        dog->BindTexture(texData);

1231

        dim3 grid((width + BLOCK_DIM - 1)/ BLOCK_DIM,  (height + BLOCK_DIM - 1)/BLOCK_DIM);

1232

        dim3 block(BLOCK_DIM, BLOCK_DIM);

1233

        ConvertDOG_Kernel<<<grid, block>>>((float*) out->_cuData, width, height);

1234

        ProgramCU::CheckErrorCUDA("DisplayConvertDOG");

1235

1236

1237

void __global__ ConvertGRD_Kernel(float* d_result, int width, int height)

1238

1239

        int row = (blockIdx.y << BLOCK_LOG_DIM) + threadIdx.y;

1240

        int col = (blockIdx.x << BLOCK_LOG_DIM) + threadIdx.x;

1241

        if(col < width && row < height)

1242

1243

                int index = row * width  + col;

1244

                float v = tex1Dfetch(texData, index << 1);

1245

                d_result[index] = (col == 0 || row == 0 || col == width -1 || row == height -1)?

1246

                                0 : saturate(5 * v);

void ProgramCU::DisplayConvertGRD(CuTexImage* got, CuTexImage* out)

1253

1254

        if(out->_cuData == NULL) return;

1255

        int width = got->GetImgWidth(), height = got ->GetImgHeight();

1256

        got->BindTexture(texData);

1257

        dim3 grid((width + BLOCK_DIM - 1)/ BLOCK_DIM,  (height + BLOCK_DIM - 1)/BLOCK_DIM);

1258

        dim3 block(BLOCK_DIM, BLOCK_DIM);

1259

        ConvertGRD_Kernel<<<grid, block>>>((float*) out->_cuData, width, height);

1260

        ProgramCU::CheckErrorCUDA("DisplayConvertGRD");

1261

1262

1263

void __global__ ConvertKEY_Kernel(float4* d_result, int width, int height)

1264

1265

1266

        int row = (blockIdx.y << BLOCK_LOG_DIM) + threadIdx.y;

1267

        int col = (blockIdx.x << BLOCK_LOG_DIM) + threadIdx.x;

1268

        if(col < width && row < height)

1269

1270

                int index = row * width + col;

1271

                float4 keyv = tex1Dfetch(texDataF4, index);

1272

                int is_key = (keyv.x == 1.0f || keyv.x == -1.0f);

1273

                int inside = col > 0 && row > 0 && row < height -1 && col < width - 1;

1274

                float v = inside? saturate(0.5 + 20 * tex1Dfetch(texData, index)) : 0.5;

1275

                d_result[index] = is_key && inside ?

1276

                        (keyv.x > 0? make_float4(1.0f, 0, 0, 1.0f) : make_float4(0.0f, 1.0f, 0.0f, 1.0f)):

1277

                        make_float4(v, v, v, 1.0f) ;

1278

1279

1280

void ProgramCU::DisplayConvertKEY(CuTexImage* key, CuTexImage* dog, CuTexImage* out)

1281

1282

        if(out->_cuData == NULL) return;

1283

        int width = key->GetImgWidth(), height = key ->GetImgHeight();

1284

        dog->BindTexture(texData);

1285

        key->BindTexture(texDataF4);

1286

        dim3 grid((width + BLOCK_DIM - 1)/ BLOCK_DIM,  (height + BLOCK_DIM - 1)/BLOCK_DIM);

1287

        dim3 block(BLOCK_DIM, BLOCK_DIM);

1288

        ConvertKEY_Kernel<<<grid, block>>>((float4*) out->_cuData, width, height);

void __global__ DisplayKeyPoint_Kernel(float4 * d_result, int num)

1293

1294

        int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;

1295

        if(idx >= num) return;

1296

        float4 v = tex1Dfetch(texDataF4, idx);

1297

        d_result[idx] = make_float4(v.x, v.y, 0, 1.0f);

1298

1299

1300

void ProgramCU::DisplayKeyPoint(CuTexImage* ftex, CuTexImage* out)

1301

1302

        int num = ftex->GetImgWidth();

1303

        int block_width = 64;

1304

        dim3 grid((num + block_width -1) /block_width);

1305

        dim3 block(block_width);

1306

        ftex->BindTexture(texDataF4);

1307

        DisplayKeyPoint_Kernel<<<grid, block>>>((float4*) out->_cuData, num);

1308

        ProgramCU::CheckErrorCUDA("DisplayKeyPoint");

1309

1310

1311

void __global__ DisplayKeyBox_Kernel(float4* d_result, int num)

1312

1313

        int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;

1314

        if(idx >= num) return;

1315

        int  kidx = idx / 10, vidx = idx - IMUL(kidx , 10);

1316

        float4 v = tex1Dfetch(texDataF4, kidx);

1317

        float sz = fabs(v.z * 3.0f);

1318

        ///////////////////////

1319

        float s, c;        __sincosf(v.w, &s, &c);

1320

        ///////////////////////

1321

        float dx = vidx == 0? 0 : ((vidx <= 4 || vidx >= 9)? sz : -sz);

1322

        float dy = vidx <= 1? 0 : ((vidx <= 2 || vidx >= 7)? -sz : sz);

1323

        float4 pos;

1324

        pos.x = v.x + c * dx - s * dy;

1325

        pos.y = v.y + c * dy + s * dx;

1326

        pos.z = 0;        pos.w = 1.0f;

1327

        d_result[idx]  = pos;

1328

1329

1330

void ProgramCU::DisplayKeyBox(CuTexImage* ftex, CuTexImage* out)

1331

1332

        int len = ftex->GetImgWidth();

1333

        int block_width = 32;

1334

        dim3 grid((len * 10 + block_width -1) / block_width);

1335

        dim3 block(block_width);

1336

        ftex->BindTexture(texDataF4);

1337

        DisplayKeyBox_Kernel<<<grid, block>>>((float4*) out->_cuData, len * 10);

1338

1339

///////////////////////////////////////////////////////////////////

1340

inline void CuTexImage:: BindTexture(textureReference& texRef)

1341

1342

         cudaBindTexture(NULL, &texRef, _cuData, &texRef.channelDesc, _numBytes);

1343

1344

1345

inline void CuTexImage::BindTexture2D(textureReference& texRef)

1346

1347

#if defined(SIFTGPU_ENABLE_LINEAR_TEX2D)

1348

        cudaBindTexture2D(0, &texRef, _cuData, &texRef.channelDesc, _imgWidth, _imgHeight, _imgWidth* _numChannel* sizeof(float));

1349

#else

1350

        cudaChannelFormatDesc desc;

1351

        cudaGetChannelDesc(&desc, _cuData2D);

1352

        cudaBindTextureToArray(&texRef, _cuData2D, &desc);

1353

#endif

1354

1355

1356

int ProgramCU::CheckCudaDevice(int device)

1357

1358

    int count = 0, device_used;

1359

    if(cudaGetDeviceCount(&count) || count <= 0)

1360

1361

        ProgramCU::CheckErrorCUDA("CheckCudaDevice");

1362

        return 0;

1363

    }else if(count == 1)

1364

1365

        cudaDeviceProp deviceProp;

1366

        cudaGetDeviceProperties(&deviceProp, 0);

1367

        if (deviceProp.major == 9999 && deviceProp.minor == 9999)

1368

1369

            fprintf(stderr, "CheckCudaDevice: no device supporting CUDA.\n");

1370

            return 0;

1371

1372

1373

    if(device >0 && device < count)

1374

1375

        cudaSetDevice(device);

1376

        CheckErrorCUDA("cudaSetDevice\n");

1377

1378

    cudaGetDevice(&device_used);

1379

    if(device != device_used)

1380

        fprintf(stderr,  "\nERROR:   Cannot set device to %d\n"

1381

        "\nWARNING: Use # %d device instead (out of %d)\n", device, device_used, count);

1382

    return 1;

1383

1384

1385

////////////////////////////////////////////////////////////////////////////////////////

1386

// siftmatch funtions

1387

//////////////////////////////////////////////////////////////////////////////////////////

1388

1389

#define MULT_TBLOCK_DIMX 128

1390

#define MULT_TBLOCK_DIMY 1

1391

#define MULT_BLOCK_DIMX (MULT_TBLOCK_DIMX)

1392

#define MULT_BLOCK_DIMY (8 * MULT_TBLOCK_DIMY)

1393

1394

1395

texture<uint4, 1, cudaReadModeElementType> texDes1;

1396

texture<uint4, 1, cudaReadModeElementType> texDes2;

1397

1398

void __global__ MultiplyDescriptor_Kernel(int* d_result, int num1, int num2, int3* d_temp)

1399

1400

        int idx01 = (blockIdx.y  * MULT_BLOCK_DIMY),  idx02 = (blockIdx.x  * MULT_BLOCK_DIMX);

1401

1402

        int idx1 = idx01 + threadIdx.y, idx2 = idx02 + threadIdx.x;

1403

        __shared__ int data1[17 * 2 * MULT_BLOCK_DIMY];

1404

        int read_idx1 = idx01 * 8 +  threadIdx.x, read_idx2 = idx2 * 8;

1405

        int col4 = threadIdx.x & 0x3, row4 = threadIdx.x >> 2;

1406

        int cache_idx1 = IMUL(row4, 17) + (col4 << 2);

1407

1408

        ///////////////////////////////////////////////////////////////

1409

        //Load feature descriptors

1410

        ///////////////////////////////////////////////////////////////

1411

#if MULT_BLOCK_DIMY == 16

1412

        uint4 v = tex1Dfetch(texDes1, read_idx1);

1413

        data1[cache_idx1]   = v.x;        data1[cache_idx1+1] = v.y;

1414

        data1[cache_idx1+2] = v.z;        data1[cache_idx1+3] = v.w;

1415

#elif MULT_BLOCK_DIMY == 8

1416

        if(threadIdx.x < 64)

1417

1418

                uint4 v = tex1Dfetch(texDes1, read_idx1);

1419

                data1[cache_idx1]   = v.x;                data1[cache_idx1+1] = v.y;

1420

                data1[cache_idx1+2] = v.z;                data1[cache_idx1+3] = v.w;

1421

1422

#else

1423

#error

1424

#endif

1425

        __syncthreads();

1426

1427

///

1428

        if(idx2 >= num2) return;

1429

        ///////////////////////////////////////////////////////////////////////////

1430

        //compare descriptors

1431

1432

        int results[MULT_BLOCK_DIMY];

1433

#pragma unroll

1434

        for(int i = 0; i < MULT_BLOCK_DIMY; ++i) results[i] = 0;

1435

1436

#pragma unroll

1437

        for(int i = 0; i < 8; ++i)

1438

1439

                uint4 v = tex1Dfetch(texDes2, read_idx2 + i);

1440

                unsigned char* p2 = (unsigned char*)(&v);

1441

#pragma unroll

1442

                for(int k = 0; k < MULT_BLOCK_DIMY; ++k)

1443

1444

                        unsigned char* p1 = (unsigned char*) (data1 + k * 34 + i *  4 + (i/4));

1445

                        results[k] +=          ( IMUL(p1[0], p2[0])        + IMUL(p1[1], p2[1])

1446

                                                         + IMUL(p1[2], p2[2])          + IMUL(p1[3], p2[3])

1447

                                                         + IMUL(p1[4], p2[4])          + IMUL(p1[5], p2[5])

1448

                                                         + IMUL(p1[6], p2[6])          + IMUL(p1[7], p2[7])

1449

                                                         + IMUL(p1[8], p2[8])          + IMUL(p1[9], p2[9])

1450

                                                         + IMUL(p1[10], p2[10])        + IMUL(p1[11], p2[11])

1451

                                                         + IMUL(p1[12], p2[12])        + IMUL(p1[13], p2[13])

1452

                                                         + IMUL(p1[14], p2[14])        + IMUL(p1[15], p2[15]));

        int dst_idx = IMUL(idx1, num2)  + idx2;

1457

        if(d_temp)

1458

1459

                int3 cmp_result = make_int3(0, -1, 0);

1460

1461

#pragma unroll

1462

                for(int i = 0; i < MULT_BLOCK_DIMY; ++i)

1463

1464

                        if(idx1 + i < num1)

1465

1466

                                cmp_result = results[i] > cmp_result.x?

1467

                                make_int3(results[i], idx1 + i, cmp_result.x) :

1468

                                make_int3(cmp_result.x, cmp_result.y, max(cmp_result.z, results[i]));

1469

                                d_result[dst_idx + IMUL(i, num2)] = results[i];

1470

1471

1472

                d_temp[ IMUL(blockIdx.y, num2) + idx2] = cmp_result;

1473

        }else

1474

1475

#pragma unroll

1476

                for(int i = 0; i < MULT_BLOCK_DIMY; ++i)

1477

1478

                        if(idx1 + i < num1) d_result[dst_idx + IMUL(i, num2)] = results[i];

void ProgramCU::MultiplyDescriptor(CuTexImage* des1, CuTexImage* des2, CuTexImage* texDot, CuTexImage* texCRT)

1486

1487

        int num1 = des1->GetImgWidth() / 8;

1488

        int num2 = des2->GetImgWidth() / 8;

1489

        dim3 grid(        (num2 + MULT_BLOCK_DIMX - 1)/ MULT_BLOCK_DIMX,

1490

                (num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY);

1491

        dim3 block(MULT_TBLOCK_DIMX, MULT_TBLOCK_DIMY);

1492

        texDot->InitTexture( num2,num1);

1493

        if(texCRT) texCRT->InitTexture(num2, (num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY, 32);

1494

        des1->BindTexture(texDes1);

1495

        des2->BindTexture(texDes2);

1496

1497

        MultiplyDescriptor_Kernel<<<grid, block>>>((int*)texDot->_cuData, num1, num2,

1498

                                                                                                (texCRT? (int3*)texCRT->_cuData : NULL));

1499

        ProgramCU::CheckErrorCUDA("MultiplyDescriptor");

1500

1501

1502

texture<float, 1, cudaReadModeElementType> texLoc1;

1503

texture<float2, 1, cudaReadModeElementType> texLoc2;

1504

struct Matrix33{float mat[3][3];};

void __global__ MultiplyDescriptorG_Kernel(int* d_result, int num1, int num2, int3* d_temp,

1509

                                                                                   Matrix33 H, float hdistmax, Matrix33 F, float fdistmax)

1510

1511

        int idx01 = (blockIdx.y  * MULT_BLOCK_DIMY);

1512

        int idx02 = (blockIdx.x  * MULT_BLOCK_DIMX);

1513

1514

        int idx1 = idx01 + threadIdx.y;

1515

        int idx2 = idx02 + threadIdx.x;

1516

        __shared__ int data1[17 * 2 * MULT_BLOCK_DIMY];

1517

        __shared__ float loc1[MULT_BLOCK_DIMY * 2];

1518

        int read_idx1 = idx01 * 8 +  threadIdx.x ;

1519

        int read_idx2 = idx2 * 8;

1520

        int col4 = threadIdx.x & 0x3, row4 = threadIdx.x >> 2;

1521

        int cache_idx1 = IMUL(row4, 17) + (col4 << 2);

1522

#if MULT_BLOCK_DIMY == 16

1523

        uint4 v = tex1Dfetch(texDes1, read_idx1);

1524

        data1[cache_idx1]   = v.x;

1525

        data1[cache_idx1+1] = v.y;

1526

        data1[cache_idx1+2] = v.z;

1527

        data1[cache_idx1+3] = v.w;

1528

#elif MULT_BLOCK_DIMY == 8

1529

        if(threadIdx.x < 64)

1530

1531

                uint4 v = tex1Dfetch(texDes1, read_idx1);

1532

                data1[cache_idx1]   = v.x;

1533

                data1[cache_idx1+1] = v.y;

1534

                data1[cache_idx1+2] = v.z;

1535

                data1[cache_idx1+3] = v.w;

1536

1537

#else

1538

#error

1539

#endif

1540

        __syncthreads();

1541

        if(threadIdx.x < MULT_BLOCK_DIMY * 2)

1542

1543

                loc1[threadIdx.x] = tex1Dfetch(texLoc1, 2 * idx01 + threadIdx.x);

1544

1545

        __syncthreads();

1546

        if(idx2 >= num2) return;

1547

        int results[MULT_BLOCK_DIMY];

1548

        /////////////////////////////////////////////////////////////////////////////////////////////

1549

        //geometric verification

1550

        /////////////////////////////////////////////////////////////////////////////////////////////

1551

        int good_count = 0;

1552

        float2 loc2 = tex1Dfetch(texLoc2, idx2);

1553

#pragma unroll

1554

        for(int i = 0; i < MULT_BLOCK_DIMY; ++i)

1555

1556

1557

                if(idx1 + i < num1)

1558

1559

                        float* loci = loc1 + i * 2;

1560

                        float locx = loci[0], locy = loci[1];

1561

                        //homography

1562

                        float x[3], diff[2];

1563

                        x[0] = H.mat[0][0] * locx + H.mat[0][1] * locy + H.mat[0][2];

1564

                        x[1] = H.mat[1][0] * locx + H.mat[1][1] * locy + H.mat[1][2];

1565

                        x[2] = H.mat[2][0] * locx + H.mat[2][1] * locy + H.mat[2][2];

1566

                        diff[0] = fabs(FDIV(x[0], x[2]) - loc2.x);

1567

                        diff[1] = fabs(FDIV(x[1], x[2]) - loc2.y);

1568

                        if(diff[0] < hdistmax && diff[1] < hdistmax)

1569

1570

                                //check fundamental matrix

1571

                                float fx1[3], ftx2[3], x2fx1, se;

1572

                                fx1[0] = F.mat[0][0] * locx + F.mat[0][1] * locy + F.mat[0][2];

1573

                                fx1[1] = F.mat[1][0] * locx + F.mat[1][1] * locy + F.mat[1][2];

1574

                                //fx1[2] = F.mat[2][0] * locx + F.mat[2][1] * locy + F.mat[2][2];

1575

1576

                                ftx2[0] = F.mat[0][0] * locx + F.mat[1][0] * locy + F.mat[2][0];

1577

                                ftx2[1] = F.mat[0][1] * locx + F.mat[1][1] * locy + F.mat[2][1];

1578

                                ftx2[2] = F.mat[0][2] * locx + F.mat[1][2] * locy + F.mat[2][2];

1579

1580

                                x2fx1 = locx * ftx2[0]  + locy * ftx2[1] + ftx2[2];

1581

                                se = FDIV(x2fx1 * x2fx1, fx1[0] * fx1[0] + fx1[1] * fx1[1] + ftx2[0] * ftx2[0] + ftx2[1] * ftx2[1]);

1582

                                results[i] = se < fdistmax? 0: -262144;

1583

                        }else

1584

1585

                                results[i] = -262144;

1586

1587

                }else

1588

1589

                        results[i] = -262144;

1590

1591

                good_count += (results[i] >=0);

1592

1593

        /////////////////////////////////////////////////////////////////////////////////////////////

1594

        ///compare feature descriptors anyway

1595

        /////////////////////////////////////////////////////////////////////////////////////////////

1596

        if(good_count > 0)

1597

1598

#pragma unroll

1599

                for(int i = 0; i < 8; ++i)

1600

1601

                        uint4 v = tex1Dfetch(texDes2, read_idx2 + i);

1602

                        unsigned char* p2 = (unsigned char*)(&v);

1603

#pragma unroll

1604

                        for(int k = 0; k < MULT_BLOCK_DIMY; ++k)

1605

1606

                                unsigned char* p1 = (unsigned char*) (data1 + k * 34 + i *  4 + (i/4));

1607

                                results[k] +=          ( IMUL(p1[0], p2[0])        + IMUL(p1[1], p2[1])

1608

                                                                 + IMUL(p1[2], p2[2])          + IMUL(p1[3], p2[3])

1609

                                                                 + IMUL(p1[4], p2[4])          + IMUL(p1[5], p2[5])

1610

                                                                 + IMUL(p1[6], p2[6])          + IMUL(p1[7], p2[7])

1611

                                                                 + IMUL(p1[8], p2[8])          + IMUL(p1[9], p2[9])

1612

                                                                 + IMUL(p1[10], p2[10])        + IMUL(p1[11], p2[11])

1613

                                                                 + IMUL(p1[12], p2[12])        + IMUL(p1[13], p2[13])

1614

                                                                 + IMUL(p1[14], p2[14])        + IMUL(p1[15], p2[15]));

        int dst_idx = IMUL(idx1, num2)  + idx2;

1619

        if(d_temp)

1620

1621

                int3 cmp_result = make_int3(0, -1, 0);

1622

#pragma unroll

1623

                for(int i= 0; i < MULT_BLOCK_DIMY; ++i)

1624

1625

                        if(idx1 + i < num1)

1626

1627

                                cmp_result = results[i] > cmp_result.x?

1628

                                make_int3(results[i], idx1 + i, cmp_result.x) :

1629

                                make_int3(cmp_result.x, cmp_result.y, max(cmp_result.z, results[i]));

1630

                                d_result[dst_idx + IMUL(i, num2)] = max(results[i], 0);

1631

                        }else

1632

1633

                                break;

1634

1635

1636

                d_temp[ IMUL(blockIdx.y, num2) + idx2] = cmp_result;

1637

        }else

1638

1639

#pragma unroll

1640

                for(int i = 0; i < MULT_BLOCK_DIMY; ++i)

1641

1642

                        if(idx1 + i < num1) d_result[dst_idx + IMUL(i, num2)] = max(results[i], 0);

1643

                        else break;

void ProgramCU::MultiplyDescriptorG(CuTexImage* des1, CuTexImage* des2,

1651

                CuTexImage* loc1, CuTexImage* loc2, CuTexImage* texDot, CuTexImage* texCRT,

1652

                float H[3][3], float hdistmax, float F[3][3], float fdistmax)

1653

1654

        int num1 = des1->GetImgWidth() / 8;

1655

        int num2 = des2->GetImgWidth() / 8;

1656

        Matrix33 MatF, MatH;

1657

        //copy the matrix

1658

        memcpy(MatF.mat, F, 9 * sizeof(float));

1659

        memcpy(MatH.mat, H, 9 * sizeof(float));

1660

        //thread blocks

1661

        dim3 grid(        (num2 + MULT_BLOCK_DIMX - 1)/ MULT_BLOCK_DIMX,

1662

                (num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY);

1663

        dim3 block(MULT_TBLOCK_DIMX, MULT_TBLOCK_DIMY);

1664

        //intermediate results

1665

        texDot->InitTexture( num2,num1);

1666

        if(texCRT) texCRT->InitTexture( num2, (num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY, 3);

1667

        loc1->BindTexture(texLoc1);

1668

        loc2->BindTexture(texLoc2);

1669

        des1->BindTexture(texDes1);

1670

        des2->BindTexture(texDes2);

1671

        MultiplyDescriptorG_Kernel<<<grid, block>>>((int*)texDot->_cuData, num1, num2,

1672

                                                                                                (texCRT? (int3*)texCRT->_cuData : NULL),

1673

                                                                                                MatH, hdistmax, MatF, fdistmax);

texture<int,  1, cudaReadModeElementType> texDOT;

1678

1679

#define ROWMATCH_BLOCK_WIDTH 32

1680

#define ROWMATCH_BLOCK_HEIGHT 1

1681

1682

void __global__  RowMatch_Kernel(int*d_dot, int* d_result, int num2, float distmax, float ratiomax)

1683

1684

#if ROWMATCH_BLOCK_HEIGHT == 1

1685

        __shared__ int dotmax[ROWMATCH_BLOCK_WIDTH];

1686

        __shared__ int dotnxt[ROWMATCH_BLOCK_WIDTH];

1687

        __shared__ int dotidx[ROWMATCH_BLOCK_WIDTH];

1688

        int        row = blockIdx.y;

1689

#else

1690

        __shared__ int x_dotmax[ROWMATCH_BLOCK_HEIGHT][ROWMATCH_BLOCK_WIDTH];

1691

        __shared__ int x_dotnxt[ROWMATCH_BLOCK_HEIGHT][ROWMATCH_BLOCK_WIDTH];

1692

        __shared__ int x_dotidx[ROWMATCH_BLOCK_HEIGHT][ROWMATCH_BLOCK_WIDTH];

1693

        int*        dotmax = x_dotmax[threadIdx.y];

1694

        int*        dotnxt = x_dotnxt[threadIdx.y];

1695

        int*        dotidx = x_dotidx[threadIdx.y];

1696

        int row = IMUL(blockIdx.y, ROWMATCH_BLOCK_HEIGHT) + threadIdx.y;

1697

#endif

1698

1699

        int base_address = IMUL(row , num2);

1700

        int t_dotmax = 0, t_dotnxt = 0, t_dotidx = -1;

1701

        for(int i = 0; i < num2; i += ROWMATCH_BLOCK_WIDTH)

1702

1703

                if(threadIdx.x + i < num2)

1704

1705

                        int v = tex1Dfetch(texDOT, base_address + threadIdx.x + i);//d_dot[base_address + threadIdx.x + i];//

1706

                        bool test = v > t_dotmax;

1707

                        t_dotnxt = test? t_dotmax : max(t_dotnxt, v);

1708

                        t_dotidx = test? (threadIdx.x + i) : t_dotidx;

1709

                        t_dotmax = test? v: t_dotmax;

1710

1711

                __syncthreads();

1712

1713

        dotmax[threadIdx.x] = t_dotmax;

1714

        dotnxt[threadIdx.x] = t_dotnxt;

1715

        dotidx[threadIdx.x] = t_dotidx;

1716

        __syncthreads();

1717

1718

#pragma unroll

1719

        for(int step = ROWMATCH_BLOCK_WIDTH/2; step >0; step /= 2)

1720

1721

                if(threadIdx.x < step)

1722

1723

                        int v1 = dotmax[threadIdx.x], v2 = dotmax[threadIdx.x + step];

1724

                        bool test =  v2 > v1;

1725

                        dotnxt[threadIdx.x] = test? max(v1, dotnxt[threadIdx.x + step]) :max(dotnxt[threadIdx.x], v2);

1726

                        dotidx[threadIdx.x] = test? dotidx[threadIdx.x + step] : dotidx[threadIdx.x];

1727

                        dotmax[threadIdx.x] = test? v2 : v1;

1728

1729

                __syncthreads();

1730

1731

        if(threadIdx.x == 0)

1732

1733

                float dist =  acos(min(dotmax[0] * 0.000003814697265625f, 1.0));

1734

                float distn = acos(min(dotnxt[0] * 0.000003814697265625f, 1.0));

1735

                //float ratio = dist / distn;

1736

                d_result[row] = (dist < distmax) && (dist < distn * ratiomax) ? dotidx[0] : -1;//?  : -1;

void ProgramCU::GetRowMatch(CuTexImage* texDot, CuTexImage* texMatch, float distmax, float ratiomax)

1743

1744

        int num1 = texDot->GetImgHeight();

1745

        int num2 = texDot->GetImgWidth();

1746

        dim3 grid(1, num1/ROWMATCH_BLOCK_HEIGHT);

1747

        dim3 block(ROWMATCH_BLOCK_WIDTH, ROWMATCH_BLOCK_HEIGHT);

1748

        texDot->BindTexture(texDOT);

1749

        RowMatch_Kernel<<<grid, block>>>((int*)texDot->_cuData,

1750

                (int*)texMatch->_cuData, num2, distmax, ratiomax);

1751

1752

1753

#define COLMATCH_BLOCK_WIDTH 32

1754

1755

//texture<int3,  1, cudaReadModeElementType> texCT;

1756

1757

void __global__  ColMatch_Kernel(int3*d_crt, int* d_result, int height, int num2, float distmax, float ratiomax)

1758

1759

        int col = COLMATCH_BLOCK_WIDTH * blockIdx.x + threadIdx.x;

1760

        if(col >= num2) return;

1761

        int3 result = d_crt[col];//tex1Dfetch(texCT, col);

1762

        int read_idx = col + num2;

1763

        for(int i = 1; i < height; ++i, read_idx += num2)

1764

1765

                int3 temp = d_crt[read_idx];//tex1Dfetch(texCT, read_idx);

1766

                result = result.x < temp.x?

1767

                        make_int3(temp.x, temp.y, max(result.x, temp.z)) :

1768

                        make_int3(result.x, result.y, max(result.z, temp.x));

1769

1770

1771

        float dist =  acos(min(result.x * 0.000003814697265625f, 1.0));

1772

        float distn = acos(min(result.z * 0.000003814697265625f, 1.0));

1773

                //float ratio = dist / distn;

1774

        d_result[col] = (dist < distmax) && (dist < distn * ratiomax) ? result.y : -1;//?  : -1;

void ProgramCU::GetColMatch(CuTexImage* texCRT, CuTexImage* texMatch, float distmax, float ratiomax)

1779

1780

        int height = texCRT->GetImgHeight();

1781

        int num2 = texCRT->GetImgWidth();

1782

        //texCRT->BindTexture(texCT);

1783

    dim3 grid((num2 + COLMATCH_BLOCK_WIDTH -1) / COLMATCH_BLOCK_WIDTH);

1784

    dim3 block(COLMATCH_BLOCK_WIDTH);

1785

        ColMatch_Kernel<<<grid, block>>>((int3*)texCRT->_cuData, (int*) texMatch->_cuData, height, num2, distmax, ratiomax);

1786

1787

1788

#endif

Project

General

Profile

Quadrotor

root / rgbdslam / external / siftgpu / src / SiftGPU / ProgramCU.cu @ 9240aaa3