侠客初出江湖分享 http://blog.sciencenet.cn/u/luliqiang

博文

创建你的code block, 以后复制粘贴写程序--cuda初级版

已有 11004 次阅读 2011-9-8 22:38 |系统分类:科研笔记

//----2011-08-18--lqlu@home.ipe.ac.cn---20:38:33--
// Deep-copy an array of structures that each hold a pointer to their own data.
// Pattern: copy each per-struct array to the device first, keep the resulting
// device pointers in a host-side "shadow" array (tFoo), then copy the shadow
// array itself to the device. A plain cudaMemcpy of hFoo would ship HOST
// pointers to the GPU and crash the kernel.
const int N = 3;   // number of structures
const int L = 2;   // length of each per-struct float array
struct Foo
{
  int number;   // element count stored in bp
  float * bp;   // host memory in hFoo, device memory in tFoo/dFoo
};
  Foo* dFoo;                      // device array of N structs
  Foo* hFoo = new Foo[N];         // host originals (bp -> host memory)
  Foo* tFoo = new Foo[N];         // host shadows   (bp -> device memory)
  for (int i = 0; i < N; ++i)     // init host data
  {
    hFoo[i].number = L;
    hFoo[i].bp = (float*)malloc(sizeof(float)*L);
  }
  for (int i = 0; i < N; ++i)     // copy each structure's array element
  {
    float * d_bp = NULL;
    cudaMalloc((void**)&d_bp, sizeof(float)*L);   // TODO: check return codes
    cudaMemcpy(d_bp, hFoo[i].bp, sizeof(float)*L, cudaMemcpyHostToDevice);
    tFoo[i].bp = d_bp;            // shadow struct carries the DEVICE pointer
    tFoo[i].number = hFoo[i].number;
  }
  cudaMalloc((void**)&dFoo, sizeof(Foo) * N);     // device struct-array alloc
  cudaMemcpy(dFoo, tFoo, sizeof(Foo)*N, cudaMemcpyHostToDevice);
  MyFunc<<<1,N>>>(dFoo);
  // Copy back: first the struct array (refreshes tFoo, whose bp fields are
  // still device pointers), then each per-struct float array into the host
  // originals, plus any scalar fields the kernel may have changed.
  cudaMemcpy(tFoo, dFoo, sizeof(Foo)*N, cudaMemcpyDeviceToHost);
  for (int i = 0; i < N; ++i)
  {
    cudaMemcpy(hFoo[i].bp, tFoo[i].bp, sizeof(float)*L, cudaMemcpyDeviceToHost);
    hFoo[i].number = tFoo[i].number;   // was silently dropped before
  }
  // Cleanup — the original snippet leaked every allocation made above.
  for (int i = 0; i < N; ++i)
  {
    cudaFree(tFoo[i].bp);
    free(hFoo[i].bp);
  }
  cudaFree(dFoo);
  delete[] hFoo;
  delete[] tFoo;
//========================================================================================================



// mapping a 2D thread grid onto a 1D array
// (original signature "__global__ kernel_(double, int,)" was invalid C++:
// missing return type, unnamed parameters, trailing comma, and it used
// `length` without declaring it)
__global__ void kernel_(double* data, int length)
{
    int thx = threadIdx.x + blockIdx.x * blockDim.x;   // global x index
    int thy = threadIdx.y + blockIdx.y * blockDim.y;   // global y index
    int offset = thx + thy * blockDim.x * gridDim.x;   // flattened 1D index
    if (offset >= length) return;   // guard: grid rarely divides data evenly
    // ... operate on data[offset] ...
}

// Host-side launch configuration for the 2D kernel above.
// (Original snippet had empty initializers, missing semicolons on the dim3
// lines, an empty argument list in the launch, and literal "n" where "\n"
// was meant in the printf.)
int  ttx = 1000;           // total threads needed along x (problem width)
int  tty = 1000;           // total threads needed along y (problem height)
dim3 threads(32, 32);      // 1024 threads per block
int  bnx = (ttx + (int)threads.x - 1) / (int)threads.x;   // integer ceil-div
int  bny = (tty + (int)threads.y - 1) / (int)threads.y;   // (no double round-trip)
//printf("tnx=%d,tny=%d,threads/block=%d\nbnx=%d,bny=%d,totalBlocks=%d\n",threads.x,threads.y,threads.x*threads.y,bnx,bny,bnx*bny);
dim3 blocks(bnx, bny);
kernel_<<<blocks, threads>>>(d_data, ttx * tty);   // d_data: device buffer — supply yours
cudaDeviceSynchronize();   // cudaThreadSynchronize() is deprecated
//CUT_CHECK_ERROR("!!!kernel_ failed!!!\n");
"//---------------------------------------------------------------------------------------------"    
//002: timing a kernel with CUDA events
cudaEvent_t start, stop;
float time;   // elapsed time in MILLISECONDS (the unit cudaEventElapsedTime uses)
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
kernel<<<grid,threads>>> ( d_odata, d_idata, size_x, size_y, NUM_REPS);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);   // block until the stop event has actually occurred
cudaEventElapsedTime(&time, start, stop);
// Fixed: the original printed (stop-start) — a subtraction of opaque event
// handles — with %d and no trailing semicolon, instead of the measured time.
printf("kernel spent: %f ms\n", time);
cudaEventDestroy(start);
cudaEventDestroy(stop);
"//-------------------use--shared--mem------------------------------------------------------------------------"    
// C = A*B where a TILE_DIM-wide slice of A is staged through shared memory so
// the global loads of A coalesce. Expects a TILE_DIM x TILE_DIM thread block,
// row-major matrices, and no tail guard (dimensions must be tile multiples).
__global__ void coalescedMultiply(float *a, float* b, float *c, int N)
{
    __shared__ float aTile[TILE_DIM][TILE_DIM];
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    float sum = 0.0f;
    aTile[threadIdx.y][threadIdx.x] = a[row*TILE_DIM+threadIdx.x];
    // Fix: each thread reads aTile elements WRITTEN BY OTHER THREADS in its
    // row; without a barrier that is a shared-memory data race. __syncwarp()
    // suffices only because a tile row is serviced by a single warp when
    // TILE_DIM == warpSize (32); use __syncthreads() for other tile sizes.
    __syncwarp();
    for (int i = 0; i < TILE_DIM; i++)
        sum += aTile[threadIdx.y][i]* b[i*N+col];
    c[row*N+col] = sum;
}
"//---------------------------------------------------------------------------------------------"    
// C = A*B with tiles of BOTH operands staged through shared memory; a single
// barrier separates the cooperative loads from the dot-product accumulation.
// Expects a TILE_DIM x TILE_DIM thread block, row-major layout, and matrix
// dimensions that are exact multiples of the tile (no tail guard).
__global__ void sharedABMultiply(float *a, float* b, float *c, int N)
{
    __shared__ float aTile[TILE_DIM][TILE_DIM], bTile[TILE_DIM][TILE_DIM];
    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    const int row = blockIdx.y * blockDim.y + ty;
    const int col = blockIdx.x * blockDim.x + tx;
    // Each thread deposits one element of each tile.
    aTile[ty][tx] = a[row * TILE_DIM + tx];
    bTile[ty][tx] = b[ty * N + col];
    __syncthreads();   // every load must land before anyone reads the tiles
    float acc = 0.0f;
    for (int k = 0; k < TILE_DIM; ++k)
        acc += aTile[ty][k] * bTile[k][tx];
    c[row * N + col] = acc;
}
"//----------------pitch---mem--------------------------------------------------------------------------"    
    float *cpu_A, *gpu_A;
        int height_A = M;//矩阵的高度(行数)
    int width_A = DIM; //矩阵的宽度(列数)
        int pitch_A_num;
        size_t pitch_A, mem_size_A, mem_size_row_A;   //GPU数组的pitch
        mem_size_row_A = sizeof(float)*width_A;
        mem_size_A = mem_size_row_A * height_A;
    cpu_A = (float*)malloc(mem_size_A);    
         cudaMallocPitch((void**) &gpu_A, &pitch_A, mem_size_row_A, height_A));
        pitch_A_num = pitch_A/sizeof(float);
        cutilSafeCall( cudaMemcpy2D(gpu_A, pitch_A, cpu_A, mem_size_row_A, mem_size_row_A, height_A, cudaMemcpyHostToDevice));
    //cudaMemcpy2D (void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind)
  cutilSafeCall( cudaMemcpy2D(cpu_C, mem_size_row_C, gpu_C, pitch_C*sizeof(float), mem_size_row_C, height_C, cudaMemcpyDeviceToHost));        
     
      // Kernel-body fragment (enclosing __global__ header not shown): one
      // thread computes one element of C = A*B for an M x N result with
      // inner dimension DIM.
      // NOTE(review): pitch_A/pitch_B/pitch_C must be row pitches in
      // ELEMENTS (e.g. pitch_A_num above), not the byte pitch returned by
      // cudaMallocPitch — confirm at the call site.
      int row = blockIdx.y * blockDim.y + threadIdx.y;
      int col = blockIdx.x * blockDim.x + threadIdx.x;
            // Guard threads that fall outside the result matrix.
            if(row>=M||col>=N) return;
            float sum = 0.0f;
            // Dot product: row `row` of A with column `col` of B.
            for(int i=0; i<DIM; i++){
                sum += a[row*pitch_A+i]*b[i*pitch_B+col];
            }
            c[row*pitch_C+col] = sum;




https://blog.sciencenet.cn/blog-325165-484290.html

上一篇:计算流体力学小程序源代码(FTCS\BTCS\CNCS)、RealTime version
下一篇:开了一半的题:颗粒物质相关
收藏 IP: 159.226.216.*| 热度|

0

该博文允许注册用户评论 请点击登录 评论 (0 个评论)

数据加载中...

Archiver|手机版|科学网 ( 京ICP备07017567号-12 )

GMT+8, 2024-7-17 20:17

Powered by ScienceNet.cn

Copyright © 2007- 中国科学报社

返回顶部