Wednesday, 2 April 2008

First Hello World program from CUDA

HELLO WORLD (CUDA)


Step 1:

cp -r template HelloWorld

NOTE: The CUDA utility documentation is in common/cutil_readme.txt

step 2:

compile using make (for testing)

Step 3:

Replace "template" with "HelloWorld" in the Makefile

Open HelloWorld.cu and change the include of template_kernel.cu to:

// includes, kernels
#include <HelloWorld_kernel.cu>

NOTES:-

CUT_DEVICE_INIT: all of the CUDA device initialization happens in here

CUDA_SAFE_CALL: the wrapper to use when calling CUDA routines

Limitation is up to 32 threads


The kernel (HelloWorld_kernel.cu) is going to execute on the GPU.

HelloWorld.cu and everything else will run on the CPU.


Step 4:

Compile using make (should come up without any error)

NOTE: In this default example we use the global memory area.

Allocating the device buffer with cudaMalloc makes sure the data transfer is optimized.


Step 5:

remove the reference code from the HelloWorld.cu file

NOTE:

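* h_odata - CPU (host) data, named cpu_odata in the final program below
* d_odata - GPU (device) data, named gpu_odata in the final program below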

------------------------------------------------
Final Working program
------------------------------------------------

==========================
HelloWorld_Kernel
==========================

/*
* Copyright 2008, Karen Hains, UWA (University of Western Australia).
* All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software.
*
* WE MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND.
*/

#ifndef _HELLOWORLD_KERNEL_H_
#define _HELLOWORLD_KERNEL_H_

#include <stdio.h>


///////////////////////////////////////////////////////////
// Simple Hello World kernel
//
// Each thread writes the text
//   "Hello World From Thread <threadIdx.x> in Block <blockIdx.x>"
// into its own slot of `size` bytes that starts at
// gpu_odata[threadIdx.x * size], then pads the rest of the slot with
// spaces. No terminating '\0' is written. Text that does not fit in
// `size` bytes is truncated instead of spilling into the next slot.
//
// NOTE: the slot offset depends on threadIdx.x only, so threads that
// share a threadIdx.x in different blocks write to the same slot.
//
// @param size      number of bytes in each thread's slot; nothing is
//                  written when size <= 0
// @param gpu_odata output data in global memory; must hold at least
//                  blockDim.x * size bytes
///////////////////////////////////////////////////////////

// Stores `c` at slot[pos] if pos lies inside the slot; returns the next
// write position (unchanged when the slot is already full).
static __device__ int HelloWorld_putChar(char *slot, int size, int pos, char c)
{
    if (pos < size)
        slot[pos++] = c;
    return pos;
}

// Appends the decimal digits of `value` to the slot, most significant
// digit first; returns the next write position.
static __device__ int HelloWorld_putUInt(char *slot, int size, int pos,
                                         unsigned int value)
{
    unsigned int divisor = 1;

    // Find the largest power of ten not exceeding value. Comparing the
    // quotient (rather than multiplying past value) cannot overflow.
    while (value / divisor >= 10)
        divisor *= 10;

    for (; divisor > 0; divisor /= 10)
        pos = HelloWorld_putChar(slot, size, pos,
                                 (char)('0' + (value / divisor) % 10));

    return pos;
}

__global__ void HelloWorld_kernel(int size, char *gpu_odata)
{
    // Fixed parts of the message, spelled out as character arrays
    // (deliberately without a terminating '\0').
    const char greeting[] = {'H','e','l','l','o',' ',
                             'W','o','r','l','d',' ',
                             'F','r','o','m',' ',
                             'T','h','r','e','a','d',' '};
    const char separator[] = {' ','i','n',' ',
                              'B','l','o','c','k',' '};
    char *slot;
    int pos;
    unsigned int c;

    // A non-positive slot size leaves no room to write anything.
    if (size <= 0)
        return;

    // Start of this thread's slot in global memory.
    slot = gpu_odata + (size_t)threadIdx.x * (size_t)size;
    pos = 0;

    // "Hello World From Thread "
    for (c = 0; c < sizeof(greeting); c++)
        pos = HelloWorld_putChar(slot, size, pos, greeting[c]);

    // Thread id as decimal text
    pos = HelloWorld_putUInt(slot, size, pos, threadIdx.x);

    // " in Block "
    for (c = 0; c < sizeof(separator); c++)
        pos = HelloWorld_putChar(slot, size, pos, separator[c]);

    // Block id as decimal text
    pos = HelloWorld_putUInt(slot, size, pos, blockIdx.x);

    // Fill out rest of string
    while (pos < size)
        slot[pos++] = ' ';
}

#endif // #ifndef _HELLOWORLD_KERNEL_H_


==========================

HelloWorld.cu

==========================

/*

* Copyright 2008, Karen Hains, UWA . All rights reserved.

*

* NOTICE TO USER:

*

* This source code is subject to NVIDIA ownership rights under U.S. and

* international Copyright laws. Users and possessors of this source code

* are hereby granted a nonexclusive, royalty-free license to use this code

* in individual and commercial software.

*

* WE MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE

* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR

* IMPLIED WARRANTY OF ANY KIND.

*/

/* HelloWorld Project
* This project demonstrates the basics of how to set up
* an example GPU Computing application.
*
* This file contains the CPU (host) code.
*/

// Host defines
#define NUM_THREADS 32
#define STR_SIZE 50

// Includes
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// CUDA includes
#include <cutil.h> // CUDA Utility Tools

// GPU kernel declarations - brought in by including the kernel source file
#include <HelloWorld_kernel.cu>

//////////////////////
// Program main
//
// Launches HelloWorld_kernel with one block of NUM_THREADS threads,
// copies the NUM_THREADS * STR_SIZE bytes it produces back to the
// host, and prints one line per thread together with the elapsed time.
//////////////////////
int main( int argc, char** argv)
{
    // Host variables
    unsigned int i;
    size_t nBytes;
    unsigned int timer;
    unsigned int num_threads;
    char *cpu_odata;
    char *string;

    // GPU variables
    char *gpu_odata;
    int str_size;

    /////////////////////////////////////////////////////////////////////
    // This routine gets the number of GPUs existing in the computer
    // For each GPU (device) found, it checks to see if there is a GPU
    // that supports CUDA. If no GPU that supports CUDA is found,
    // the routine will exit
    /////////////////////////////////////////////////////////////////////
    CUT_DEVICE_INIT();

    /////////////////////////////////////////////////////////////////////
    // Create and start a timer called "timer"
    // Calls to create and start timers are enveloped in CUT_SAFE_CALL
    // This CUDA Utility Tool checks for errors upon return.
    // If an error is found, it prints out an error message, file name,
    // and line number in file where the error can be found
    /////////////////////////////////////////////////////////////////////
    timer = 0;
    CUT_SAFE_CALL(cutCreateTimer(&timer));
    CUT_SAFE_CALL(cutStartTimer(timer));

    // Initialize CPU variables and allocate required memory
    num_threads = (unsigned int) NUM_THREADS;
    nBytes = (size_t) num_threads * STR_SIZE * sizeof(char);

    // Scratch buffer for printing one thread's text. The kernel output
    // carries no terminating '\0', so reserve one extra byte for it.
    string = (char *) malloc(STR_SIZE + 1);
    if(!string) {
        printf("Cannot allocate string memory on CPU\n");
        exit(-1);
    }

    // Host copy of the complete kernel output
    cpu_odata = (char *) malloc(nBytes);
    if(!cpu_odata) {
        printf("Cannot allocate cpu_odata memory on CPU\n");
        exit(-1);
    }

    // Allocate GPU (device) memory and variables
    str_size = (int) STR_SIZE;
    CUDA_SAFE_CALL(cudaMalloc( (void**) &gpu_odata, nBytes));

    // Setup kernel execution parameters: one block of num_threads threads
    dim3 grid(1,1,1);
    dim3 threads(num_threads,1,1);

    // Execute the kernel on the GPU
    HelloWorld_kernel<<< grid, threads >>>(str_size, gpu_odata);

    // Check if kernel execution generated an error
    CUT_CHECK_ERROR("Kernel execution failed");

    // Copy result from GPU to CPU
    CUDA_SAFE_CALL(cudaMemcpy(cpu_odata,gpu_odata,nBytes,cudaMemcpyDeviceToHost));

    // Stop the timer
    CUT_SAFE_CALL(cutStopTimer(timer));
    printf( "Processing time: %f (ms)\n", cutGetTimerValue(timer));

    // Delete the timer
    CUT_SAFE_CALL(cutDeleteTimer(timer));

    // Print the text produced by each thread. Every slot is exactly
    // STR_SIZE bytes with no '\0', so terminate the copy explicitly
    // before handing it to printf.
    for(i=0;i<num_threads;i++) {
        memcpy(string,&cpu_odata[i*STR_SIZE],STR_SIZE);
        string[STR_SIZE] = '\0';
        printf("From thread %u: %s\n",i,string);
    }

    //////////////////////////////////////////
    // All done - clean up and exit
    //////////////////////////////////////////
    // Free up CPU memory
    free(string);
    free(cpu_odata);

    // Free up GPU memory
    CUDA_SAFE_CALL(cudaFree(gpu_odata));

    // Use CUDA Utility Tool to exit cleanly
    CUT_EXIT(argc, argv);
}

==========================
Makefile

==========================

################################################################################
#
# Build script for the HelloWorld project
#
# This file only sets variables; the actual build rules come from the
# common.mk file included at the bottom (its contents are not shown here,
# so the exact use of each variable should be confirmed there).
#
################################################################################

# Add source files here
# Name of the program to build
EXECUTABLE := HelloWorld
# CUDA source files (compiled with cudacc)
CUFILES := HelloWorld.cu
# CUDA dependency files
# HelloWorld_kernel.cu is #included by HelloWorld.cu rather than compiled
# on its own. The blank line after the last backslash ends the list.
CU_DEPS := \
HelloWorld_kernel.cu \

# C/C++ source files (compiled with gcc / c++)
# None for this project; the list is intentionally empty.
CCFILES := \



################################################################################
# Rules and targets

# Shared rules, found two directories up under common/
include ../../common/common.mk


Ref : Western Australian Super computer Program, University of Western Australia , 2008, April, 1- 3, 12-2 pm.

4 comments:

  1. too many typos... skip this page if you can..

    ReplyDelete
  2. That's &lt; with < and &gt; with >

    ReplyDelete
  3. Meh. Not that big a deal. Replace &lt; with < and &gt; with >, replace rsize with size, and add argc, argv to the init call. Do all that and it's a pretty decent example.

    ReplyDelete
  4. Who knows where to download XRumer 5.0 Palladium?
    Help, please. All recommend this program to effectively advertise on the Internet, this is the best program!

    ReplyDelete