KEMBAR78
6 Computation | PDF | Computer Hardware | Computer Science
0% found this document useful (0 votes)
20 views11 pages

6 Computation

The document discusses CUDA programming concepts, focusing on kernel functions, thread divergence, and memory models. It includes code snippets illustrating thread execution and divergence issues, as well as an overview of different types of memory in CUDA, such as global, texture, and constant memory. The content emphasizes the importance of optimizing code to reduce thread divergence and improve performance in GPU computing.

Uploaded by

webbstu1
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
20 views11 pages

6 Computation

The document discusses CUDA programming concepts, focusing on kernel functions, thread divergence, and memory models. It includes code snippets illustrating thread execution and divergence issues, as well as an overview of different types of memory in CUDA, such as global, texture, and constant memory. The content emphasizes the importance of optimizing code to reduce thread divergence and improve performance in GPU computing.

Uploaded by

webbstu1
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 11

CUDA Programming

Recap
__global__
__global__voidvoiddkernel(unsigned
dkernel(unsigned*vector,
*vector,unsigned
unsignedvectorsize)
vectorsize){{
int
intidid==blockIdx.x
blockIdx.x**blockDim.x
blockDim.x++threadIdx.x;
threadIdx.x; S0
ifif(id
(id%%2)2)vector[id] id; S1
vector[id]==id;
else
elsevector[id]
vector[id]==vectorsize
vectorsize**vectorsize;
vectorsize; S2
vector[id]++;
vector[id]++; S3
}}

0 1 2 3 4 5 6 7

S0 S0 S0 S0 S0 S0 S0 S0 NOP

S1 S1 S1 S1
Time

S2 S2 S2 S2

S3 S3 S3 S3 S3 S3 S3 S3 2
Classwork

Rewrite the following program fragment to
remove thread-divergence.
assert(x
assert(x== ==yy||||xx==
==zz||||xx==
==ww););
ifif(x
(x==
==y)y)xx==zz++w;
w;
else
elseif(
if(xx==
==zz))xx==ww++y;y;
else
elsexx==yy++z;z;

assert(x
assert(x====yy||||xx==
==zz||||xx==
==ww););
xx==yy++zz++ww––x;x;

3
Classwork

How many steps does warp threads take to
execute?
__global__
__global__voidvoiddkernel(unsigned
dkernel(unsigned*vector,
*vector,unsigned
unsignedvectorsize)
vectorsize){{
int
intid id==blockIdx.x
blockIdx.x**blockDim.x
blockDim.x++threadIdx.x;
threadIdx.x;
ifif((id
id<=
<=00 ){){
vector[id]
vector[id]==0;0;
for
for(int
(inti=1;i<=100;i++){
i=1;i<=100;i++){
vector[id]
vector[id]+=
+=i;i;
}}
}}
else{
else{
vector[id]
vector[id]==1;1;
}}
}} 4
Classwork

How many steps does warp threads take to
execute?
__global__
__global__voidvoiddkernel(unsigned
dkernel(unsigned*vector,
*vector,unsigned
unsignedvectorsize)
vectorsize){{
int
intid id==blockIdx.x
blockIdx.x**blockDim.x
blockDim.x++threadIdx.x;
threadIdx.x;
ifif((id
id<=
<=00 ){){
vector[id]
vector[id]==(101*100)
(101*100)//2;2;
}}
else{
else{
vector[id]
vector[id]==1;1;
}}
}}

5
Classwork

How many steps does warp threads take to
execute?
__global__
__global__void
voiddkernel(unsigned
dkernel(unsigned*vector,
*vector,unsigned
unsignedvectorsize)
vectorsize){{
int
intid
id==blockIdx.x
blockIdx.x**blockDim.x
blockDim.x++threadIdx.x;
threadIdx.x;
vector[id]
vector[id]==((11++(((-id)>>31)
(-id)>>31)))**((((101*100)
((101*100)//2)
2)--11))++11; ;
}}

6
Thread-Divergence
__global__
__global__void
voiddkernel(unsigned
dkernel(unsigned*vector,
*vector,unsigned
unsignedvectorsize)
vectorsize){{
unsigned
unsignedid id==blockIdx.x
blockIdx.x**blockDim.x
blockDim.x++threadIdx.x;
threadIdx.x;
switch
switch(id)
(id){{
case
case0:0:vector[id]
vector[id]==0;0; break;
break;
case
case1:1:vector[id]
vector[id]==vector[id];
vector[id]; break;
break;
case
case2:2:vector[id]
vector[id]==vector[id
vector[id--2];
2]; break;
break;
case
case3:3:vector[id]
vector[id]==vector[id
vector[id++3];
3]; break;
break;
case
case4:4:vector[id]
vector[id]==44++44++vector[id];
vector[id]; break;
break;
case
case5:5:vector[id]
vector[id]==55--vector[id];
vector[id]; break;
break;
case
case6:6:vector[id]
vector[id]==vector[6];
vector[6]; break;
break;
case
case7:7:vector[id]
vector[id]==77++7;7; break;
break;
case
case8:8:vector[id]
vector[id]==vector[id]
vector[id]++8;8; break;
break;
case
case9:9:vector[id]
vector[id]==vector[id]
vector[id]**9;9; break;
break;
}}
}}
How
How many
many steps
steps will
will the
the warp
warp threads
threads take?
take?
7
Thread-Divergence
__global__
__global__voidvoiddkernel()
dkernel()
{{
ifif(threadidx.x
(threadidx.x<16)
<16)
{{
printf(“Inside
printf(“InsideIf”);
If”);
Global_Barrier();
Global_Barrier();
}}
else
elseifif(threadidx
(threadidx>=16)
>=16)
{{
printf(“Inside
printf(“Insideelse”);
else”);
Global_Barrier();
Global_Barrier();
}}
}}

What
What is
is the
the Output?
Output?
Deadlock!!
Deadlock!! 8
Memory
Agenda

Computation

Memory

Synchronization

Functions

Support

Topics

10
CUDA Memory Model Overview
• Global / Video memory
– Main means of communicating data
Grid
between host and device
Block (0, 0)‫‏‬ Block (1, 0)‫‏‬
– Contents visible to all GPU threads
– Long latency access (400-800 cycles) Shared Memory Shared Memory

– Throughput ~200 GBPS Registers Registers Registers Registers

• Texture Memory Thread (0, 0)‫‏‬ Thread (1, 0)‫‏‬ Thread (0, 0)‫‏‬ Thread (1, 0)‫‏‬

– Read-only (12 KB)


– ~800 GBPS Host Global Memory

– Optimized for 2D spatial locality


• Constant Memory
– Read-only (64 KB)
1111
The numbers are typical values.

You might also like