\documentclass[10pt,a4paper]{article}

% Packages
\usepackage{fancyhdr}           % For header and footer
\usepackage{multicol}           % Allows multicols in tables
\usepackage{tabularx}           % Intelligent column widths
\usepackage{tabulary}           % Used in header and footer
\usepackage{hhline}             % Border under tables
\usepackage{graphicx}           % For images
\usepackage{xcolor}             % For hex colours
%\usepackage[utf8x]{inputenc}    % For unicode character support
\usepackage[T1]{fontenc}        % Without this we get weird character replacements
\usepackage{colortbl}           % For coloured tables
\usepackage{setspace}           % For line height
\usepackage{lastpage}           % Needed for total page number
\usepackage{seqsplit}           % Splits long words.
%\usepackage{opensans}          % Can't make this work so far. Shame. Would be lovely.
\usepackage[normalem]{ulem}     % For underlining links
% Most of the following are not required for the majority
% of cheat sheets but are needed for some symbol support.
\usepackage{amsmath}            % Symbols
\usepackage{MnSymbol}           % Symbols
\usepackage{wasysym}            % Symbols
%\usepackage[english,german,french,spanish,italian]{babel}              % Languages

% Document Info
\author{m\_amendola}
% PDF metadata. The argument of \pdfinfo is expanded and written to the PDF
% as raw text rather than typeset, so text-mode macros such as \_ must not be
% used inside it: they would be expanded into their internal definition and
% corrupt the string. A literal underscore is written out verbatim.
\pdfinfo{
  /Title (cuda-programming.pdf)
  /Creator (Cheatography)
  /Author (m_amendola)
  /Subject (CUDA Programming Cheat Sheet)
}

% Lengths and widths
% Widen the text block by 6cm and pull it 3cm to the left; shorten it by 1cm
% and pull it 2cm upwards.
\addtolength{\textwidth}{6cm}
\addtolength{\textheight}{-1cm}
\addtolength{\hoffset}{-3cm}
\addtolength{\voffset}{-2cm}
\setlength{\tabcolsep}{0.2cm} % Horizontal padding on each side of a table cell
\setlength{\headsep}{-12pt} % Reduce space between header and content
\setlength{\headheight}{85pt} % If less, LaTeX automatically increases it
\renewcommand{\footrulewidth}{0pt} % Remove footer line
\renewcommand{\headrulewidth}{0pt} % Remove header line
% Break points inserted by seqsplit: a plain break in math mode,
% a discretionary hyphen in text mode.
\renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit
% These two commands together give roughly
% the right line height in the tables
\renewcommand{\arraystretch}{1.3}
\onehalfspacing

% Commands
% \SetRowColor{<colour>}: stores the colour name globally in \RowColorName
% (inside \noalign, so it may be used between table rows) and then colours
% the following row with it.
\newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}} % Shortcut for row colour
% \mymulticolumn{<cols>}{<colspec>}{<text>}: a \multicolumn whose column
% spec is prefixed with \columncolor{\RowColorName}, i.e. the colour most
% recently set by \SetRowColor.
\newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}} % For coloured multi-cols
% x{<width>}: a p{<width>} column typeset ragged-right.
\newcolumntype{x}[1]{>{\raggedright}p{#1}} % New column types for ragged-right paragraph columns
% \tn: short alias for \tabularnewline, used to end every table row.
\newcommand{\tn}{\tabularnewline} % Required as custom column type in use

% Font and Colours
% HeadBackground is defined here but not referenced in this file.
\definecolor{HeadBackground}{HTML}{333333}
% FootBackground: title rows of the footer tables.
\definecolor{FootBackground}{HTML}{666666}
% TextColor: default text colour, activated by \color below.
\definecolor{TextColor}{HTML}{333333}
% DarkBackground: table title rows, bottom rules and header accents.
\definecolor{DarkBackground}{HTML}{32A321}
% LightBackground: the tinted rows that alternate with white table rows.
\definecolor{LightBackground}{HTML}{F2F9F1}
% Use the sans-serif family as the document default.
\renewcommand{\familydefault}{\sfdefault}
\color{TextColor}

% Header and Footer
\pagestyle{fancy}
\fancyhead{} % Set header to blank
\fancyfoot{} % Set footer to blank
% Left header field: a multicols layout holding the logo table and,
% after the column break, the title/byline table.
\fancyhead[L]{
\noindent
\begin{multicols}{3}
% Logo table: a single centred cell on a DarkBackground row.
\begin{tabulary}{5.8cm}{C}
    \SetRowColor{DarkBackground}
    \vspace{-7pt}
    % NOTE(review): the logo is referenced by an absolute path; the file
    % must exist at exactly this location for the document to compile.
    {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent
        \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}}
    }
\end{tabulary}
\columnbreak
% Title table: cheat sheet title on the first row, author and
% underlined source link on the second.
\begin{tabulary}{11cm}{L}
    \vspace{-2pt}\large{\bf{\textcolor{DarkBackground}{\textrm{CUDA Programming Cheat Sheet}}}} \\
    \normalsize{by \textcolor{DarkBackground}{m\_amendola} via \textcolor{DarkBackground}{\uline{cheatography.com/165228/cs/35194/}}}
\end{tabulary}
\end{multicols}}

% Left footer field: three tables side by side (author, publication
% details, sponsor), each with a FootBackground title row.
\fancyfoot[L]{ \footnotesize
\noindent
\begin{multicols}{3}
% Footer column 1: author name and profile link.
\begin{tabulary}{5.8cm}{LL}
  \SetRowColor{FootBackground}
  \mymulticolumn{2}{p{5.377cm}}{\bf\textcolor{white}{Cheatographer}}  \\
  \vspace{-2pt}m\_amendola \\
  \uline{cheatography.com/m-amendola} \\
  \end{tabulary}
\vfill
\columnbreak
% Footer column 2: publication dates and "Page x of y"; the total comes
% from the LastPage label provided by the lastpage package.
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Cheat Sheet}}  \\
   \vspace{-2pt}Published 22nd July, 2023.\\
   Updated 3rd November, 2022.\\
   Page {\thepage} of \pageref{LastPage}.
\end{tabulary}
\vfill
\columnbreak
% Footer column 3: sponsor text; the sponsor image is currently disabled.
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Sponsor}}  \\
  \SetRowColor{white}
  \vspace{-5pt}
  %\includegraphics[width=48px,height=48px]{dave.jpeg}
  Measure your website readability!\\
  www.readability-score.com
\end{tabulary}
\end{multicols}}


\begin{document}
\raggedright
\raggedcolumns

% Set font size to small. Switch to any value
% from this page to resize cheat sheet text:
% www.emerson.emory.edu/services/latex/latex_169.html
\footnotesize % Small font.

\begin{multicols*}{2}

\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Cuda Kernels}}  \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{A CUDA Kernel function is defined using the \_\_global\_\_ keyword.} \tn 
% Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{A Kernel is executed N times in parallel by N different threads on the device} \tn 
% Row Count 4 (+ 2)
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{Each thread has a unique ID stored in the built-in {\emph{threadIdx}} variable, a struct with components x,y,z.} \tn 
% Row Count 7 (+ 3)
% Row 3
\SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{Each thread block has a unique ID stored in the built-in {\emph{blockIdx}} variable, a struct with components x,y,z.} \tn 
% Row Count 10 (+ 3)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{x{3.44 cm} x{4.56 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{Kernel Configuration}}  \tn
% Row 0
\SetRowColor{LightBackground}
Kernel Execution Configuration & `kernelFunction\textless{}\textless{}\textless{}num\_blocks, num\_threads\textgreater{}\textgreater{}\textgreater{}(params)` \tn 
% Row Count 3 (+ 3)
% Row 1
\SetRowColor{white}
num\_blocks & The number of thread blocks along each dimension of the grid. \tn 
% Row Count 6 (+ 3)
% Row 2
\SetRowColor{LightBackground}
num\_threads & The number of threads along each dimension of the thread block \tn 
% Row Count 9 (+ 3)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{CUDA Thread Organization}}  \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{Threads are grouped in blocks and can be organized in 1 to 3 dimensions.} \tn 
% Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{Blocks are grouped into grids which can be organized in 1 to 3 dimensions.} \tn 
% Row Count 4 (+ 2)
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{Blocks are executed independently.} \tn 
% Row Count 5 (+ 1)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{1D Grid of 1D Blocks}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{p{8.4cm}}{\vspace{1px}\centerline{\includegraphics[width=5.1cm]{/web/www.cheatography.com/public/uploads/m-amendola_1667466962_Immagine 2022-11-03 101315.png}}} \tn 
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{`int index = blockIdx.x * blockDim.x + threadIdx.x;`}  \tn 
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{1D Grid of 3D Blocks}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{p{8.4cm}}{\vspace{1px}\centerline{\includegraphics[width=5.1cm]{/web/www.cheatography.com/public/uploads/m-amendola_1667467096_2.png}}} \tn 
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{`int index = blockIdx.x * blockDim.x * blockDim.y * blockDim.z + threadIdx.z * blockDim.y * blockDim.x + threadIdx.y * blockDim.x + threadIdx.x;`}  \tn 
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{2D Grid of 2D Blocks applied on a Matrix}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{p{8.4cm}}{\vspace{1px}\centerline{\includegraphics[width=5.1cm]{/web/www.cheatography.com/public/uploads/m-amendola_1667468966_2D.png}}} \tn 
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{The index of each thread is identified by two coordinates i and j. \newline We can find i applying the rule of 1D Grid of 1D Blocks over the x axis: \newline `int i = blockIdx.x * blockDim.x + threadIdx.x;` \newline And we can find j applying the rule of 1D Grid of 1D Blocks over the y axis: \newline `int j = blockIdx.y * blockDim.y + threadIdx.y;` \newline Thus, knowing that a row in the grid is {\emph{gridDim.x times blockDim.x}} elements wide, we can calculate the index: \newline `int index = j * gridDim.x * blockDim.x + i;`}  \tn 
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{x{3.84 cm} x{4.16 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{CUDA Events}}  \tn
% Row 0
\SetRowColor{LightBackground}
Declaring a Cuda Event & `cudaEvent\_t event;` \tn 
% Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
Allocating the event & \seqsplit{`cudaEventCreate(\&event);`} \tn 
% Row Count 4 (+ 2)
% Row 2
\SetRowColor{LightBackground}
Recording the Event. & \seqsplit{`cudaEventRecord(event);`} \tn 
% Row Count 6 (+ 2)
% Row 3
\SetRowColor{white}
Synchronizing the event & \seqsplit{`cudaEventSynchronize(event);`} \tn 
% Row Count 8 (+ 2)
% Row 4
\SetRowColor{LightBackground}
Find elapsed time between two events & \seqsplit{`cudaEventElapsedTime(\&elapsed}, a, b);` \tn 
% Row Count 10 (+ 2)
% Row 5
\SetRowColor{white}
Free event variables & \seqsplit{`cudaEventDestroy(event);`} \tn 
% Row Count 12 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{CUDA Streams}}  \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{GPU operations on CUDA use execution queues called streams.} \tn 
% Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{Operations pushed in a stream are executed according to a FIFO policy.} \tn 
% Row Count 4 (+ 2)
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{There is a default Stream, called {\emph{stream 0}}.} \tn 
% Row Count 5 (+ 1)
% Row 3
\SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{Operations pushed in a non-default stream will be executed after all operations on default stream are emptied.} \tn 
% Row Count 8 (+ 3)
% Row 4
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{Operations assigned to default stream introduce implicit synchronization barriers among other streams.} \tn 
% Row Count 11 (+ 3)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{x{4.48 cm} x{3.52 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{CUDA Streams API}}  \tn
% Row 0
\SetRowColor{LightBackground}
Create a stream & \seqsplit{`cudaStreamCreate(\&stream1);`} \tn 
% Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
Deallocate a stream & \seqsplit{`cudaStreamDestroy(stream)`} \tn 
% Row Count 4 (+ 2)
% Row 2
\SetRowColor{LightBackground}
Block host until all operations on a stream are completed. & \seqsplit{`cudaStreamSynchronize(stream);`} \tn 
% Row Count 7 (+ 3)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{We can use streams to obtain the concurrent execution of the same kernel or different kernels.}  \tn 
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{x{4.16 cm} x{3.84 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{Synchronization operations}}  \tn
% Row 0
\SetRowColor{LightBackground}
Explicit Synchronization & Implicit Synchronization \tn 
% Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
{\emph{cudaDeviceSynchronize()}} blocks host code until all operations on device are completed & Operations assigned to default stream \tn 
% Row Count 7 (+ 5)
% Row 2
\SetRowColor{LightBackground}
{\emph{cudaStreamWaitEvent(stream, event)}} blocks all operations assigned to a stream until event is reached. & Memory Allocations on device \tn 
% Row Count 13 (+ 6)
% Row 3
\SetRowColor{white}
 & Settings operations on device \tn 
% Row Count 15 (+ 2)
% Row 4
\SetRowColor{LightBackground}
 & Page-locked memory allocations \tn 
% Row Count 17 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{CUDA API}}  \tn
\SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{\seqsplit{https://docs.nvidia.com/cuda/cuda-runtime-api/index.html}% Row Count 2 (+ 2)
} \tn 
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Memory Workflow}}  \tn
\SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{First we allocate and "build" the input on the {\bf{host}}. \newline % Row Count 2 (+ 2)
Then we allocate dynamic memory on the {\bf{device}}, obtaining pointers to the allocated memory areas. \newline % Row Count 5 (+ 3)
Finally, we {\bf{initialize}} the memory on the device and we {\bf{copy}} the memory from the host to the device. \newline % Row Count 8 (+ 3)
At the end of the computation, we may want to copy the memory from the device to the host.% Row Count 10 (+ 2)
} \tn 
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{Copy operation is {\emph{blocking}}.}  \tn 
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{x{2.204 cm} x{2.66 cm} x{2.736 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{3}{x{8.4cm}}{\bf\textcolor{white}{Memory Allocation API Functions}}  \tn
% Row 0
\SetRowColor{LightBackground}
Dynamic memory allocation & `cudaMalloc ((void **) \&u\_dev, \seqsplit{N*sizeof(double));`} & {\emph{u\_dev}} is the pointer to the allocated variable \tn 
% Row Count 4 (+ 4)
% Row 1
\SetRowColor{white}
Memory \seqsplit{Initialization} on device & \seqsplit{`cudaMemset(void} *devPtr, int val, size\_t count);` & {\emph{devPtr}} is a pointer to the device address space. The function fills the first {\emph{count}} bytes of the memory area with the constant byte value {\emph{val}}. \tn 
% Row Count 15 (+ 11)
% Row 2
\SetRowColor{LightBackground}
Copying data from host to device & \seqsplit{`cudaMemcpy(void} *dst, void *src, size\_t size, \seqsplit{cudaMemcpyHostToDevice);`} & {\emph{dst}} is the destination address, {\emph{src}} is the source address, size is the size in bytes of data to copy and the last parameter is the direction of the copy. \tn 
% Row Count 27 (+ 12)
% Row 3
\SetRowColor{white}
Copying data from device to host & \seqsplit{`cudaMemcpy(void} *dst, void *src, size\_t size, \seqsplit{cudaMemcpyDeviceToHost);`} &  \tn 
% Row Count 33 (+ 6)
\hhline{>{\arrayrulecolor{DarkBackground}}---}
\SetRowColor{LightBackground}
\mymulticolumn{3}{x{8.4cm}}{After 4.0, CUDA supports {\bf{Unified Virtual Addressing}} meaning that the system itself knows where the buffer is allocated. The {\emph{direction}} parameter must be set to {\bf{cudaMemcpyDefault}}.}  \tn 
\hhline{>{\arrayrulecolor{DarkBackground}}---}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{x{4 cm} x{4 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{Global Memory}}  \tn
% Row 0
\SetRowColor{LightBackground}
Declaring a static variable & `\_\_device\_\_ type variable\_name;` \tn 
% Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
Declaring a dynamic variable & `cudaMalloc((void **) \&ptr, size);` \tn 
% Row Count 4 (+ 2)
% Row 2
\SetRowColor{LightBackground}
Deallocating a dynamic variable & `cudaFree(ptr)` \tn 
% Row Count 6 (+ 2)
% Row 3
\SetRowColor{white}
Allocating an aligned 2D buffer where elements are padded so that each row is aligned & \seqsplit{`cudaMallocPitch(\&ptr}, \&pitch, width*sizeof(float), height)` \tn 
% Row Count 11 (+ 5)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{cudaMallocPitch returns an integer pitch (the row width in bytes) that can be used to access row elements with strided access. For example: \newline `float *row = (float *)((char *)devPtr + r * pitch);`}  \tn 
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{x{4.48 cm} x{3.52 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{Shared Memory}}  \tn
% Row 0
\SetRowColor{LightBackground}
Static variable declaration inside the kernel. & `\_\_shared\_\_ type shmem{[}SIZE{]};` \tn 
% Row Count 3 (+ 3)
% Row 1
\SetRowColor{white}
Dynamic variable allocation outside the kernel & `extern \_\_shared\_\_ type shmem{[}{]};` \tn 
% Row Count 6 (+ 3)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{x{3.52 cm} x{4.48 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{Constant memory}}  \tn
% Row 0
\SetRowColor{LightBackground}
Declaring a static variable & `\_\_constant\_\_ type variable\_name;` \tn 
% Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
Copy memory from host to device. & \seqsplit{`cudaMemcpyToSymbol(variable\_name}, \&host\_src, sizeof(type), 0, \seqsplit{cudaMemcpyHostToDevice);`} \tn 
% Row Count 6 (+ 4)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{We cannot declare a dynamic variable in constant memory.}  \tn 
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{x{4 cm} x{4 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{Texture Memory}}  \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{Managing texture memory} \tn 
% Row Count 1 (+ 1)
% Row 1
\SetRowColor{white}
Allocate global memory on device & `cudaMalloc(\&M, memsize)` \tn 
% Row Count 3 (+ 2)
% Row 2
\SetRowColor{LightBackground}
Create a texture reference. & `texture\textless{}datatype, dim\textgreater{} MtextureRef;` \tn 
% Row Count 5 (+ 2)
% Row 3
\SetRowColor{white}
Create a channel descriptor & \seqsplit{`cudaChannelFormatDesc} Mdesc = cudaCreateChannelDesc\textless{}datatype\textgreater{}();` \tn 
% Row Count 9 (+ 4)
% Row 4
\SetRowColor{LightBackground}
Bind the texture reference to memory. & `cudaBindTexture(0, MtextureRef, M, Mdesc)` \tn 
% Row Count 12 (+ 3)
% Row 5
\SetRowColor{white}
Unbind at the end. & \seqsplit{`cudaUnbindTexture(MtextureRef);`} \tn 
% Row Count 14 (+ 2)
% Row 6
\SetRowColor{LightBackground}
In order to access the texture memory, we can use the texture reference {\emph{MtextureRef}}. & \seqsplit{`tex1Dfetch(MtextureRef}, address);` \tn 
% Row Count 19 (+ 5)
% Row 7
\SetRowColor{white}
Accessing 2D cuda array. & \seqsplit{`tex2D(MtextureRef}, x, y);` \tn 
% Row Count 21 (+ 2)
% Row 8
\SetRowColor{LightBackground}
Accessing 3D cuda array. & \seqsplit{`tex3D(MtextureRef}, x, y, z);` \tn 
% Row Count 23 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{x{4 cm} x{4 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bf\textcolor{white}{Asynchronous Data Transfers}}  \tn
% Row 0
\SetRowColor{LightBackground}
Allocates page-locked memory on the host. & \seqsplit{`cudaMallocHost(buffer}, size)` \tn 
% Row Count 3 (+ 3)
% Row 1
\SetRowColor{white}
Frees page-locked memory. & \seqsplit{`cudaFreeHost(buffer)`} \tn 
% Row Count 5 (+ 2)
% Row 2
\SetRowColor{LightBackground}
Registers an existing host memory range for use by CUDA. & \seqsplit{`cudaHostRegister()`} \tn 
% Row Count 8 (+ 3)
% Row 3
\SetRowColor{white}
Unregisters a memory range that was registered with cudaHostRegister. & \seqsplit{`cudaHostUnregister()`} \tn 
% Row Count 12 (+ 4)
% Row 4
\SetRowColor{LightBackground}
Copies data between host and device. & \seqsplit{`cudaMemcpyAsync(dest\_buffer}, src\_buffer, size, direction, stream)` \tn 
% Row Count 17 (+ 5)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{These operations must be queued into a non-default stream.}  \tn 
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Page-locked Memory}}  \tn
\SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{{\bf{Pageable memory}} is memory which is allowed to be paged in or paged out whereas {\bf{page-locked memory}} is memory not allowed to be paged in or paged out. \newline % Row Count 4 (+ 4)
{\emph{Page out}} is moving data from RAM to HDD, while {\emph{page in}} means moving data from HDD to RAM. These operations occur when the main memory does not have enough free space.% Row Count 8 (+ 4)
} \tn 
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{Source: \seqsplit{https://leimao.github.io/blog/Page-Locked-Host-Memory-Data-Transfer/}}  \tn 
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Error Handling}}  \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{All CUDA API functions return an error code of type {\emph{cudaError}}.} \tn 
% Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{The constant {\emph{cudaSuccess}} means no error.} \tn 
% Row Count 3 (+ 1)
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{{\emph{cudaGetLastError}} returns the status of the internal error variable. Calling this function resets the internal error to cudaSuccess.} \tn 
% Row Count 6 (+ 3)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Macro for Error Handling}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{\#define CUDA\_CHECK(X) \{\textbackslash{} \newline cudaError\_t \_m\_cudaStat = X;\textbackslash{} \newline if(cudaSuccess != \_m\_cudaStat) \{\textbackslash{} \newline fprintf(stderr,"\textbackslash{}nCUDA\_ERROR: \%s in file \%s line \%d\textbackslash{}n",\textbackslash{} \newline cudaGetErrorString(\_m\_cudaStat), \_\_FILE\_\_, \_\_LINE\_\_);\textbackslash{} \newline exit(1);\textbackslash{} \newline \} \} \newline ... \newline CUDA\_CHECK( cudaMemcpy(d\_buf, h\_buf, buffSize, \newline cudaMemcpyHostToDevice) );} \tn 
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}


% That's all folks
\end{multicols*}

\end{document}