\documentclass[10pt,a4paper]{article}

% Packages
\usepackage{fancyhdr}          % For header and footer
\usepackage{multicol}          % Allows multicols in tables
\usepackage{tabularx}          % Intelligent column widths
\usepackage{tabulary}          % Used in header and footer
\usepackage{hhline}            % Border under tables
\usepackage{graphicx}          % For images
\usepackage{xcolor}            % For hex colours
%\usepackage[utf8x]{inputenc}  % For unicode character support
\usepackage[T1]{fontenc}       % Without this we get weird character replacements
\usepackage{colortbl}          % For coloured tables
\usepackage{setspace}          % For line height
\usepackage{lastpage}          % Needed for total page number
\usepackage{seqsplit}          % Splits long words.
%\usepackage{opensans}         % Can't make this work so far. Shame. Would be lovely.
\usepackage[normalem]{ulem}    % For underlining links
% Most of the following are not required for the majority
% of cheat sheets but are needed for some symbol support.
\usepackage{amsmath}           % Symbols
\usepackage{MnSymbol}          % Symbols
\usepackage{wasysym}           % Symbols
%\usepackage[english,german,french,spanish,italian]{babel} % Languages

% Document Info
\author{m\_amendola}
% \pdfinfo expands its argument into a raw PDF string; nothing is typeset,
% so the underscore is written literally here instead of as the text macro \_.
\pdfinfo{
  /Title (cuda-programming.pdf)
  /Creator (Cheatography)
  /Author (m_amendola)
  /Subject (CUDA Programming Cheat Sheet)
}

% Lengths and widths
\addtolength{\textwidth}{6cm}
\addtolength{\textheight}{-1cm}
\addtolength{\hoffset}{-3cm}
\addtolength{\voffset}{-2cm}
\setlength{\tabcolsep}{0.2cm}      % Space between columns
\setlength{\headsep}{-12pt}        % Reduce space between header and content
\setlength{\headheight}{85pt}      % If less, LaTeX automatically increases it
\renewcommand{\footrulewidth}{0pt} % Remove footer line
\renewcommand{\headrulewidth}{0pt} % Remove header line
\renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit
% These two commands together give roughly
% the right line height in the tables
\renewcommand{\arraystretch}{1.3}
\onehalfspacing

% Commands
% \SetRowColor{<colour>}: stores the colour name globally in \RowColorName
% (inside \noalign, so it is legal between table rows) and colours the row.
\newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}} % Shortcut for row colour
% \mymulticolumn{<n>}{<colspec>}{<text>}: \multicolumn whose cell is filled
% with the colour last stored by \SetRowColor.
\newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}} % For coloured multi-cols
\newcolumntype{x}[1]{>{\raggedright}p{#1}} % New column types for ragged-right paragraph columns
\newcommand{\tn}{\tabularnewline} % Required as custom column type in use

% Font and Colours
\definecolor{HeadBackground}{HTML}{333333}
\definecolor{FootBackground}{HTML}{666666}
\definecolor{TextColor}{HTML}{333333}
\definecolor{DarkBackground}{HTML}{32A321}
\definecolor{LightBackground}{HTML}{F2F9F1}
\renewcommand{\familydefault}{\sfdefault}
\color{TextColor}

% Header and Footer
\pagestyle{fancy}
\fancyhead{} % Set header to blank
\fancyfoot{} % Set footer to blank
% Header: logo on the left, title and author/link next to it.
\fancyhead[L]{
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{C}
    \SetRowColor{DarkBackground}
    \vspace{-7pt}
    {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent
        \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}}
    }
\end{tabulary}
\columnbreak
\begin{tabulary}{11cm}{L}
    \vspace{-2pt}\large\textbf{\textcolor{DarkBackground}{\textrm{CUDA Programming Cheat Sheet}}} \\
    \normalsize{by \textcolor{DarkBackground}{m\_amendola} via \textcolor{DarkBackground}{\uline{cheatography.com/165228/cs/35194/}}}
\end{tabulary}
\end{multicols}}
% Footer: three boxes (author, publication info with page count, sponsor).
\fancyfoot[L]{
\footnotesize
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{LL}
    \SetRowColor{FootBackground}
    \mymulticolumn{2}{p{5.377cm}}{\bfseries\textcolor{white}{Cheatographer}} \\
    \vspace{-2pt}m\_amendola \\
    \uline{cheatography.com/m-amendola} \\
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
    \SetRowColor{FootBackground}
    \mymulticolumn{1}{p{5.377cm}}{\bfseries\textcolor{white}{Cheat Sheet}} \\
    \vspace{-2pt}Published 22nd July, 2023.\\
    Updated 3rd November, 2022.\\
    Page {\thepage} of \pageref{LastPage}.
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
    \SetRowColor{FootBackground}
    \mymulticolumn{1}{p{5.377cm}}{\bfseries\textcolor{white}{Sponsor}} \\
    \SetRowColor{white}
    \vspace{-5pt}
    %\includegraphics[width=48px,height=48px]{dave.jpeg}
    Measure your website readability!\\
    www.readability-score.com
\end{tabulary}
\end{multicols}}

\begin{document}
\raggedright
\raggedcolumns

% Set font size to small. Switch to any value
% from this page to resize cheat sheet text:
% www.emerson.emory.edu/services/latex/latex_169.html
\footnotesize % Small font.

\begin{multicols*}{2}

% ---- Table: Cuda Kernels ----
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Cuda Kernels}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{A CUDA Kernel function is defined using the \_\_global\_\_ keyword.} \tn % Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{A Kernel is executed N times in parallel by N different threads on the device} \tn % Row Count 4 (+ 2)
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{Each thread has a unique ID stored in the built-in \emph{threadIdx} variable, a struct with components x,y,z.} \tn % Row Count 7 (+ 3)
% Row 3
\SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{Each thread block has a unique ID stored in the built-in \emph{blockIdx} variable, a struct with components x,y,z.} \tn % Row Count 10 (+ 3)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Table: Kernel Configuration ----
% Code is set with \texttt; the exported Markdown backticks would otherwise
% be typeset as opening quotation marks.
\begin{tabularx}{8.4cm}{x{3.44 cm} x{4.56 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bfseries\textcolor{white}{Kernel Configuration}} \tn
% Row 0
\SetRowColor{LightBackground}
Kernel Execution Configuration & \texttt{kernelFunction\textless{}\textless{}\textless{}num\_blocks, num\_threads\textgreater{}\textgreater{}\textgreater{}(params)} \tn % Row Count 3 (+ 3)
% Row 1
\SetRowColor{white}
num\_blocks & The number of thread blocks along each dimension of the grid.
% End of the "Kernel Configuration" table, then the thread-organisation and
% index-computation tables, then the first rows of "CUDA Events".
\tn % Row Count 6 (+ 3)
% Row 2
\SetRowColor{LightBackground}
num\_threads & The number of threads along each dimension of the thread block \tn % Row Count 9 (+ 3)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Table: CUDA Thread Organization ----
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{CUDA Thread Organization}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{Threads are grouped in blocks and can be organized in 1 to 3 dimensions.} \tn % Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{Blocks are grouped into grids which can be organized in 1 to 3 dimensions.} \tn % Row Count 4 (+ 2)
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{Blocks are executed independently.} \tn % Row Count 5 (+ 1)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Table: 1D Grid of 1D Blocks ----
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{1D Grid of 1D Blocks}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{p{8.4cm}}{\vspace{1px}\centerline{\includegraphics[width=5.1cm]{/web/www.cheatography.com/public/uploads/m-amendola_1667466962_Immagine 2022-11-03 101315.png}}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{\texttt{int index = blockIdx.x * blockDim.x + threadIdx.x;}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Table: 1D Grid of 3D Blocks ----
% The multiplication signs are written literally. The export had consumed
% pairs of `*' as emphasis markers, leaving the formula without operators.
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{1D Grid of 3D Blocks}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{p{8.4cm}}{\vspace{1px}\centerline{\includegraphics[width=5.1cm]{/web/www.cheatography.com/public/uploads/m-amendola_1667467096_2.png}}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{\texttt{int index = blockIdx.x * blockDim.x * blockDim.y * blockDim.z + threadIdx.z * blockDim.y * blockDim.x + threadIdx.y * blockDim.x + threadIdx.x;}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Table: 2D Grid of 2D Blocks applied on a Matrix ----
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{2D Grid of 2D Blocks applied on a Matrix}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{p{8.4cm}}{\vspace{1px}\centerline{\includegraphics[width=5.1cm]{/web/www.cheatography.com/public/uploads/m-amendola_1667468966_2D.png}}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{The index of each thread is identified by two coordinates i and j. \newline
We can find i applying the rule of 1D Grid of 1D Blocks over the x axis: \newline
\texttt{int i = blockIdx.x * blockDim.x + threadIdx.x;} \newline
And we can find j applying the rule of 1D Grid of 1D Blocks over the y axis: \newline
\texttt{int j = blockIdx.y * blockDim.y + threadIdx.y;} \newline
Thus, knowing that a row in the grid is \emph{gridDim.x times blockDim.x} threads wide, we can calculate the index: \newline
\texttt{int index = j * gridDim.x * blockDim.x + i;}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Table: CUDA Events ----
\begin{tabularx}{8.4cm}{x{3.84 cm} x{4.16 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bfseries\textcolor{white}{CUDA Events}} \tn
% Row 0
\SetRowColor{LightBackground}
Declaring a Cuda Event & \texttt{cudaEvent\_t event;} \tn % Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
Allocating the event & \texttt{\seqsplit{cudaEventCreate(\&event);}} \tn % Row Count 4 (+ 2)
% Row 2
\SetRowColor{LightBackground}
Recording the Event.
% Remaining rows of "CUDA Events", then the stream, synchronization and
% CUDA API tables, and the first sentence of "Memory Workflow".
& \texttt{\seqsplit{cudaEventRecord(event);}} \tn % Row Count 6 (+ 2)
% Row 3
\SetRowColor{white}
Synchronizing the event & \texttt{\seqsplit{cudaEventSynchronize(event);}} \tn % Row Count 8 (+ 2)
% Row 4
\SetRowColor{LightBackground}
Find elapsed time between two events & \texttt{\seqsplit{cudaEventElapsedTime(\&elapsed}, a, b);} \tn % Row Count 10 (+ 2)
% Row 5
\SetRowColor{white}
Free event variables & \texttt{\seqsplit{cudaEventDestroy(event);}} \tn % Row Count 12 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Table: CUDA Streams ----
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{CUDA Streams}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{GPU operations on CUDA use execution queues called streams.} \tn % Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{Operations pushed in a stream are executed according to a FIFO policy.} \tn % Row Count 4 (+ 2)
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{There is a default Stream, called \emph{stream 0}.} \tn % Row Count 5 (+ 1)
% Row 3
\SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{Operations pushed in a non-default stream will be executed after all operations on default stream are emptied.} \tn % Row Count 8 (+ 3)
% Row 4
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{Operations assigned to default stream introduce implicit synchronization barriers among other streams.} \tn % Row Count 11 (+ 3)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Table: CUDA Streams API ----
\begin{tabularx}{8.4cm}{x{4.48 cm} x{3.52 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bfseries\textcolor{white}{CUDA Streams API}} \tn
% Row 0
% cudaStreamCreate receives the address of the stream handle, written here
% the same way as cudaEventCreate(&event) in the events table.
\SetRowColor{LightBackground}
Create a stream & \texttt{\seqsplit{cudaStreamCreate(\&stream1);}} \tn % Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
Deallocate a stream & \texttt{\seqsplit{cudaStreamDestroy(stream)}} \tn % Row Count 4 (+ 2)
% Row 2
\SetRowColor{LightBackground}
Block host until all operations on a stream are completed. & \texttt{\seqsplit{cudaStreamSynchronize(stream);}} \tn % Row Count 7 (+ 3)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{We can use streams to obtain the concurrent execution of the same kernel or different kernels.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Table: Synchronization operations ----
\begin{tabularx}{8.4cm}{x{4.16 cm} x{3.84 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bfseries\textcolor{white}{Synchronization operations}} \tn
% Row 0
\SetRowColor{LightBackground}
Explicit Synchronization & Implicit Synchronization \tn % Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
\emph{cudaDeviceSynchronize()} blocks host code until all operations on device are completed & Operations assigned to default stream \tn % Row Count 7 (+ 5)
% Row 2
\SetRowColor{LightBackground}
\emph{cudaStreamWaitEvent(stream, event)} blocks all operations assigned to a stream until event is reached. & Memory Allocations on device \tn % Row Count 13 (+ 6)
% Row 3
\SetRowColor{white}
 & Settings operations on device \tn % Row Count 15 (+ 2)
% Row 4
\SetRowColor{LightBackground}
 & Page-locked memory allocations \tn % Row Count 17 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Table: CUDA API ----
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{CUDA API}} \tn
\SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{\seqsplit{https://docs.nvidia.com/cuda/cuda-runtime-api/index.html}% Row Count 2 (+ 2)
} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Table: Memory Workflow ----
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Memory Workflow}} \tn
\SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{First we allocate and ``build'' the input on the \textbf{host}.
% Rest of the "Memory Workflow" cell, then the memory-allocation and
% global-memory tables, and the first row of "Shared Memory".
\newline % Row Count 2 (+ 2)
Then we allocate dynamic memory on the \textbf{device}, obtaining pointers to the allocated memory areas. \newline % Row Count 5 (+ 3)
Finally, we \textbf{initialize} the memory on the device and we \textbf{copy} the memory from the host to the device. \newline % Row Count 8 (+ 3)
At the end of the computation, we may want to copy the memory from the device to the host.% Row Count 10 (+ 2)
} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{Copy operation is \emph{blocking}.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Table: Memory Allocation API Functions ----
% Pointer stars are literal `*' characters; the export had turned the pair
% in "void *dst, void *src" into an emphasis group.
\begin{tabularx}{8.4cm}{x{2.204 cm} x{2.66 cm} x{2.736 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{3}{x{8.4cm}}{\bfseries\textcolor{white}{Memory Allocation API Functions}} \tn
% Row 0
\SetRowColor{LightBackground}
Dynamic memory allocation & \texttt{cudaMalloc ((void **) \&u\_dev, \seqsplit{N*sizeof(double));}} & \emph{u\_dev} is the pointer to the allocated variable \tn % Row Count 4 (+ 4)
% Row 1
\SetRowColor{white}
Memory \seqsplit{Initialization} on device & \texttt{\seqsplit{cudaMemset(void} *devPtr, int val, size\_t count);} & \emph{devPtr} is a pointer to the device address space. The function fills the first \emph{count} bytes of the memory area with the constant byte value \emph{val}. \tn % Row Count 15 (+ 11)
% Row 2
\SetRowColor{LightBackground}
Copying data from host to device & \texttt{\seqsplit{cudaMemcpy(void} *dst, void *src, size\_t size, \seqsplit{cudaMemcpyHostToDevice);}} & \emph{dst} is the destination address, \emph{src} is the source address, size is the size in bytes of data to copy and the last parameter is the direction of the copy.
\tn % Row Count 27 (+ 12)
% Row 3
\SetRowColor{white}
Copying data from device to host & \texttt{\seqsplit{cudaMemcpy(void} *dst, void *src, size\_t size, \seqsplit{cudaMemcpyDeviceToHost);}} & \tn % Row Count 33 (+ 6)
\hhline{>{\arrayrulecolor{DarkBackground}}---}
\SetRowColor{LightBackground}
\mymulticolumn{3}{x{8.4cm}}{After 4.0, CUDA supports \textbf{Unified Virtual Addressing} meaning that the system itself knows where the buffer is allocated. The \emph{direction} parameter must be set to \textbf{cudaMemcpyDefault}.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}---}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Table: Global Memory ----
\begin{tabularx}{8.4cm}{x{4 cm} x{4 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bfseries\textcolor{white}{Global Memory}} \tn
% Row 0
\SetRowColor{LightBackground}
Declaring a static variable & \texttt{\_\_device\_\_ type variable\_name;} \tn % Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
Declaring a dynamic variable & \texttt{cudaMalloc((void **) \&ptr, size);} \tn % Row Count 4 (+ 2)
% Row 2
\SetRowColor{LightBackground}
Deallocating a dynamic variable & \texttt{cudaFree(ptr)} \tn % Row Count 6 (+ 2)
% Row 3
\SetRowColor{white}
Allocating an aligned 2D buffer where elements are padded so that each row is aligned & \texttt{\seqsplit{cudaMallocPitch(\&ptr}, \&pitch, width*sizeof(float), height)} \tn % Row Count 11 (+ 5)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
% The pitch is a byte count, so the row offset is applied to a char pointer
% before casting back. ASCII `*' replaces the Unicode asterisk operator
% (U+2217), which pdfLaTeX has no definition for.
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{cudaMallocPitch returns an integer pitch (in bytes) that can be used to access row elements with stride access. For example: \newline
\texttt{float *row = (float *)((char *)devPtr + r * pitch);}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Table: Shared Memory ----
\begin{tabularx}{8.4cm}{x{4.48 cm} x{3.52 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bfseries\textcolor{white}{Shared Memory}} \tn
% Row 0
\SetRowColor{LightBackground}
Static variable declaration inside the kernel.
% Remaining rows of "Shared Memory", then the constant-memory and
% texture-memory tables, and the first rows of "Asynchronous Data Transfers".
& \texttt{\_\_shared\_\_ type shmem{[}SIZE{]};} \tn % Row Count 3 (+ 3)
% Row 1
% Dynamically sized shared memory is declared as an unsized extern array.
\SetRowColor{white}
Dynamic variable allocation outside the kernel & \texttt{extern \_\_shared\_\_ type shmem{[}{]};} \tn % Row Count 6 (+ 3)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Table: Constant memory ----
\begin{tabularx}{8.4cm}{x{3.52 cm} x{4.48 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bfseries\textcolor{white}{Constant memory}} \tn
% Row 0
\SetRowColor{LightBackground}
Declaring a static variable & \texttt{\_\_constant\_\_ type variable\_name;} \tn % Row Count 2 (+ 2)
% Row 1
% The fourth argument of cudaMemcpyToSymbol is the byte offset into the
% symbol; the copy direction is the fifth.
\SetRowColor{white}
Copy memory from host to device. & \texttt{\seqsplit{cudaMemcpyToSymbol(variable\_name}, \&host\_src, sizeof(type), 0, \seqsplit{cudaMemcpyHostToDevice);}} \tn % Row Count 6 (+ 4)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{We cannot declare a dynamic variable on the constant memory} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Table: Texture Memory ----
\begin{tabularx}{8.4cm}{x{4 cm} x{4 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bfseries\textcolor{white}{Texture Memory}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{Managing texture memory} \tn % Row Count 1 (+ 1)
% Row 1
\SetRowColor{white}
Allocate global memory on device & \texttt{cudaMalloc(\&M, memsize)} \tn % Row Count 3 (+ 2)
% Row 2
\SetRowColor{LightBackground}
Create a texture reference. & \texttt{texture\textless{}datatype, dim\textgreater{} MtextureRef;} \tn % Row Count 5 (+ 2)
% Row 3
\SetRowColor{white}
Create a channel descriptor & \texttt{\seqsplit{cudaChannelFormatDesc} Mdesc = cudaCreateChannelDesc\textless{}datatype\textgreater{}();} \tn % Row Count 9 (+ 4)
% Row 4
\SetRowColor{LightBackground}
Bind the texture reference to memory. & \texttt{cudaBindTexture(0, MtextureRef, M, Mdesc)} \tn % Row Count 12 (+ 3)
% Row 5
\SetRowColor{white}
Unbind at the end.
& \texttt{\seqsplit{cudaUnbindTexture(MtextureRef);}} \tn % Row Count 14 (+ 2)
% Row 6
% The CUDA texture fetch functions are tex1Dfetch, tex2D and tex3D.
\SetRowColor{LightBackground}
In order to access the texture memory, we can use the texture reference \emph{MtextureRef}.* & \texttt{\seqsplit{tex1Dfetch(MtextureRef}, address);} \tn % Row Count 19 (+ 5)
% Row 7
\SetRowColor{white}
Accessing 2D cuda array. & \texttt{\seqsplit{tex2D(MtextureRef}, x, y);} \tn % Row Count 21 (+ 2)
% Row 8
\SetRowColor{LightBackground}
Accessing 3D cuda array. & \texttt{\seqsplit{tex3D(MtextureRef}, x, y, z);} \tn % Row Count 23 (+ 2)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Table: Asynchronous Data Transfers ----
\begin{tabularx}{8.4cm}{x{4 cm} x{4 cm} }
\SetRowColor{DarkBackground}
\mymulticolumn{2}{x{8.4cm}}{\bfseries\textcolor{white}{Asynchronous Data Transfers}} \tn
% Row 0
\SetRowColor{LightBackground}
Allocates page-locked memory on the host. & \texttt{\seqsplit{cudaMallocHost(\&buffer}, size)} \tn % Row Count 3 (+ 3)
% Row 1
\SetRowColor{white}
Frees page-locked memory. & \texttt{\seqsplit{cudaFreeHost(buffer)}} \tn % Row Count 5 (+ 2)
% Row 2
\SetRowColor{LightBackground}
Registers an existing host memory range for use by CUDA. & \texttt{\seqsplit{cudaHostRegister()}} \tn % Row Count 8 (+ 3)
% Row 3
\SetRowColor{white}
Unregisters a memory range that was registered with cudaHostRegister. & \texttt{\seqsplit{cudaHostUnregister()}} \tn % Row Count 12 (+ 4)
% Row 4
\SetRowColor{LightBackground}
Copies data between host and device.
% Last row of "Asynchronous Data Transfers", then the page-locked-memory and
% error-handling tables, and the end of the document.
% cudaMemcpyAsync takes one byte count, followed by direction and stream.
& \texttt{\seqsplit{cudaMemcpyAsync(dest\_buffer}, src\_buffer, size, direction, stream)} \tn % Row Count 17 (+ 5)
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\SetRowColor{LightBackground}
\mymulticolumn{2}{x{8.4cm}}{These operations must be queued into a non-default stream.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}--}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Table: Page-locked Memory ----
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Page-locked Memory}} \tn
\SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{\textbf{Pageable memory} is memory which is allowed to be paged in or paged out whereas \textbf{page-locked memory} is memory not allowed to be paged in or paged out. \newline % Row Count 4 (+ 4)
\emph{Page out} is moving data from RAM to HDD, while \emph{page in} means moving data from HDD to RAM. These operations occur when the main memory does not have enough free space.% Row Count 8 (+ 4)
} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{Source: \seqsplit{https://leimao.github.io/blog/Page-Locked-Host-Memory-Data-Transfer/}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Table: Error Handling ----
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Error Handling}} \tn
% Row 0
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{All CUDA API functions return an error code of type \emph{cudaError\_t}.} \tn % Row Count 2 (+ 2)
% Row 1
\SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{The constant \emph{cudaSuccess} means no error.} \tn % Row Count 3 (+ 1)
% Row 2
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{\emph{cudaGetLastError} returns the status of the internal error variable.
Calling this function resets the internal error to cudaSuccess.} \tn % Row Count 6 (+ 3)
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Table: Macro for Error Handling ----
% Each \newline ends one line of the displayed C macro.
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Macro for Error Handling}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{\#define CUDA\_CHECK(X) \{\textbackslash{} \newline
cudaError\_t \_m\_cudaStat = X;\textbackslash{} \newline
if(cudaSuccess != \_m\_cudaStat) \{\textbackslash{} \newline
fprintf(stderr,"\textbackslash{}nCUDA\_ERROR: \%s in file \%s line \%d\textbackslash{}n",\textbackslash{} \newline
cudaGetErrorString(\_m\_cudaStat), \_\_FILE\_\_, \_\_LINE\_\_);\textbackslash{} \newline
exit(1);\textbackslash{} \newline
\} \} \newline
... \newline
CUDA\_CHECK( cudaMemcpy(d\_buf, h\_buf, buffSize, \newline
cudaMemcpyHostToDevice) );} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% That's all folks
\end{multicols*}

\end{document}