\chapter{Comparison of Architecture Features}

\label{sec:architectures-comparison}

\begin{longtable}{ | m{3cm} | m{3mm} | m{3mm} | m{3mm} | m{3mm} | m{3mm} | m{5mm} | m{3mm} | m{3mm} | m{3mm} | m{3mm} | m{3mm} | m{3mm} |}

\hline

SM Generation & \multicolumn{2}{c|}{Fermi} & \multicolumn{4}{c|}{Kepler} & \multicolumn{3}{c|}{Maxwell} & \multicolumn{3}{c|}{Pascal}\\

\cline{1-13}

SM Architecture & 2.0 & 2.1 & 3.0 & 3.2 & 3.5 & 3.7 & 5.0 & 5.2 & 5.3 & 6.0 & 6.1 & 6.2 \\\cline{1-13}

Maximum number of resident grids per device (concurrent kernel execution) & \multicolumn{3}{c|}{16}

& 4 & \multicolumn{4}{c|}{32} & 16 & \multicolumn{1}{c|}{128} & 32 & 16 \\\cline{1-13}

Maximum dimensionality of grid of thread blocks & \multicolumn{12}{c|}{3} \\\cline{1-13}

Maximum $x$ dimension of a grid of thread blocks & \multicolumn{2}{c|}{$65535$} & \multicolumn{10}{c|}{$2^{31}-1$} \\\cline{1-13}

Maximum $y$ or $z$ dimension of a grid of thread blocks & \multicolumn{12}{c|}{65535} \\\cline{1-13}

Maximum dimensionality of thread block & \multicolumn{12}{c|}{3} \\\cline{1-13}

Maximum $x$ or $y$ dimension of a block & \multicolumn{12}{c|}{1024} \\\cline{1-13}

Maximum $z$ dimension of a block & \multicolumn{12}{c|}{64} \\\cline{1-13}

Maximum number of threads per block & \multicolumn{12}{c|}{1024} \\\cline{1-13}

Warp size & \multicolumn{12}{c|}{32} \\\cline{1-13}

Maximum number of resident blocks per multiprocessor & \multicolumn{2}{c|}{8} & \multicolumn{4}{c|}{16} & \multicolumn{6}{c|}{32} \\\cline{1-13}

Maximum number of resident warps per multiprocessor & \multicolumn{2}{c|}{$48$} & \multicolumn{10}{c|}{$64$} \\\cline{1-13}

Maximum number of resident threads per multiprocessor & \multicolumn{2}{c|}{$1536$} & \multicolumn{10}{c|}{$2048$} \\\cline{1-13}

Number of 32-bit registers per multiprocessor & \multicolumn{2}{c|}{32K} & \multicolumn{3}{c|}{64K} & \multicolumn{1}{c|}{128K} & \multicolumn{6}{c|}{64K} \\\cline{1-13}

Maximum number of 32-bit registers per thread block & \multicolumn{2}{c|}{32K} & \multicolumn{1}{c|}{64K} & \multicolumn{1}{c|}{32K} & \multicolumn{4}{c|}{64K} & \multicolumn{1}{c|}{32K} & \multicolumn{2}{c|}{64K} & \multicolumn{1}{c|}{32K} \\\cline{1-13}

Maximum number of 32-bit registers per thread & \multicolumn{3}{c|}{63} & \multicolumn{9}{c|}{255} \\\cline{1-13}

Maximum amount of shared memory per multiprocessor & \multicolumn{5}{c|}{48K} & \multicolumn{1}{c|}{112K} & \multicolumn{1}{c|}{64K} & \multicolumn{1}{c|}{96K} & \multicolumn{2}{c|}{64K} & \multicolumn{1}{c|}{96K} & \multicolumn{1}{c|}{64K} \\\cline{1-13}

Maximum amount of shared memory per thread block & \multicolumn{12}{c|}{48K} \\\cline{1-13}

Number of shared memory banks & \multicolumn{12}{c|}{32} \\\cline{1-13}

Amount of local memory per thread & \multicolumn{12}{c|}{512K} \\\cline{1-13}

Constant memory size & \multicolumn{12}{c|}{64K} \\\cline{1-13}

Cache working set per multiprocessor for constant memory & \multicolumn{9}{c|}{8K} & \multicolumn{1}{c|}{4K} & \multicolumn{2}{c|}{8K} \\\cline{1-13}

Maximum number of instructions per kernel & \multicolumn{12}{c|}{512 million} \\\cline{1-13}

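% NOTE(review): the cite key below contains escaped underscores (\_); confirm it matches the .bib entry key exactly.
\caption{Technical specification parameters of NVIDIA GPUs, summarized from~\cite[pp.~218--220]{NVIDIA\_Programming\_guide}.}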

\label{tab:technical-comparison}

\end{longtable}

\begin{longtable}{ | m{7.2cm} | m{3mm} | m{3mm} | m{4mm} | m{3mm} | m{5mm} | m{3mm} | m{3mm} | m{3mm} | m{13mm} |}

\hline

SM Generation & \multicolumn{2}{c|}{Fermi} & \multicolumn{3}{c|}{Kepler} & \multicolumn{2}{c|}{Maxwell} & \multicolumn{2}{c|}{Pascal} \\\cline{1-10}

SM Architecture & 2.0 & 2.1 & 3.0 & 3.5 & 3.7 & 5.0 & 5.2 & 6.0 & 6.1, 6.2 \\\cline{1-10}

Number of ALU lanes for integer and single-precision floating-point arithmetic operations & 32 & 48 & \multicolumn{3}{c|}{$192$} & \multicolumn{2}{c|}{$128$} & 64 & \multicolumn{1}{c|}{$128$} \\\cline{1-10}

Number of special function units for single-precision floating-point transcendental functions & 4 & 8 & \multicolumn{5}{c|}{$32$} & 16 & \multicolumn{1}{c|}{$32$} \\\cline{1-10}

Number of warp schedulers & \multicolumn{2}{c|}{2} & \multicolumn{5}{c|}{4} & \multicolumn{1}{c|}{2} & \multicolumn{1}{c|}{4} \\\cline{1-10}

Maximum number of instructions issued at once by a single scheduler & \multicolumn{1}{c|}{1} & \multicolumn{4}{c|}{2} & \multicolumn{4}{c|}{1} \\\cline{1-10}

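% NOTE(review): the cite key below contains escaped underscores (\_); confirm it matches the .bib entry key exactly.
\caption{Architecture specification parameters of NVIDIA GPUs, summarized from~\cite[pp.~83--84]{NVIDIA\_Programming\_guide}.}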

\label{tab:architectures-comparison}

\end{longtable}

% \cref{tab:technical-comparison,tab:architectures-comparison}