\documentclass[10pt,a4paper]{article}

% Packages
\usepackage{fancyhdr}           % For header and footer
\usepackage{multicol}           % Allows multicols in tables
\usepackage{tabularx}           % Intelligent column widths
\usepackage{tabulary}           % Used in header and footer
\usepackage{hhline}             % Border under tables
\usepackage{graphicx}           % For images
\usepackage{xcolor}             % For hex colours
%\usepackage[utf8x]{inputenc}   % For unicode character support
\usepackage[T1]{fontenc}        % Without this we get weird character replacements
\usepackage{colortbl}           % For coloured tables
\usepackage{setspace}           % For line height
\usepackage{lastpage}           % Needed for total page number
\usepackage{seqsplit}           % Splits long words.
%\usepackage{opensans}          % Can't make this work so far. Shame. Would be lovely.
\usepackage[normalem]{ulem}     % For underlining links
% Most of the following are not required for the majority
% of cheat sheets but are needed for some symbol support.
\usepackage{amsmath}            % Symbols
\usepackage{MnSymbol}           % Symbols
\usepackage{wasysym}            % Symbols
%\usepackage[english,german,french,spanish,italian]{babel} % Languages

% Document Info
\author{Jia Yan}
\pdfinfo{
  /Title (snowflake-snowpro-core-certification.pdf)
  /Creator (Cheatography)
  /Author (Jia Yan)
  /Subject (Snowflake SnowPro Core Certification Cheat Sheet)
}

% Lengths and widths
\addtolength{\textwidth}{6cm}
\addtolength{\textheight}{-1cm}
\addtolength{\hoffset}{-3cm}
\addtolength{\voffset}{-2cm}
\setlength{\tabcolsep}{0.2cm}   % Space between columns
\setlength{\headsep}{-12pt}     % Reduce space between header and content
\setlength{\headheight}{85pt}   % If less, LaTeX automatically increases it
\renewcommand{\footrulewidth}{0pt}  % Remove footer line
\renewcommand{\headrulewidth}{0pt}  % Remove header line
\renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi}  % Hyphens in seqsplit

% These two commands together give roughly
% the right line height in the tables
\renewcommand{\arraystretch}{1.3}
\onehalfspacing

% Commands
% Row-colour helper: stores the colour name so \mymulticolumn can reuse it.
\newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}}  % Shortcut for row colour
\newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}}  % For coloured multi-cols
\newcolumntype{x}[1]{>{\raggedright}p{#1}}  % New column types for ragged-right paragraph columns
\newcommand{\tn}{\tabularnewline}  % Required as custom column type in use

% Font and Colours
\definecolor{HeadBackground}{HTML}{333333}
\definecolor{FootBackground}{HTML}{666666}
\definecolor{TextColor}{HTML}{333333}
\definecolor{DarkBackground}{HTML}{A3A3A3}
\definecolor{LightBackground}{HTML}{F3F3F3}
\renewcommand{\familydefault}{\sfdefault}
\color{TextColor}

% Header and Footer
\pagestyle{fancy}
\fancyhead{}  % Set header to blank
\fancyfoot{}  % Set footer to blank
\fancyhead[L]{
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{C}
    \SetRowColor{DarkBackground}
    \vspace{-7pt}
    {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent
        \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}}
    }
\end{tabulary}
\columnbreak
\begin{tabulary}{11cm}{L}
    \vspace{-2pt}\large{\bf{\textcolor{DarkBackground}{\textrm{Snowflake SnowPro Core Certification Cheat Sheet}}}} \\
    \normalsize{by \textcolor{DarkBackground}{Jia Yan} via \textcolor{DarkBackground}{\uline{cheatography.com/180109/cs/37482/}}}
\end{tabulary}
\end{multicols}}
\fancyfoot[L]{ \footnotesize
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{LL}
  \SetRowColor{FootBackground}
  \mymulticolumn{2}{p{5.377cm}}{\bf\textcolor{white}{Cheatographer}}  \\
  \vspace{-2pt}Jia Yan \\
  \uline{cheatography.com/jia-yan} \\
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Cheat Sheet}}  \\
  \vspace{-2pt}Not Yet Published.\\
  Updated 13th March, 2023.\\
  Page {\thepage} of \pageref{LastPage}.
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\bf\textcolor{white}{Sponsor}}  \\
  \SetRowColor{white}
  \vspace{-5pt}
  %\includegraphics[width=48px,height=48px]{dave.jpeg}
  Measure your website readability!\\
  www.readability-score.com
\end{tabulary}
\end{multicols}}

\begin{document}
\raggedright
\raggedcolumns

% Set font size to small. Switch to any value
% from this page to resize cheat sheet text:
% www.emerson.emory.edu/services/latex/latex_169.html
\footnotesize % Small font.

\begin{multicols*}{2}

\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Snowflake Database Architecture}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{p{8.4cm}}{\vspace{1px}\centerline{\includegraphics[width=5.1cm]{/web/www.cheatography.com/public/uploads/jia-yan_1677950117_Screenshot 2023-03-04 at 5.14.45 PM.png}}} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{1. {\bf{Service Layer}}: Accepts SQL requests from users, coordinates queries, manages transactions and results. It holds a cached copy of the results. \newline 2. {\bf{Compute Layer}}: Where the actual SQL is executed across the nodes of a VW. It holds a cache of data queried (referred to as Local Disk I/O; SSD storage in reality). All data held is temporary -- kept only while the VW is active. \newline 3.
{\bf{Storage Layer}}: Provides long term storage of results (Remote Disk - implemented on either Amazon S3 or Microsoft Blob storage)} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Virtual Warehouses}}  \tn
\SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{{\bf{Overview}} \newline % Row Count 1 (+ 1)
• Resources provisioned for executing SQL SELECT statements and DML operations (DELETE, INSERT, UPDATE, COPY INTO) \newline % Row Count 4 (+ 3)
• Can be started and stopped/resized at any time; running queries are not affected, only new queries \newline % Row Count 7 (+ 3)
• Two types: Standard and Snowpark-optimized \newline % Row Count 8 (+ 1)
{\bf{Credit Usage and Billing}} \newline % Row Count 9 (+ 1)
• Snowflake utilizes per-second billing (with a 60-second minimum each time the warehouse starts) so warehouses are billed only for the credits they actually consume \newline % Row Count 13 (+ 4)
• For a multi-cluster warehouse, the number of credits billed is calculated based on the warehouse size and the number of clusters that run within the time period \newline % Row Count 17 (+ 4)
{\bf{Data Loading}} \newline % Row Count 18 (+ 1)
• Data loading performance isn't necessarily improved by increasing the size of a warehouse. It is influenced more by the number of files being loaded (and the size of each file) than the size of the warehouse. \newline % Row Count 23 (+ 5)
• Warehouse size can impact the amount of time required to execute queries, especially for more complex queries. Larger warehouses have more compute resources available to process queries but are not necessarily faster for small, basic queries. \newline % Row Count 28 (+ 5)
{\bf{Query Processing and Concurrency}} \newline % Row Count 29 (+ 1)
• No.
of queries that a warehouse can concurrently process is determined by the size and complexity of each query \newline % Row Count 32 (+ 3) } \tn \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Virtual Warehouses (cont)}} \tn \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{• If queries are queuing more than desired, create another warehouse and redirect the queries to it manually. Resizing a warehouse can enable limited scaling for query concurrency but this is mainly intended for improving query performance. \newline % Row Count 5 (+ 5) {\bf{Multi-cluster warehouses}} \newline % Row Count 6 (+ 1) • Multi-cluster warehouses are recommended to enable fully automated scaling for concurrency \newline % Row Count 8 (+ 2) • Only available for Enterprise Edition (or higher) \newline % Row Count 10 (+ 2) • Up to 10 clusters \newline % Row Count 11 (+ 1) • Support all same properties and actions as single warehouse \newline % Row Count 13 (+ 2) {\emph{Mode:}} \newline % Row Count 14 (+ 1) • Maximized: same value for both max and min \# clusters \newline % Row Count 16 (+ 2) - effective for statically controlling the available compute resources \newline % Row Count 18 (+ 2) • Auto-scale: different values for max and min \# clusters \newline % Row Count 20 (+ 2) - dynamically manage the load based on scaling policy that determines when automatically starting or shutting down additional clusters. \newline % Row Count 23 (+ 3) \textgreater{} NOTE: Multi-cluster warehouses: best for scaling resources to improve concurrency for users/queries. Resizing the warehouse: best for improving the performance of slow-running queries or data loading \newline % Row Count 28 (+ 5) {\bf{Scale Up vs. Scale Out}} \newline % Row Count 29 (+ 1) • Scale up: resizing a warehouse. 
\newline % Row Count 30 (+ 1) } \tn \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Virtual Warehouses (cont)}} \tn \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{• Scale out: adding clusters to a multi-cluster warehouse (requires Snowflake Enterprise Edition or higher).% Row Count 3 (+ 3) } \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Warehouse Size}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{p{8.4cm}}{\vspace{1px}\centerline{\includegraphics[width=5.1cm]{/web/www.cheatography.com/public/uploads/jia-yan_1677930685_Screenshot 2023-03-04 at 11.49.13 AM.png}}} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{Larger warehouse sizes 5X-Large and 6X-Large are generally available in all Amazon Web Services (AWS) regions, and are in preview in the US Government regions (requires FIPS support on ARM) and Azure regions} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{x{0.936 cm} x{2.088 cm} x{2.088 cm} x{2.088 cm} } \SetRowColor{DarkBackground} \mymulticolumn{4}{x{8.4cm}}{\bf\textcolor{white}{Scaling Policy (Auto Mode)}} \tn % Row 0 \SetRowColor{LightBackground} {\bf{Policy}} & {\bf{Strategy}} & {\bf{Starts…}} & {\bf{Shuts Down..}} \tn % Row Count 2 (+ 2) % Row 1 \SetRowColor{white} \seqsplit{Standard} \seqsplit{(default)} & Favours starting additional clusters over conserving credits & First cluster will start \seqsplit{immediately} when one or more queries are detected in the queue. Each successive clusters waits to start 20 seconds after the prior one has started. 
& After 2-3 \seqsplit{consecutive} successful checks (performed at 1 minute intervals), which determine whether the load on the \seqsplit{least-loaded} cluster could be \seqsplit{redistributed} \tn % Row Count 18 (+ 16) % Row 2 \SetRowColor{LightBackground} \seqsplit{Economy} & Favours keeping running clusters \seqsplit{fully-loaded} rather than starting additional clusters & Only if the system estimates there's enough query load to keep the cluster busy for at least 6 minutes & After 5-6 \seqsplit{consecutive} successful checks (performed at 1 minute intervals), which determine whether the load on the \seqsplit{least-loaded} cluster could be \seqsplit{redistributed} \tn % Row Count 33 (+ 15) \hhline{>{\arrayrulecolor{DarkBackground}}----} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Snowpark-optimized Warehouses}} \tn \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{Provide 16x memory per node compared to a standard Snowflake VW \newline % Row Count 2 (+ 2) • Recommended for workloads that have large memory requirements: ML training use cases using a stored procedure on a single VW node \newline % Row Count 5 (+ 3) • Benefits Snowpark workloads utilizing UDF or UDTF \newline % Row Count 7 (+ 2) • Snowpark-optimized warehouses are not supported on X-Small or SMALL warehouse sizes \newline % Row Count 9 (+ 2) • Snowpark-optimized warehouses are available in all regions across AWS, Azure, and Google Cloud.% Row Count 11 (+ 2) } \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Caching}} \tn \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{1. {\bf{Result Cache (Service Layer)}}: Holds the results of every query executed in the past 24 hours. 
These are available across VWs, so query results returned to any user who executes the same query, provided the underlying data has not changed. \newline % Row Count 5 (+ 5) 2. {\bf{Local Disk Cache (VMs)}}: Used to cache data used by SQL queries. Whenever data is needed for a given query it's retrieved from the Remote Disk storage, and cached in SSD and memory. \newline % Row Count 9 (+ 4) 3. {\bf{Remote Disk (DB Storage)}}: Holds the long term storage. This level is responsible for data resilience. \newline % Row Count 12 (+ 3) {\bf{\textgreater{} NOTE: Results are retained for 24 hours, but the clock is reset every time the query is re-executed, up to a limit of 30 days, after which results query the remote disk.}} \newline % Row Count 16 (+ 4) {\emph{System Performance Tuning Best Practice}} \newline % Row Count 17 (+ 1) • {\bf{Auto-Suspend}}: Snowflake will auto-suspend a virtual warehouse with the SSD cache after 10 minutes of idle time by default. Best to leave this setting alone. \newline % Row Count 21 (+ 4) • {\bf{Scale up for large data volumes}}: Have a sequence of large queries + massive (multi-terabyte) size data volumes - scale up to improve query performance \newline % Row Count 25 (+ 4) • {\bf{Scale down - but not too soon}}: Tune the warehouse size dynamically (once large task has completed), but don't keep adjusting it, or you'll lose the benefit.% Row Count 29 (+ 4) } \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Databases, Tables \& Views}} \tn \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{{\bf{Micro-partitions}} \newline % Row Count 1 (+ 1) • All data in Snowflake tables is automatically divided into micro-partitions which contains between 50 MB and 500 MB of uncompressed data \newline % Row Count 4 (+ 3) • Tables are partitioned using the ordering of the data as it is inserted/loaded. 
\newline % Row Count 6 (+ 2)
{\emph{Benefits of Micro-partitions}} \newline % Row Count 7 (+ 1)
• Don't need to be explicitly defined beforehand/maintained by users (derived automatically) \newline % Row Count 9 (+ 2)
• Small in size which enables efficient DML and fine-grained pruning for faster queries \newline % Row Count 11 (+ 2)
• Can overlap in their range of values, which, combined with their uniformly small size, helps prevent skew. \newline % Row Count 14 (+ 3)
• Columns are stored independently (columnar storage) which allows efficient scanning of individual columns \newline % Row Count 17 (+ 3)
\textgreater{} NOTE: Snowflake does not prune micro-partitions based on a predicate with a subquery, even if the subquery results in a constant \newline % Row Count 20 (+ 3)
{\bf{Data Clustering}} \newline % Row Count 21 (+ 1)
• Clustering metadata that is recorded for each micro-partition created is then utilised to avoid unnecessary scanning of micro-partitions during querying \newline % Row Count 25 (+ 4)
{\emph{Actions performed for queries on the table}} \newline % Row Count 26 (+ 1)
1. First, prune micro-partitions that are not needed for the query. \newline % Row Count 28 (+ 2)
2. Then, prune by column within the remaining micro-partitions. \newline % Row Count 30 (+ 2)
} \tn
\end{tabularx}
\par\addvspace{1.3em}
\vfill
\columnbreak

\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Databases, Tables \& Views (cont)}} \tn
\SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{{\emph{Clustering Information Maintained}} \newline % Row Count 1 (+ 1)
• The total \# micro-partitions \newline % Row Count 2 (+ 1)
• \# micro-partitions containing values that overlap with each other (in a specified subset of table columns). \newline % Row Count 5 (+ 3)
• The depth of the overlapping micro-partitions.
\newline % Row Count 7 (+ 2) {\bf{Clustering Key}} \newline % Row Count 8 (+ 1) As DML occurs on large tables, the data might no longer cluster optimally. Snowflake designates one/more table columns as a clustering key to improve clustering of the underlying table micro-partitions automatically \newline % Row Count 13 (+ 5) {\emph{Use clustering key when:}} \newline % Row Count 14 (+ 1) • Require the fastest possible response times, regardless of cost. \newline % Row Count 16 (+ 2) • Improved query performance offsets the credits required to cluster and maintain the table. \newline % Row Count 18 (+ 2) • Queries on the table are running slower than expected \newline % Row Count 20 (+ 2) • The clustering depth for the table is large. \newline % Row Count 21 (+ 1) \textgreater{} NOTE: Clustering is generally most cost-effective for tables that are queried frequently and do not change frequently% Row Count 24 (+ 3) } \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Comparison of Table Types}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{p{8.4cm}}{\vspace{1px}\centerline{\includegraphics[width=5.1cm]{/web/www.cheatography.com/public/uploads/jia-yan_1678009135_Screenshot 2023-03-05 at 9.28.34 AM.png}}} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \SetRowColor{LightBackground} \mymulticolumn{1}{x{8.4cm}}{• After creation, transient/temporary tables cannot be converted to any other table type. \newline • The Fail-safe period is not configurable for any table type. 
\newline • Transient and temporary tables have no Fail-safe period (no additional data storage charges beyond the Time Travel retention period) \newline • Transient tables: a good option for managing the cost of very large transitory data but the data in these tables cannot be recovered after the Time Travel retention period passes.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Search Optimization Service}} \tn
\SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{• improve the performance of certain types of lookup and analytical queries that use an extensive set of predicates for filtering \newline % Row Count 3 (+ 3)
• Features that are available (Enterprise Ed./Higher) \newline % Row Count 5 (+ 2)
- Column configuration \newline % Row Count 6 (+ 1)
- Support for substring + REGEX searches \newline % Row Count 7 (+ 1)
- Support for fields in VARIANT, OBJECT and ARRAY columns \newline % Row Count 9 (+ 2)
- Support for geospatial functions with GEOGRAPHY objects \newline % Row Count 11 (+ 2)
- Selective point lookup queries on tables (returns only small \# of distinct rows) \newline % Row Count 13 (+ 2)
• One of the ways to optimize query performance (others: clustering a table \& create materialised views - cluster/unclustered) \newline % Row Count 16 (+ 3)
\textgreater{}\textgreater{} {\bf{Works best to improve the performance of a query when the table is frequently queried on columns other than the primary cluster key}} \newline % Row Count 19 (+ 3)
{\emph{Queries that benefit from SO}} \newline % Row Count 20 (+ 1)
• runs for a few seconds or longer \newline % Row Count 21 (+ 1)
• at least one of the columns accessed through query filter operation has at least 100k distinct values \newline % Row Count 24 (+ 3)
• Equality or IN predicates; Substring \& REGEX, etc.
\newline % Row Count 26 (+ 2) {\emph{Not supported by SO}} \newline % Row Count 27 (+ 1) • External tables, materialised views, columns defined with a collate clause, col concatenation, analytical expressions, cast on table columns \newline % Row Count 30 (+ 3) } \tn \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Search Optimization Service (cont)}} \tn \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{{\emph{Costs of SO service}} \newline % Row Count 1 (+ 1) • The service creates a search access path data structure that requires space \newline % Row Count 3 (+ 2) - storage cost depends on: the \# of distinct values in the table (size is approx. 1/4 of the original table's size); Worst case scenario = same size as original table \newline % Row Count 7 (+ 4) • Adding the service + maintaining it consumes additional resources \newline % Row Count 9 (+ 2) - higher cost when there is high churn (large values of data in the table change) - proportional to the amount of data ingested \newline % Row Count 12 (+ 3) \textgreater{}\textgreater{} {\bf{The costs are proportional to the \# of tables on which SO is enabled, the \# of distinct values in those tables; the amount of data that changes in these tables}}% Row Count 16 (+ 4) } \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Options for Optimising Query Performance}} \tn \SetRowColor{LightBackground} \mymulticolumn{1}{p{8.4cm}}{\vspace{1px}\centerline{\includegraphics[width=5.1cm]{/web/www.cheatography.com/public/uploads/jia-yan_1678647207_Screenshot 2023-03-12 at 6.33.13 PM.png}}} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} 
\mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Materialised Views}} \tn
\SetRowColor{white}
\mymulticolumn{1}{x{8.4cm}}{• Like a view that is frozen in place but when changes are detected, Snowflake will refresh it automatically \newline % Row Count 3 (+ 3)
• It's designed to improve query performance for workloads composed of common \& repeated query patterns. \newline % Row Count 6 (+ 3)
• It incurs additional costs. \newline % Row Count 7 (+ 1)
{\emph{When to use it?}} \newline % Row Count 8 (+ 1)
• Query results contain a small \# rows and/or columns relative to the base table \newline % Row Count 10 (+ 2)
• Query results contain results that require significant processing \newline % Row Count 12 (+ 2)
• Query is on an external table \newline % Row Count 13 (+ 1)
• The view's base table does not change frequently \newline % Row Count 15 (+ 2)
{\emph{Advantages of Materialised Views}} \newline % Row Count 16 (+ 1)
• improve performance of queries that use the same subquery results repeatedly \newline % Row Count 18 (+ 2)
• Background service updates the materialised view automatically after changes are made to the base table \newline % Row Count 21 (+ 3)
• Data accessed through materialised views is always current \newline % Row Count 23 (+ 2)
\textgreater{}\textgreater{} {\bf{We can't put a materialised view DIRECTLY on top of staged data. But, if we put an External Table in between them, we CAN put a Materialised view over staged data}} \newline % Row Count 27 (+ 4)
\textgreater{}\textgreater{} {\bf{INFORMATION\_SCHEMA.VIEWS does not show materialized views.
Materialized views are shown by INFORMATION\_SCHEMA.TABLES}}% Row Count 30 (+ 3) } \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Loading Data}} \tn \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{{\bf{Best Practices in File Sizing}} \newline % Row Count 1 (+ 1) • No. of load operations that run in parallel {\bf{shouldn't be more}} than no. of data files to be loaded. Aim to have data files {\bf{\textasciitilde{}100-250 MB (or larger)}} in size compressed to optimise loading - same applies to Snowpipe \newline % Row Count 6 (+ 5) • Loading very large files (e.g. 100 GB or larger) is not recommended. Use ON\_ERROR copy option value should you need to do so. \newline % Row Count 9 (+ 3) • Aborting could cause delays \& credits wastage. Risking no portion of the file to be committed if data loading takes more than 24 hours. \newline % Row Count 12 (+ 3) • Smaller files, smaller processing overhead. Split large files by line to avoid records that span chunks \newline % Row Count 15 (+ 3) • VARIANT data type (Semi-structured data) has a 16 MB size limit on individual rows \newline % Row Count 17 (+ 2) • Snowpipe cost involves an overhead to manage files in the load queue (staging data) and resource consumption (loading data). \newline % Row Count 20 (+ 3) • Creating a new data file once per minute as best practice as it provides good balance between cost (i.e. resources spent on Snowpipe queue management and the actual load) and performance (i.e. 
load latency) \newline % Row Count 25 (+ 5) {\bf{Best Practices in File Formats}} \newline % Row Count 26 (+ 1) {\emph{Delimited Text Files }} \newline % Row Count 27 (+ 1) • UTF-8 is the default character set \newline % Row Count 28 (+ 1) • Fields that contain delimiter characters/carriage returns should be enclosed in quotes \newline % Row Count 30 (+ 2) } \tn \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Loading Data (cont)}} \tn \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{• Consistent number of columns in each row \newline % Row Count 1 (+ 1) {\emph{Semi-structured Data Files}} \newline % Row Count 2 (+ 1) • Extract semi-structured data elements containing "null" values into relational columns before loading them, else set STRIP\_NULL\_VALUES to TRUE if the "null" indicates only missing values \newline % Row Count 6 (+ 4) • Ensure each unique element stores values of a single native data type (string or number) \newline % Row Count 8 (+ 2) {\emph{Numeric Data Files}} \newline % Row Count 9 (+ 1) • Avoid embedded characters, e.g. commas \newline % Row Count 10 (+ 1) • Number that includes a fractional component should be separated from the whole number portion by a decimal point \newline % Row Count 13 (+ 3) \textgreater{} NOTE: Snowflake checks temporal data values at load time. Invalid date, time, and timestamp values (e.g. 
0000-00-00) produce an error \newline % Row Count 16 (+ 3) {\bf{Best Practices in organising data by path}} \newline % Row Count 17 (+ 1) • Both internal and external stage references can include a path (or prefix in AWS terminology) \newline % Row Count 19 (+ 2) • It's recommended to partition the data into logical paths that include identifying details such as geographical location or other source identifiers \newline % Row Count 23 (+ 4) • Narrow the path to the most granular level that includes your data for improved data load performance.% Row Count 26 (+ 3) } \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Bulk Loading}} \tn \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{{\bf{Bulk Loading from a Local File System}} \newline % Row Count 1 (+ 1) COPY command is used to bulk load data from a local file system into tables using an internal (i.e. Snowflake-managed) stage using two steps: \newline % Row Count 4 (+ 3) 1. Upload (i.e. stage) one or more data files to a Snowflake stage (named internal stage or table/user stage) using the PUT command. \newline % Row Count 7 (+ 3) 2. Use the COPY INTO command to load the contents into table. \newline % Row Count 9 (+ 2) {\bf{NOTE: Virtual Warehouse must be running to execute this step as it provides the compute resources to perform the actual insertion of rows into the table}} \newline % Row Count 13 (+ 4) By default, each user and table in Snowflake is automatically allocated an internal stage for staging data files to be loaded. In addition, you can create named internal stages \newline % Row Count 17 (+ 4) 3 types of internal stages are supported: User, Table and Named \newline % Row Count 19 (+ 2) • User Stage \newline % Row Count 20 (+ 1) - reference using @\textasciitilde{} \newline % Row Count 21 (+ 1) - cannot be altered or dropped. 
\newline % Row Count 22 (+ 1) - do not support setting file format options, it must be specified as part of the COPY INTO command. \newline % Row Count 25 (+ 3) {\bf{NOT suitable}} when: \newline % Row Count 26 (+ 1) 1. Multiple users require access to the files. \newline % Row Count 27 (+ 1) 2. The current user does not have INSERT privileges on the tables the data will be loaded into. \newline % Row Count 29 (+ 2) • Table Stage \newline % Row Count 30 (+ 1) } \tn \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Bulk Loading (cont)}} \tn \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{- same name as the table; reference @\%tablename \newline % Row Count 1 (+ 1) - cannot be altered or dropped \newline % Row Count 2 (+ 1) - do not support transforming data while loading it \newline % Row Count 4 (+ 2) - must be the table owner (have the role with the OWNERSHIP privilege on the table) \newline % Row Count 6 (+ 2) {\bf{NOT suitable}} when: \newline % Row Count 7 (+ 1) 1. you need to copy the data in the files into multiple tables \newline % Row Count 9 (+ 2) • Named Stage \newline % Row Count 10 (+ 1) - provide the greatest degree of flexibility for data loading \newline % Row Count 12 (+ 2) - users with the appropriate privileges on the stage can load data into any table \newline % Row Count 14 (+ 2) - the privileges can be granted or revoked from roles. In addition, ownership of the stage can be transferred to another role. 
\newline % Row Count 17 (+ 3) - recommended when you plan regular data loads that could involve multiple users and/or tables.% Row Count 19 (+ 2) } \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Snowpipe}} \tn \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{Loads data from files according to the COPY statement defined in a referenced pipe (a named, first-class Snowflake object that contains a COPY statement) as soon as they are available in a stage. \newline % Row Count 4 (+ 4) • All data types are supported, including semi-structured data types such as JSON and Avro \newline % Row Count 6 (+ 2) • 2 mechanisms for detecting the staged files: \newline % Row Count 7 (+ 1) - Automating Snowpipe using cloud messaging \newline % Row Count 8 (+ 1) - Calling Snowpipe REST endpoints \newline % Row Count 9 (+ 1) • Generally loads older files first, but there is no guarantee that files are loaded in the same order they are staged \newline % Row Count 12 (+ 3) • Uses file loading metadata associated with each pipe object to prevent reloading the same files \newline % Row Count 14 (+ 2) • Difficult to estimate latency as many factors can affect Snowpipe loads, e.g. 
File formats and sizes, and the complexity of COPY statements (including SELECT statement used for transformations) can all impact the amount of time required \newline % Row Count 19 (+ 5) • Charges are calculated by per second/per core granularity \newline % Row Count 21 (+ 2) • Check usages/charges in 2 ways: \newline % Row Count 22 (+ 1) - Account -\textgreater{} Usage/Bill \newline % Row Count 23 (+ 1) - SQL -\textgreater{} Information Schema -\textgreater{} PIPE\_USAGE\_HISTORY% Row Count 25 (+ 2) } \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{x{1.368 cm} x{3.116 cm} x{3.116 cm} } \SetRowColor{DarkBackground} \mymulticolumn{3}{x{8.4cm}}{\bf\textcolor{white}{Bulk vs. Continuous Loading}} \tn % Row 0 \SetRowColor{LightBackground} {\bf{Areas}} & {\bf{Bulk Data Loading}} & {\bf{Snowpipe}} \tn % Row Count 2 (+ 2) % Row 1 \SetRowColor{white} \seqsplit{Mechanism} & load batches of data in cloud storage/copy data files from local using COPY command & load small volume of data within minutes after files are staged to ensure near real time availability \tn % Row Count 9 (+ 7) % Row 2 \SetRowColor{LightBackground} \seqsplit{Compute} \seqsplit{Resources} & Requires a user-specified warehouse to execute COPY statements & \seqsplit{Snowflake-provided} resources (i.e. a serverless compute model) \tn % Row Count 13 (+ 4) % Row 3 \SetRowColor{white} \seqsplit{Authentication} & Security options supported by the client for authenticating and initiating a user session & When calling the REST endpoints: Requires key pair authentication with JSON Web Token (JWT) \tn % Row Count 19 (+ 6) % Row 4 \SetRowColor{LightBackground} Load \seqsplit{History} & Stored in the metadata of the target table for 64 days. Available upon completion of the COPY statement as the statement output & Stored in the metadata of the pipe for 14 days. 
Must be requested from Snowflake via a REST endpoint, SQL table function, or ACCOUNT\_USAGE view \tn % Row Count 28 (+ 9)
% Row 5
\SetRowColor{white} \seqsplit{Transactions} & Always in a single transaction. Data is inserted into the table alongside any other SQL statements submitted manually by users & Combined or split into a single or multiple transactions based on the number and size of the rows in each data file. Rows of partially loaded files (based on the ON\_ERROR copy option setting) can also be combined or split into one or more transactions \tn % Row Count 44 (+ 16)
\end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{8.4cm}{x{1.368 cm} x{3.116 cm} x{3.116 cm} } \SetRowColor{DarkBackground} \mymulticolumn{3}{x{8.4cm}}{\bf\textcolor{white}{Bulk vs. Continuous Loading (cont)}} \tn % Row 6
\SetRowColor{LightBackground} Cost & The amount of time each virtual warehouse is active & According to the compute resources used in the Snowpipe warehouse while loading the files \tn % Row Count 6 (+ 6)
\hhline{>{\arrayrulecolor{DarkBackground}}---} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Semi-structured Data}} \tn \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{Data that does not conform to the standards of traditional structured data and has two main characteristics: nested data structures and lack of a fixed schema.
\newline % Row Count 4 (+ 4) • Snowflake can import semi-structured data from JSON, Avro, ORC, Parquet, and XML formats and store it in data types such as VARIANT, ARRAY or OBJECT \newline % Row Count 8 (+ 4) - A VARIANT can contain any other data type, including an ARRAY or an OBJECT (used to build and store hierarchical data) \newline % Row Count 11 (+ 3) - An ARRAY or OBJECT can directly contain VARIANT (thus can indirectly contain any other data type, including itself) \newline % Row Count 14 (+ 3) \textgreater{} {\bf{NOTE: a Snowflake OBJECT corresponds to a "dictionary" or a "map". A Snowflake object is not an "object" in the sense of OOP. Curly braces indicate an OBJECT, which contains key-value pairs}} \newline % Row Count 19 (+ 5) • Semi-structured data can be stored in a single column or split into multiple columns. \newline % Row Count 21 (+ 2) {\emph{Loading Semi-structured Data}} \newline % Row Count 22 (+ 1) • Can explicitly specify all, some, or none of the structure when you load and store semi-structured data \newline % Row Count 25 (+ 3) • If your data is a set of key-value pairs, you can load it into a column of type OBJECT. \newline % Row Count 27 (+ 2) • If your data is an array, you can load it into a column of type ARRAY. \newline % Row Count 29 (+ 2) • If the data is complex or an {\bf{individual value requires more than about 16MB of storage space}}, then you can, for instance, split the data into multiple columns, and some of those columns can contain an explicitly-specified hierarchy of data types. 
\newline % Row Count 35 (+ 6)
} \tn \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Semi-structured Data (cont)}} \tn \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{\textgreater{} NOTE: When we split the data across multiple columns, we may use the detect-and-retrieve feature but it is currently limited to Apache Parquet, Apache Avro, and ORC files. \newline % Row Count 4 (+ 4)
{\emph{Querying Semi-structured Data}} \newline % Row Count 5 (+ 1)
• Operations that are supported: \newline % Row Count 6 (+ 1)
- Accessing an element in an array. \newline % Row Count 7 (+ 1)
- Retrieving a specified value from a key-value pair in an OBJECT. \newline % Row Count 9 (+ 2)
- Traversing the levels of a hierarchy stored in a VARIANT. \newline % Row Count 11 (+ 2)
• The query output is enclosed in double quotes because the query output is VARIANT, not VARCHAR. (Operators : and subsequent . and {[}{]} always return VARIANT values containing strings.) \newline % Row Count 15 (+ 4)
• Flatten the array to retrieve all instances of a child element in a repeating array. \newline % Row Count 17 (+ 2)
• Use the GET/GET\_PATH function to extract a value from a VARIANT column% Row Count 19 (+ 2)
} \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Security Principles}} \tn \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{{\bf{Federated Authentication}} \newline % Row Count 1 (+ 1)
User authentication is separated from user access through the use of one or more external entities that provide independent authentication of user credentials (e.g.
Single Sign-On (SSO)) \newline % Row Count 5 (+ 4)
• consists of 2 components: Service provider (SP) and Identity provider (IdP) \newline % Row Count 7 (+ 2)
- Snowflake serves as the SP; IdP provides these services to Snowflake: Creating and maintaining user credentials and other profile information + Authenticating users for SSO access to the SP. \newline % Row Count 11 (+ 4)
• 2 types of IdP: besides the native Snowflake support provided by Okta and ADFS, Snowflake supports using most SAML 2.0-compliant vendors as an IdP: Google G Suite, Microsoft Azure Active Directory, OneLogin \& Ping Identity PingOne \newline % Row Count 16 (+ 5)
\textgreater{}\textgreater{} For a web-based IdP (e.g. Okta), closing the browser tab/window does not necessarily end the IdP session. If a user's IdP session is still active, they can still access Snowflake until the IdP session times out. \newline % Row Count 21 (+ 5)
{\bf{Multi-Factor Authentication (MFA)}} \newline % Row Count 22 (+ 1)
• MFA is enabled on a per-user basis; to use MFA, users must enroll themselves \newline % Row Count 24 (+ 2)
• It is recommended that all users with the ACCOUNTADMIN role be required to use MFA. \newline % Row Count 26 (+ 2)
• The Duo Push authentication mechanism is used when a user is enrolled in MFA \newline % Row Count 28 (+ 2)
\textgreater{}\textgreater{} MFA token caching can be combined with connection caching in federated single sign-on: ensure that the ALLOW\_ID\_TOKEN parameter is set to true in tandem with the \seqsplit{ALLOW\_CLIENT\_MFA\_CACHING} parameter. \newline % Row Count 33 (+ 5)
} \tn \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Security Principles (cont)}} \tn \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{{\bf{Network Policy}} \newline % Row Count 1 (+ 1)
• Allows restricting access to your account based on user IP address.
It enables you to create an IP allowed list, as well as an IP blocked list, if desired. \newline % Row Count 5 (+ 4) • A network policy is not enabled until it is activated at the account or individual user level. \newline % Row Count 7 (+ 2) \textgreater{}\textgreater{} Only security administrators (i.e. users with the SECURITYADMIN role) or higher or a role with the global CREATE NETWORK POLICY privilege can create network policies. Ownership of a network policy can be transferred to another role. \newline % Row Count 12 (+ 5) • If a network policy is activated for an individual user, the user-level network policy takes precedence. \newline % Row Count 15 (+ 3) {\bf{Snowflake Session}} \newline % Row Count 16 (+ 1) • A session is independent of an identity provider (i.e. IdP) session \newline % Row Count 18 (+ 2) • After the idle session timeout (with a max 4 hours of inactivity), the user must authenticate to Snowflake again. \newline % Row Count 21 (+ 3) • A session policy can modify the idle session timeout period \newline % Row Count 23 (+ 2) • Unset the session policy first and then set the new session policy to replace a session policy that is already set for an account or user \newline % Row Count 26 (+ 3) • The classic web interface tracks user activity and sends a heartbeat to an internal Snowflake monitor. This heartbeat recording is sent every three minutes and is not configurable. \newline % Row Count 30 (+ 4) } \tn \end{tabularx} \par\addvspace{1.3em} \vfill \columnbreak \begin{tabularx}{8.4cm}{X} \SetRowColor{DarkBackground} \mymulticolumn{1}{x{8.4cm}}{\bf\textcolor{white}{Security Principles (cont)}} \tn \SetRowColor{white} \mymulticolumn{1}{x{8.4cm}}{{\bf{Access Control}} \newline % Row Count 1 (+ 1) • Access control privileges determine who can access and perform operations on specific objects in Snowflake \newline % Row Count 4 (+ 3) • Discretionary Access Control (DAC): Each object has an owner -\textgreater{} grant access to that object. 
\newline % Row Count 6 (+ 2) • Role-based Access Control (RBAC): Access privileges are assigned to roles, which are in turn assigned to users. \newline % Row Count 9 (+ 3) \textgreater{}\textgreater{} A role owner (i.e. the role that has the OWNERSHIP privilege on the role) does not inherit the privileges of the owned role. Privilege inheritance is only possible within a role hierarchy. \newline % Row Count 13 (+ 4) • System-defined roles are created with privileges related to account-management. As a best practice, it is not recommended to mix account-management privileges and entity-specific privileges in the same role.% Row Count 18 (+ 5) } \tn \hhline{>{\arrayrulecolor{DarkBackground}}-} \end{tabularx} \par\addvspace{1.3em} % That's all folks \end{multicols*} \end{document}