\documentclass[10pt,a4paper]{article}

% Packages
\usepackage{fancyhdr}           % For header and footer
\usepackage{multicol}           % Allows multicols in tables
\usepackage{tabularx}           % Intelligent column widths
\usepackage{tabulary}           % Used in header and footer
\usepackage{hhline}             % Border under tables
\usepackage{graphicx}           % For images
\usepackage{xcolor}             % For hex colours
%\usepackage[utf8x]{inputenc}    % For unicode character support
\usepackage[T1]{fontenc}        % Without this we get weird character replacements
\usepackage{colortbl}           % For coloured tables
\usepackage{setspace}           % For line height
\usepackage{lastpage}           % Needed for total page number
\usepackage{seqsplit}           % Splits long words.
%\usepackage{opensans}          % Can't make this work so far. Shame. Would be lovely.
\usepackage[normalem]{ulem}     % For underlining links
% Most of the following are not required for the majority
% of cheat sheets but are needed for some symbol support.
\usepackage{amsmath}            % Symbols
\usepackage{MnSymbol}           % Symbols
\usepackage{wasysym}            % Symbols
%\usepackage[english,german,french,spanish,italian]{babel}              % Languages

% Document Info
% \author only stores the author name; no \maketitle appears in this file,
% so the name is not typeset from here.
\author{melissamlwong}
% \pdfinfo is a pdfTeX primitive: the key/value pairs below are written to the
% PDF document-information dictionary (shown by viewers as file properties).
% NOTE(review): /Title holds the output file name rather than the
% human-readable sheet title (which is in /Subject) -- confirm this is intended.
\pdfinfo{
  /Title (awk-one-liners-for-fasta-manipulation-version-1-0.pdf)
  /Creator (Cheatography)
  /Author (melissamlwong)
  /Subject (Awk one-liners for FASTA manipulation version 1.0 Cheat Sheet)
}

% Lengths and widths
% The text block is widened by 6cm and shifted 3cm to the left, so the extra
% width is taken equally from both side margins. Vertically, the text block is
% shortened by 1cm and the whole page content is moved up by 2cm.
\addtolength{\textwidth}{6cm}
\addtolength{\textheight}{-1cm}
\addtolength{\hoffset}{-3cm}
\addtolength{\voffset}{-2cm}
\setlength{\tabcolsep}{0.2cm} % Space between columns
\setlength{\headsep}{-12pt} % Reduce space between header and content
\setlength{\headheight}{85pt} % If less, LaTeX automatically increases it
\renewcommand{\footrulewidth}{0pt} % Remove footer line
\renewcommand{\headrulewidth}{0pt} % Remove header line
% seqsplit calls \seqinsert between characters: a plain break opportunity in
% math mode, a discretionary hyphen (\-) in text mode.
\renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit
% These two commands together give roughly
% the right line height in the tables
\renewcommand{\arraystretch}{1.3}
\onehalfspacing

% Commands
%
% \SetRowColor{<colour>}
%   Remembers <colour> globally in \RowColorName and colours the current table
%   row with it. Must be issued at the start of a row (it uses \noalign).
\newcommand{\SetRowColor}[1]{%
  \noalign{\gdef\RowColorName{#1}}%
  \rowcolor{\RowColorName}%
}
% \mymulticolumn{<span>}{<column spec>}{<content>}
%   A \multicolumn whose cell background is the colour most recently stored
%   by \SetRowColor, so spanning cells match the rest of their row.
\newcommand{\mymulticolumn}[3]{%
  \multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}%
}
% Column type x{<width>}: a p{<width>} paragraph column set ragged-right.
\newcolumntype{x}[1]{%
  >{\raggedright}%
  p{#1}%
}
% \tn ends a table row. \raggedright in the x column changes the meaning of
% \\ inside the cell, so rows are terminated with \tabularnewline instead.
\newcommand\tn{\tabularnewline}

% Font and Colours
% Palette, given as HTML hex values:
%   HeadBackground  - defined here but not referenced elsewhere in this file
%   FootBackground  - heading rows of the footer tables
%   TextColor       - default colour of running text (see \color below)
%   DarkBackground  - section heading rows, table bottom rules, header text
%   LightBackground - content rows of the numbered sections
\definecolor{HeadBackground}{HTML}{333333}
\definecolor{FootBackground}{HTML}{666666}
\definecolor{TextColor}{HTML}{333333}
\definecolor{DarkBackground}{HTML}{559BA3}
\definecolor{LightBackground}{HTML}{F4F8F9}
% Use the sans-serif family as the document default.
\renewcommand{\familydefault}{\sfdefault}
% Issued in the preamble so that TextColor becomes the colour in effect when
% the document body starts; it must come after the \definecolor above.
\color{TextColor}

% Header and Footer
% The fancy page style is selected and both header and footer are cleared
% before the custom content is installed.
\pagestyle{fancy}
\fancyhead{} % Set header to blank
\fancyfoot{} % Set footer to blank
% Header: a multicols layout with the logo in the first column and the sheet
% title plus author/URL line after the column break.
% The logo lives at an absolute path on the generating server. \IfFileExists
% keeps the document compilable elsewhere: when the file is missing, a plain
% text wordmark is typeset instead of aborting with "File not found".
\fancyhead[L]{
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{C}
    \SetRowColor{DarkBackground}
    \vspace{-7pt}
    {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent
        \hspace*{-6pt}\IfFileExists{/web/www.cheatography.com/public/images/cheatography_logo.pdf}%
            {\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}}%
            {\textcolor{white}{\textbf{\Large Cheatography}}}}
    }
\end{tabulary}
\columnbreak
\begin{tabulary}{11cm}{L}
    % \large stays in force until the end of the cell so that a wrapped title
    % keeps the line spacing of the larger size; only the bold is scoped.
    \vspace{-2pt}\large\textbf{\textcolor{DarkBackground}{\textrm{Awk one-liners for FASTA manipulation version 1.0 Cheat Sheet}}} \\
    \normalsize by \textcolor{DarkBackground}{melissamlwong} via \textcolor{DarkBackground}{\uline{cheatography.com/22270/cs/4523/}}
\end{tabulary}
\end{multicols}}

% Footer: three columns (author, publication data, sponsor), each a small
% table whose first row is a FootBackground-coloured heading with white text.
% \bfseries replaces the obsolete two-letter font switch \bf.
\fancyfoot[L]{ \footnotesize
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{LL}
  \SetRowColor{FootBackground}
  \mymulticolumn{2}{p{5.377cm}}{\bfseries\textcolor{white}{Cheatographer}}  \\
  \vspace{-2pt}melissamlwong \\
  \uline{cheatography.com/melissamlwong} \\
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\bfseries\textcolor{white}{Cheat Sheet}}  \\
   \vspace{-2pt}Published 2nd July, 2015.\\
   Updated 12th May, 2016.\\
   % LastPage is the label provided by the lastpage package.
   Page {\thepage} of \pageref{LastPage}.
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\bfseries\textcolor{white}{Sponsor}}  \\
  \SetRowColor{white}
  \vspace{-5pt}
  %\includegraphics[width=48px,height=48px]{dave.jpeg}
  Measure your website readability!\\
  www.readability-score.com
\end{tabulary}
\end{multicols}}




\begin{document}
\raggedright
\raggedcolumns

% Set the body font size (currently \footnotesize). Switch to any other
% size command from this page to resize the cheat sheet text:
% www.emerson.emory.edu/services/latex/latex_169.html
\footnotesize % Body text size.


% Introduction: a DarkBackground heading row followed by one white content
% row; the \hhline draws the closing rule in DarkBackground.
% The "% Row Count" markers inside the text are generator bookkeeping comments.
\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\bfseries\textcolor{white}{Introduction}}  \tn
\SetRowColor{white}
\mymulticolumn{1}{x{17.67cm}}{This cheat sheet contains 10 useful AWK one-liners for manipulating FASTA files. It was created as part of a series to help graduate students and biologists learn some simple programming scripts. Each one-liner is usually accompanied by additional comments, which start with a hash ("\#"). Runnable code is available at \seqsplit{http://code.runnable.com/VZsPvrVQ5JkyE\_ru/awk-one-liners-for-fasta-manipulation-for-shell-bash-and-bioinformatics} \newline % Row Count 9 (+ 9)
Author: Melissa M.L. Wong; Date created: 1 July 2015; Date last modified: 6 July 2015; Email: melissawongukm@gmail.com \newline % Row Count 12 (+ 3)
FASTA format is a text-based format for representing either nucleotide sequences or peptide sequences, in which nucleotides or amino acids are represented using single-letter codes. A FASTA sequence must start with a greater-than sign ("\textgreater{}"), followed by its name and a newline character ("\textbackslash{}n"), and lastly its sequence, which can span multiple lines.% Row Count 19 (+ 7)
} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Section 1: heading row plus one LightBackground content row.
% The one-liner uses printf ">%s" rather than print: with RS=">" every record
% in $0 already ends with a newline, so print would add a blank line after
% each matching sequence.
\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\bfseries\textcolor{white}{1. To find sequences with matching name}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{awk 'BEGIN\{RS="\textgreater{}";FS="\textbackslash{}n"\}NR\textgreater{}1\{if (\$1\textasciitilde{}/name/) printf "\textgreater{}\%s",\$0\}' file.fa} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Section 2: heading row plus one LightBackground content row; the \hhline
% draws the closing rule in DarkBackground.
% \bfseries replaces the obsolete font switch \bf in the heading.
\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\bfseries\textcolor{white}{2. To extract sequences using a list}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{awk 'BEGIN\{RS="\textgreater{}";FS="\textbackslash{}n"\}NR==FNR\{a{[}\$1{]}++\}NR\textgreater{}FNR\{if (\$1 in a \&\& \$0!="") printf "\textgreater{}\%s",\$0\}' list file.fa \newline \#The names in the list must start with "\textgreater{}" and each name is separated by a newline ("\textbackslash{}n")} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Section 3: heading row plus one LightBackground content row; the \hhline
% draws the closing rule in DarkBackground.
% \bfseries replaces the obsolete font switch \bf in the heading.
\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\bfseries\textcolor{white}{3. To join multiple lines into single line}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{awk 'BEGIN\{RS="\textgreater{}";FS="\textbackslash{}n"\}NR\textgreater{}1\{seq="";for (i=2;i\textless{}=NF;i++) seq=seq""\$i; print "\textgreater{}"\$1"\textbackslash{}n"seq\}' file.fa \newline \#Single line sequence is desirable when a sequence is long and spans many lines. Furthermore, single line sequence is much easier to manipulate using AWK one-liners, as shown in the next few examples.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Section 4: heading row plus one LightBackground content row holding three
% commented one-liners separated by \newline.
% \bfseries replaces the obsolete font switch \bf in the heading.
\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\bfseries\textcolor{white}{4. To print specified sequence region}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{\#To print the sequence starting from position 1 until 2213 \newline awk 'BEGIN\{RS="\textgreater{}";FS="\textbackslash{}n"\}NR\textgreater{}1\{seq="";for (i=2;i\textless{}=NF;i++) seq=seq""\$i; print "\textgreater{}"\$1"\textbackslash{}n"substr(seq,1,2213)\}' file.fa \newline \#To print sequence starting from position 399 until 704 \newline awk 'BEGIN\{RS="\textgreater{}";FS="\textbackslash{}n"\}NR\textgreater{}1\{seq="";for (i=2;i\textless{}=NF;i++) seq=seq""\$i; print "\textgreater{}"\$1"\textbackslash{}n"substr(seq,399,704-399+1)\}' file.fa \newline \#To print sequence with matching name from position 399 until 704 \newline awk 'BEGIN\{RS="\textgreater{}";FS="\textbackslash{}n"\}NR\textgreater{}1\{seq="";for (i=2;i\textless{}=NF;i++) seq=seq""\$i; if (\$1\textasciitilde{}/name/) print "\textgreater{}"\$1"\textbackslash{}n"substr(seq,399,704-399+1)\}' file.fa \newline \#Useful to print sequence region when given start position and stop position or length} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Section 5: heading row plus one LightBackground content row.
% The number of output lines per sequence is ceil(length/100), written as
% int((length+99)/100). The former length/100+1 printed an extra empty line
% whenever the sequence length was an exact multiple of 100.
\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\bfseries\textcolor{white}{5. To reformat into 100 characters per line}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{awk 'BEGIN\{RS="\textgreater{}";FS="\textbackslash{}n"\}NR\textgreater{}1\{seq="";for (i=2;i\textless{}=NF;i++) seq=seq""\$i;a{[}\$1{]}=seq;b{[}\$1{]}=length(seq)\}END\{for (i in a) \{k=int((b{[}i{]}+99)/100); printf "\textgreater{}\%s\textbackslash{}n",i;for (j=1;j\textless{}=int(k);j++) printf "\%s\textbackslash{}n", substr(a{[}i{]},1+(j-1)*100,100)\}\}' fasta.txt} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Section 6: heading row plus one LightBackground content row; the \hhline
% draws the closing rule in DarkBackground.
% \bfseries replaces the obsolete font switch \bf in the heading.
\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\bfseries\textcolor{white}{6. To substitute nucleotide sequences}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{\#To substitute small letter with capital letter \newline awk 'BEGIN\{RS="\textgreater{}";FS="\textbackslash{}n"\}NR\textgreater{}1\{printf "\textgreater{}\%s\textbackslash{}n",\$1;for (i=2;i\textless{}=NF;i++) \{gsub(/c/,"C",\$i);gsub(/a/,"A",\$i);gsub(/g/,"G",\$i);gsub(/t/,"T",\$i); printf "\%s\textbackslash{}n",\$i\}\}' file.fa} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Section 7: heading row plus one LightBackground content row; the \hhline
% draws the closing rule in DarkBackground.
% \bfseries replaces the obsolete font switch \bf in the heading.
\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\bfseries\textcolor{white}{7. To convert DNA to RNA}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{awk 'BEGIN\{RS="\textgreater{}";FS="\textbackslash{}n"\}NR\textgreater{}1\{printf "\textgreater{}\%s\textbackslash{}n",\$1;for (i=2;i\textless{}=NF;i++) \{gsub(/T/,"U",\$i); printf "\%s\textbackslash{}n",\$i\}\}' file.fa} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Section 8: heading row plus one LightBackground content row; the \hhline
% draws the closing rule in DarkBackground.
% \bfseries replaces the obsolete font switch \bf in the heading.
\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\bfseries\textcolor{white}{8. To summarize sequence content}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{awk 'BEGIN\{RS="\textgreater{}";FS="\textbackslash{}n";print "name\textbackslash{}tA\textbackslash{}tC\textbackslash{}tG\textbackslash{}tT\textbackslash{}tN\textbackslash{}tlength\textbackslash{}tGC\%"\}NR\textgreater{}1\{sumA=0;sumT=0;sumC=0;sumG=0;sumN=0;seq="";for (i=2;i\textless{}=NF;i++) seq=seq""\$i; k=length(seq); for (i=1;i\textless{}=k;i++) \{if (substr(seq,i,1)=="T") sumT+=1; else if (substr(seq,i,1)=="A") sumA+=1; else if (substr(seq,i,1)=="G") sumG+=1; else if (substr(seq,i,1)=="C") sumC+=1; else if (substr(seq,i,1)=="N") sumN+=1\}; print \$1"\textbackslash{}t"sumA"\textbackslash{}t"sumC"\textbackslash{}t"sumG"\textbackslash{}t"sumT"\textbackslash{}t"sumN"\textbackslash{}t"k"\textbackslash{}t"(sumC+sumG)/k*100\}' file.fa \newline \#Calculate number of each nucleotide, total length and GC content} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Section 9: heading row plus one LightBackground content row.
% The one-liner resets seq and x at the start of every record; without the
% reset, each record after the first also contained all earlier sequences.
% The printf format ends with a newline so that consecutive records are not
% glued together on one line.
\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\bfseries\textcolor{white}{9. To reverse complement nucleotide sequences}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{awk 'BEGIN\{RS="\textgreater{}";FS="\textbackslash{}n";a{[}"T"{]}="A";a{[}"A"{]}="T";a{[}"C"{]}="G";a{[}"G"{]}="C";a{[}"N"{]}="N"\}NR\textgreater{}1\{seq="";x="";for (i=2;i\textless{}=NF;i++) \seqsplit{seq=seq""\$i;for(i=length(seq);i}!=0;i-{}-) \{k=substr(seq,i,1);x=x a{[}k{]}\}; printf "\textgreater{}\%s\textbackslash{}n\%s\textbackslash{}n",\$1,x\}' file.fa \newline \#This will produce a single line sequence} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Section 10: heading row plus one LightBackground content row; the \hhline
% draws the closing rule in DarkBackground.
% \bfseries replaces the obsolete font switch \bf in the heading.
\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\bfseries\textcolor{white}{10. To convert FASTQ to FASTA format}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{awk 'NR\%4==1\{print "\textgreater{}"substr(\$0,2)\}NR\%4==2\{print \$0\}' file.fq \newline \#print first and second line of every four lines. Replace the first character of the first line with "\textgreater{}".} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}



\end{document}