\documentclass[10pt,a4paper]{article}

% Packages
% NOTE: each statement sits on its own line on purpose. A trailing `%` comment
% runs to the end of the physical line, so joining these lines would silently
% disable every package load that follows the first comment.
\usepackage{fancyhdr} % For header and footer
\usepackage{multicol} % Allows multicols in tables
\usepackage{tabularx} % Intelligent column widths
\usepackage{tabulary} % Used in header and footer
\usepackage{hhline} % Border under tables
\usepackage{graphicx} % For images
\usepackage{xcolor} % For hex colours
%\usepackage[utf8x]{inputenc} % For unicode character support
\usepackage[T1]{fontenc} % Without this we get weird character replacements
\usepackage{colortbl} % For coloured tables
\usepackage{setspace} % For line height
\usepackage{lastpage} % Needed for total page number
\usepackage{seqsplit} % Splits long words.
%\usepackage{opensans} % Can't make this work so far. Shame. Would be lovely.
\usepackage[normalem]{ulem} % For underlining links
% Most of the following are not required for the majority
% of cheat sheets but are needed for some symbol support.
\usepackage{amsmath} % Symbols
\usepackage{MnSymbol} % Symbols
\usepackage{wasysym} % Symbols
%\usepackage[english,german,french,spanish,italian]{babel} % Languages

% Document Info
\author{melissamlwong}
\pdfinfo{
  /Title (awk-one-liners-for-fasta-manipulation-version-1-0.pdf)
  /Creator (Cheatography)
  /Author (melissamlwong)
  /Subject (Awk one-liners for FASTA manipulation version 1.0 Cheat Sheet)
}

% Lengths and widths
\addtolength{\textwidth}{6cm}
\addtolength{\textheight}{-1cm}
\addtolength{\hoffset}{-3cm}
\addtolength{\voffset}{-2cm}
\setlength{\tabcolsep}{0.2cm} % Space between columns
\setlength{\headsep}{-12pt} % Reduce space between header and content
\setlength{\headheight}{85pt} % If less, LaTeX automatically increases it
\renewcommand{\footrulewidth}{0pt} % Remove footer line
\renewcommand{\headrulewidth}{0pt} % Remove header line
\renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit
% These two commands together give roughly
% the right line height in the tables
\renewcommand{\arraystretch}{1.3}
\onehalfspacing

% Commands
% \SetRowColor{<colour>}: stores the colour name globally in \RowColorName
% and applies it to the current table row via \rowcolor.
\newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}} % Shortcut for row colour
% \mymulticolumn{<n>}{<colspec>}{<text>}: \multicolumn whose column spec is
% prefixed with \columncolor{\RowColorName}, so the spanned cell keeps the
% colour last set by \SetRowColor.
\newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}} % For coloured multi-cols
\newcolumntype{x}[1]{>{\raggedright}p{#1}} % New column types for ragged-right paragraph columns
\newcommand{\tn}{\tabularnewline} % Required as custom column type in use

% Font and Colours
\definecolor{HeadBackground}{HTML}{333333}
\definecolor{FootBackground}{HTML}{666666}
\definecolor{TextColor}{HTML}{333333}
\definecolor{DarkBackground}{HTML}{559BA3}
\definecolor{LightBackground}{HTML}{F4F8F9}
\renewcommand{\familydefault}{\sfdefault}
\color{TextColor}

% Header and Footer
\pagestyle{fancy}
\fancyhead{} % Set header to blank
\fancyfoot{} % Set footer to blank
% NOTE(review): the logo below is referenced by an absolute path on the
% generating server; the header only compiles where that file exists.
\fancyhead[L]{
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{C}
    \SetRowColor{DarkBackground}
    \vspace{-7pt}
    {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent
        \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}}
    }
\end{tabulary}
\columnbreak
\begin{tabulary}{11cm}{L}
    \vspace{-2pt}\large\textbf{\textcolor{DarkBackground}{\textrm{Awk one-liners for FASTA manipulation version 1.0 Cheat Sheet}}} \\
    \normalsize by \textcolor{DarkBackground}{melissamlwong} via \textcolor{DarkBackground}{\uline{cheatography.com/22270/cs/4523/}}
\end{tabulary}
\end{multicols}}

\fancyfoot[L]{
\footnotesize
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{LL}
    \SetRowColor{FootBackground}
    \mymulticolumn{2}{p{5.377cm}}{\bfseries\textcolor{white}{Cheatographer}} \\
    \vspace{-2pt}melissamlwong \\
    \uline{cheatography.com/melissamlwong} \\
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
    \SetRowColor{FootBackground}
    \mymulticolumn{1}{p{5.377cm}}{\bfseries\textcolor{white}{Cheat Sheet}} \\
    \vspace{-2pt}Published 2nd July, 2015.\\
    Updated 12th May, 2016.\\
    Page {\thepage} of \pageref{LastPage}.
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
    \SetRowColor{FootBackground}
    \mymulticolumn{1}{p{5.377cm}}{\bfseries\textcolor{white}{Sponsor}} \\
    \SetRowColor{white}
    \vspace{-5pt}
    %\includegraphics[width=48px,height=48px]{dave.jpeg}
    Measure your website readability!\\
    www.readability-score.com
\end{tabulary}
\end{multicols}}

\begin{document}
\raggedright
\raggedcolumns

% Set font size to small. Switch to any value
% from this page to resize cheat sheet text:
% www.emerson.emory.edu/services/latex/latex_169.html
\footnotesize % Small font.

% Each section below is a one-column tabularx: a dark title row, one or more
% content rows, and a closing rule drawn with \hhline in DarkBackground.

\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\bfseries\textcolor{white}{Introduction}} \tn
\SetRowColor{white}
\mymulticolumn{1}{x{17.67cm}}{This cheatsheet contains 10 useful AWK one-liners for manipulation of FASTA files. It is created as part of a series to help graduate students and biologists in learning some simple programming scripts. Each oneliner is usually accompanied by additional comments which start with a hash ("\#"). Runnable code is available at \seqsplit{http://code.runnable.com/VZsPvrVQ5JkyE\_ru/awk-one-liners-for-fasta-manipulation-for-shell-bash-and-bioinformatics} \newline % Row Count 9 (+ 9)
Author: Melissa M.L. Wong; Date created: 1 July 2015; Date last modified: 6 July 2015; Email: melissawongukm@gmail.com \newline % Row Count 12 (+ 3)
FASTA format is a text-based format for representing either nucleotide sequences or peptide sequences, in which nucleotides or amino acids are represented using single-letter codes. A fasta sequence must start with an arrow ("\textgreater{}"), followed by its name and a newline character ("\textbackslash{}n"), and lastly its sequence which can span multiple lines.% Row Count 19 (+ 7)
} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\bfseries\textcolor{white}{1. To find sequences with matching name}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{awk 'BEGIN\{RS="\textgreater{}";FS="\textbackslash{}n"\}NR\textgreater{}1\{if (\$1\textasciitilde{}/name/) print "\textgreater{}"\$0\}' file.fa} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\bfseries\textcolor{white}{2. To extract sequences using a list}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{awk 'BEGIN\{RS="\textgreater{}";FS="\textbackslash{}n"\}NR==FNR\{a{[}\$1{]}++\}NR\textgreater{}FNR\{if (\$1 in a \&\& \$0!="") printf "\textgreater{}\%s",\$0\}' list file.fa \newline
\#The names in the list must start with "\textgreater{}" and each name is separated by a newline ("\textbackslash{}n")} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\bfseries\textcolor{white}{3. To join multiple lines into single line}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{awk 'BEGIN\{RS="\textgreater{}";FS="\textbackslash{}n"\}NR\textgreater{}1\{seq="";for (i=2;i\textless{}=NF;i++) seq=seq""\$i; print "\textgreater{}"\$1"\textbackslash{}n"seq\}' file.fa \newline
\#Single line sequence is desirable when a sequence is long and spans many lines. Furthermore, single line sequence is much easier to manipulate using AWK oneliners as shown in the next few examples.} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\bfseries\textcolor{white}{4. To print specified sequence region}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{\#To print the sequence starting from position 1 until 2213 \newline
awk 'BEGIN\{RS="\textgreater{}";FS="\textbackslash{}n"\}NR\textgreater{}1\{seq="";for (i=2;i\textless{}=NF;i++) seq=seq""\$i; print "\textgreater{}"\$1"\textbackslash{}n"substr(seq,1,2213)\}' file.fa \newline
\#To print sequence starting from position 399 until 704 \newline
awk 'BEGIN\{RS="\textgreater{}";FS="\textbackslash{}n"\}NR\textgreater{}1\{seq="";for (i=2;i\textless{}=NF;i++) seq=seq""\$i; print "\textgreater{}"\$1"\textbackslash{}n"substr(seq,399,704-399+1)\}' file.fa \newline
\#To print sequence with matching name from position 399 until 704 \newline
awk 'BEGIN\{RS="\textgreater{}";FS="\textbackslash{}n"\}NR\textgreater{}1\{seq="";for (i=2;i\textless{}=NF;i++) seq=seq""\$i; if (\$1\textasciitilde{}/name/) print "\textgreater{}"\$1"\textbackslash{}n"substr(seq,399,704-399+1)\}' file.fa \newline
\#Useful to print sequence region when given start position and stop position or length} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\bfseries\textcolor{white}{5. To reformat into 100 characters per line}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{awk 'BEGIN\{RS="\textgreater{}";FS="\textbackslash{}n"\}NR\textgreater{}1\{seq="";for (i=2;i\textless{}=NF;i++) seq=seq""\$i;a{[}\$1{]}=seq;b{[}\$1{]}=length(seq)\}END\{for (i in a) \{k=sprintf("\%d", (b{[}i{]}/100)+1); printf "\textgreater{}\%s\textbackslash{}n",i;for (j=1;j\textless{}=int(k);j++) printf "\%s\textbackslash{}n", substr(a{[}i{]},1+(j-1)*100,100)\}\}' fasta.txt} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\bfseries\textcolor{white}{6. To substitute nucleotide sequences}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{\#To substitute small letter with capital letter \newline
awk 'BEGIN\{RS="\textgreater{}";FS="\textbackslash{}n"\}NR\textgreater{}1\{printf "\textgreater{}\%s\textbackslash{}n",\$1;for (i=2;i\textless{}=NF;i++) \{gsub(/c/,"C",\$i);gsub(/a/,"A",\$i);gsub(/g/,"G",\$i);gsub(/t/,"T",\$i); printf "\%s\textbackslash{}n",\$i\}\}' file.fa} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\bfseries\textcolor{white}{7. To convert DNA to RNA}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{awk 'BEGIN\{RS="\textgreater{}";FS="\textbackslash{}n"\}NR\textgreater{}1\{printf "\textgreater{}\%s\textbackslash{}n",\$1;for (i=2;i\textless{}=NF;i++) \{gsub(/T/,"U",\$i); printf "\%s\textbackslash{}n",\$i\}\}' file.fa} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\bfseries\textcolor{white}{8. To summarize sequence content}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{awk 'BEGIN\{RS="\textgreater{}";FS="\textbackslash{}n";print "name\textbackslash{}tA\textbackslash{}tC\textbackslash{}tG\textbackslash{}tT\textbackslash{}tN\textbackslash{}tlength\textbackslash{}tGC\%"\}NR\textgreater{}1\{sumA=0;sumT=0;sumC=0;sumG=0;sumN=0;seq="";for (i=2;i\textless{}=NF;i++) seq=seq""\$i; k=length(seq); for (i=1;i\textless{}=k;i++) \{if (substr(seq,i,1)=="T") sumT+=1; else if (substr(seq,i,1)=="A") sumA+=1; else if (substr(seq,i,1)=="G") sumG+=1; else if (substr(seq,i,1)=="C") sumC+=1; else if (substr(seq,i,1)=="N") sumN+=1\}; print \$1"\textbackslash{}t"sumA"\textbackslash{}t"sumC"\textbackslash{}t"sumG"\textbackslash{}t"sumT"\textbackslash{}t"sumN"\textbackslash{}t"k"\textbackslash{}t"(sumC+sumG)/k*100\}' file.fa \newline
\#Calculate number of each nucleotide, total length and GC content} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\bfseries\textcolor{white}{9. To reverse complement nucleotide sequences}} \tn
\SetRowColor{LightBackground}
% The one-liner resets seq and x at the start of every record and ends each
% record with a newline; awk variables persist across records, so without the
% reset every sequence after the first would include the preceding ones.
\mymulticolumn{1}{x{17.67cm}}{awk 'BEGIN\{RS="\textgreater{}";FS="\textbackslash{}n";a{[}"T"{]}="A";a{[}"A"{]}="T";a{[}"C"{]}="G";a{[}"G"{]}="C";a{[}"N"{]}="N"\}NR\textgreater{}1\{seq="";x="";for (i=2;i\textless{}=NF;i++) \seqsplit{seq=seq""\$i;for(i=length(seq);i}!=0;i-{}-) \{k=substr(seq,i,1);x=x a{[}k{]}\}; printf "\textgreater{}\%s\textbackslash{}n\%s\textbackslash{}n",\$1,x\}' file.fa \newline
\#This will produce a single line sequence} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\bfseries\textcolor{white}{10. To convert FASTQ to FASTA format}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{awk 'NR\%4==1\{print "\textgreater{}"substr(\$0,2)\}NR\%4==2\{print \$0\}' file.fq \newline
\#Print first and second line of every four lines. Replace the first character of the first line with "\textgreater{}".} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\end{document}