\documentclass[10pt,a4paper]{article}

% ---------------------------------------------------------------------------
% Preamble: package loading, PDF metadata and page geometry for the cheat
% sheet. One statement per line so that trailing `%` comments cannot swallow
% the code that follows them.
% ---------------------------------------------------------------------------

% Packages
\usepackage{fancyhdr} % For header and footer
\usepackage{multicol} % Multi-column layout (multicols in header and footer)
\usepackage{tabularx} % Intelligent column widths
\usepackage{tabulary} % Used in header and footer
\usepackage{hhline} % Border under tables
\usepackage{graphicx} % For images
\usepackage{xcolor} % For hex colours
%\usepackage[utf8x]{inputenc} % For unicode character support
\usepackage[T1]{fontenc} % Without this we get weird character replacements
\usepackage{colortbl} % For coloured tables
\usepackage{setspace} % For line height
\usepackage{lastpage} % Needed for total page number
\usepackage{seqsplit} % Splits long words.
%\usepackage{opensans} % Can't make this work so far. Shame. Would be lovely.
\usepackage[normalem]{ulem} % For underlining links
% Most of the following are not required for the majority
% of cheat sheets but are needed for some symbol support.
\usepackage{amsmath} % Symbols
\usepackage{MnSymbol} % Symbols
\usepackage{wasysym} % Symbols
%\usepackage[english,german,french,spanish,italian]{babel} % Languages

% Document Info
\author{melissamlwong}
\pdfinfo{
  /Title (awk-one-liners-for-blast-results-manipulation.pdf)
  /Creator (Cheatography)
  /Author (melissamlwong)
  /Subject (Awk one-liners for blast results manipulation Cheat Sheet)
}

% Lengths and widths
\addtolength{\textwidth}{6cm}
\addtolength{\textheight}{-1cm}
\addtolength{\hoffset}{-3cm}
\addtolength{\voffset}{-2cm}
\setlength{\tabcolsep}{0.2cm} % Space between columns
\setlength{\headsep}{-12pt} % Reduce space between header and content
\setlength{\headheight}{85pt} % If less, LaTeX automatically increases it
\renewcommand{\footrulewidth}{0pt} % Remove footer line
\renewcommand{\headrulewidth}{0pt} % Remove header line
\renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit

% These two commands together give roughly
% the right line height in the tables
\renewcommand{\arraystretch}{1.3}
\onehalfspacing

% Commands
% \SetRowColor{<colour>}: remembers <colour> globally in \RowColorName and
% applies it to the current table row via \rowcolor.
\newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}} % Shortcut for row colour
% \mymulticolumn{<n>}{<colspec>}{<text>}: a \multicolumn whose cell is filled
% with the colour last stored by \SetRowColor.
\newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}} % For coloured multi-cols
\newcolumntype{x}[1]{>{\raggedright}p{#1}} % New column types for ragged-right paragraph columns
\newcommand{\tn}{\tabularnewline} % Required as custom column type in use

% Font and Colours
\definecolor{HeadBackground}{HTML}{333333}
\definecolor{FootBackground}{HTML}{666666}
\definecolor{TextColor}{HTML}{333333}
\definecolor{DarkBackground}{HTML}{5E56A3}
\definecolor{LightBackground}{HTML}{F4F4F9}
\renewcommand{\familydefault}{\sfdefault}
\color{TextColor}

% Header and Footer
\pagestyle{fancy}
\fancyhead{} % Set header to blank
\fancyfoot{} % Set footer to blank
% Header: logo on the left, sheet title and author/link on the right.
% NOTE(review): the logo is referenced by an absolute server path; the
% document only compiles where that file exists.
\fancyhead[L]{
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{C}
    \SetRowColor{DarkBackground}
    \vspace{-7pt}
    {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent
        \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}}
    }
\end{tabulary}
\columnbreak
\begin{tabulary}{11cm}{L}
    \vspace{-2pt}\large{\textbf{\textcolor{DarkBackground}{\textrm{Awk one-liners for blast results manipulation Cheat Sheet}}}} \\
    \normalsize{by \textcolor{DarkBackground}{melissamlwong} via \textcolor{DarkBackground}{\uline{cheatography.com/22270/cs/5331/}}}
\end{tabulary}
\end{multicols}}
% Footer: three columns (author, publication info with page count, sponsor).
\fancyfoot[L]{ \footnotesize
\noindent
\begin{multicols}{3}
\begin{tabulary}{5.8cm}{LL}
  \SetRowColor{FootBackground}
  \mymulticolumn{2}{p{5.377cm}}{\textbf{\textcolor{white}{Cheatographer}}} \\
  \vspace{-2pt}melissamlwong \\
  \uline{cheatography.com/melissamlwong} \\
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\textbf{\textcolor{white}{Cheat Sheet}}} \\
  \vspace{-2pt}Published 5th October, 2015.\\
  Updated 12th May, 2016.\\
  Page {\thepage} of \pageref{LastPage}.
\end{tabulary}
\vfill
\columnbreak
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\textbf{\textcolor{white}{Sponsor}}} \\
  \SetRowColor{white}
  \vspace{-5pt}
  %\includegraphics[width=48px,height=48px]{dave.jpeg}
  Measure your website readability!\\
  www.readability-score.com
\end{tabulary}
\end{multicols}}

\begin{document}
\raggedright
\raggedcolumns

% Set font size to small. Switch to any value
% from this page to resize cheat sheet text:
% www.emerson.emory.edu/services/latex/latex_169.html
\footnotesize % Small font.

% Each section below is a one-column tabularx: a dark title row, one content
% row, and a coloured rule underneath. No blank lines inside the tables.
\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\textbf{\textcolor{white}{Introduction}}} \tn
\SetRowColor{white}
\mymulticolumn{1}{x{17.67cm}}{This cheatsheet contains 10 useful AWK one-liners for tab-delimited blast results. It was created as part of a series to help graduate students and biologists learn some simple programming scripts. Each one-liner is usually accompanied by additional comments which start with a hash ("\#"). Runnable code is available at \seqsplit{http://code.runnable.com/VfItWNXUYTcrUkwn/10-awk-one-liners-for-blast-results-manipulation-for-shell-bash-and-bioinformatics} \newline % Row Count 10 (+ 10)
Author: Melissa M.L. Wong; Date created: 6 Aug 2015; Date last modified: 21 April 2016; Email: melissawongukm@gmail.com \newline % Row Count 13 (+ 3)
Tab-delimited blast results are text-based files that show pairwise alignments between two sequences. They are generated using the option "-outfmt 6" or "-m 8". Each column is separated by a tab and represents queryId(\$1), subjectId(\$2), percIdentity(\$3), alnLength(\$4), mismatchCount(\$5), gapOpenCount(\$6), queryStart(\$7), queryEnd(\$8), subjectStart(\$9), subjectEnd(\$10), eValue(\$11) and bitScore(\$12) respectively.% Row Count 22 (+ 9)
} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\textbf{\textcolor{white}{1. To filter alignment}}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{awk '\$1\textasciitilde{}/Medtr1g006460.1/' temp.blast \#matching query name \newline
awk '\$2\textasciitilde{}/Medtr0/' temp.blast \#matching reference name \newline
awk '\$12\textgreater{}=1000' temp.blast \#score \newline
awk '\$3\textgreater{}=80' temp.blast \#identity percentage \newline
awk '\$11\textless{}1e-30' temp.blast \#e-value} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Method 2 splits the query ($1) and the subject ($2) names on "." so that
% the two name prefixes can be compared with each other.
% Method 3 tests membership with "in": a plain a[key]==0 lookup would create
% empty array elements that the END loop then prints as blank lines.
\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\textbf{\textcolor{white}{2. To filter all against all blast results}}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{\#method 1 - remove blast results of the same sequence and apply filtering \newline
blastn -task megablast -db database1 -query temp.fa -evalue 1E-10 -outfmt 6 | awk '\$1!=\$2 \&\& \$3\textgreater{}=40 \&\& \$4\textgreater{}=300' \newline
\#method 2 - remove blast results between sequences sharing the same name prefix (text before the first ".") and apply filtering \newline
blastn -task megablast -db database1 -query temp.fa -evalue 1E-10 -outfmt 6 | awk '\{split(\$1,a,"."); split(\$2,b,"."); if (a{[}1{]}!=b{[}1{]} \&\& \$3\textgreater{}=40 \&\& \$4\textgreater{}=300) print \}' \newline
\#method 3 - remove redundant alignments. Any alignment in all-against-all blast can appear twice as seq1\textbackslash{}tseq2 and seq2\textbackslash{}tseq1. Both alignments can sometimes vary in length by 1-2 bp, however, they always share the same score. \newline
awk '\{c=\$1"\textbackslash{}t"\$2"\textbackslash{}t"\$12 ; b= \$2"\textbackslash{}t"\$1"\textbackslash{}t"\$12; if (\$1!=\$2 \&\& !(c in a) \&\& !(b in a)) a{[}c{]}=\$0\}END\{for (i in a) print a{[}i{]}\}' temp.txt \textgreater{} temp.blast \#membership is tested with "in" so lookups do not create empty entries} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Method 2 reads the length from the query name ($1) and from the subject
% name ($2); both are expected to carry it as the second "_"-separated field.
\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\textbf{\textcolor{white}{3. To filter alignments based on sequence length}}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{\#method 1 - calculate sequence length, calculate percentage of alignment length against sequence length, filter blast file \newline
awk 'BEGIN\{RS="\textgreater{}";FS="\textbackslash{}n"\}NR\textgreater{}1\{seq="";for (i=2;i\textless{}=NF;i++) seq=seq""\$i; print \$1"\textbackslash{}t"length(seq)\}' temp.fa \textgreater{} len1 \newline
awk 'NR==FNR\{a{[}NR{]}=\$1"\textbackslash{}t"\$2"\textbackslash{}t"\$4;d{[}NR{]}=\$0;sum+=1\}NR\textgreater{}FNR\{b{[}\$1{]}=\$2\}END\{for (i=1;i\textless{}=sum;i++) \{split(a{[}i{]},c,"\textbackslash{}t"); if (c{[}3{]}/b{[}c{[}1{]}{]}\textgreater{}=0.8 \&\& c{[}3{]}/b{[}c{[}2{]}{]}\textgreater{}=0.8) print d{[}i{]}\}\}' temp.blast len1 len1 \newline
\#method 2 - if length information is included in fasta header \newline
awk '\{split(\$1,a,"\_"); split(\$2,b,"\_"); c=a{[}2{]};d=b{[}2{]}; if (\$4/c\textgreater{}=0.8 \&\& \$4/d\textgreater{}=0.8) print \$0\}' temp.blast \#if length in header and separated by "\_"} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\textbf{\textcolor{white}{4. To count the number of queries}}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{awk '! a{[}\$1{]}++' temp.blast | wc -l \newline
awk '\{a{[}\$1{]}++\}END\{for (i in a) sum+=1; print sum\}' temp.blast \#equivalent script but faster} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\textbf{\textcolor{white}{5. To count the number of alignments per query}}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{awk '\{a{[}\$1{]}++\}END\{for (i in a) print i"\textbackslash{}t"a{[}i{]}\}' temp.blast} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\textbf{\textcolor{white}{6. To find best hit for a query}}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{\#method 1 - Use the first alignment per sequence assuming the best hit is always listed first \newline
awk '! a{[}\$1{]}++' temp.blast \newline
\#method 2 - Use total score assuming each query can have multiple alignments to a reference sequence. In my opinion, this is the best way except in cases where multiple alignments to the same region of a pair of query and reference are reported. \newline
awk '\{b{[}\$1{]}="0"; e{[}\$1{]}="";if (a{[}\$1,\$2{]}=="0") a{[}\$1,\$2{]}=\$12; else \{score=a{[}\$1,\$2{]}+\$12; a{[}\$1,\$2{]}=score\}\}END\{for (i in b) for (j in a) \{split(j,c,SUBSEP); if (c{[}1{]}==i \&\& a{[}j{]}\textgreater{}b{[}i{]}) \{b{[}i{]}=a{[}j{]};e{[}i{]}=c{[}2{]}\}\}; for (i in b) print i"\textbackslash{}t"e{[}i{]}"\textbackslash{}t"b{[}i{]}\}' temp.blast} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\textbf{\textcolor{white}{7. To find reciprocal best hit for a query}}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{\#An extension of the finding best hit script by making sure that a query is a reference's best hit and vice versa \newline
awk '\{a{[}\$1{]}="0";b{[}\$1{]}="";c{[}\$2{]}="0";d{[}\$2{]}="";if (e{[}\$1,\$2{]}==0) e{[}\$1,\$2{]}=\$12; else \{score=e{[}\$1,\$2{]}+\$12; e{[}\$1,\$2{]}=score\}\}END\{for (i in a) for (j in e) \{split(j,f,SUBSEP); if (f{[}1{]}==i \&\& e{[}j{]}\textgreater{}a{[}i{]}) \{a{[}i{]}=e{[}j{]};b{[}i{]}=f{[}2{]}\}\}; for (i in c) for (j in e) \{split(j,f,SUBSEP); if (f{[}2{]}==i \&\& e{[}j{]}\textgreater{}c{[}i{]}) \{c{[}i{]}=e{[}j{]};d{[}i{]}=f{[}1{]}\}\}; for (i in b) if (b{[}i{]} in d \&\& d{[}b{[}i{]}{]}==i) print i"\textbackslash{}t"b{[}i{]}"\textbackslash{}t"a{[}i{]}"\textbackslash{}t"c{[}b{[}i{]}{]}\}' temp.blast \#need to debug} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\textbf{\textcolor{white}{8. To extract one sequence}}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{awk 'NR==FNR\{if (\$1\textasciitilde{}/Medtr1g006460.1/) a{[}\$1{]}++\}NR\textgreater{}FNR\{if (\$1 in a \&\& \$1!="") printf "\textgreater{}\%s\textbackslash{}n",\$0\}' RS="\textbackslash{}n" FS="\textbackslash{}t" temp.blast RS="\textgreater{}" FS="\textbackslash{}n" temp.fa} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% The ten consecutive tabs keep the score in column 12 while columns 3-11
% are left empty.
\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\textbf{\textcolor{white}{9. To reduce blast file size}}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{\#blank out unnecessary columns by replacing them with empty strings. For example, we are only interested in the query name, reference name and score. \newline
awk '\{print \$1"\textbackslash{}t"\$2"\textbackslash{}t\textbackslash{}t\textbackslash{}t\textbackslash{}t\textbackslash{}t\textbackslash{}t\textbackslash{}t\textbackslash{}t\textbackslash{}t\textbackslash{}t"\$12\}' temp.blast} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% NOTE(review): the title says "reference sequence", but the script groups
% by $1, which the introduction defines as queryId - confirm the intent.
\begin{tabularx}{17.67cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{17.67cm}}{\textbf{\textcolor{white}{10. To list all hits for each reference sequence}}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{17.67cm}}{awk '\{a{[}\$1{]}++;b{[}\$1,\$2{]}++\}END\{for (i in a) \{printf "\%s", i; for (j in b) \{split(j,c,SUBSEP); if (c{[}1{]}==i) printf " \%s", c{[}2{]}\};printf "\textbackslash{}n"\}\}' temp.blast} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\end{document}