\documentclass[10pt,a4paper]{article}

% ------------------------------------------------------------------
% Packages (load order is kept exactly as in the template)
% ------------------------------------------------------------------
% Page furniture and column layout
\usepackage{fancyhdr}      % \pagestyle{fancy}, \fancyhead, \fancyfoot
\usepackage{multicol}      % multicols / multicols* environments
% Tables
\usepackage{tabularx}      % X columns used by the body tables
\usepackage{tabulary}      % tabulary tables inside header and footer
\usepackage{hhline}        % \hhline rule drawn under table rows
% Graphics, colour and fonts
\usepackage{graphicx}      % \includegraphics for the logo
\usepackage{xcolor}        % \definecolor with HTML (hex) values
%\usepackage[utf8x]{inputenc} % Disabled: unicode character support
\usepackage[T1]{fontenc}   % Without this we get weird character replacements
\usepackage{colortbl}      % \rowcolor, \columncolor, \arrayrulecolor
\usepackage{setspace}      % \onehalfspacing (line height)
\usepackage{lastpage}      % LastPage label for the "Page x of y" footer
\usepackage{seqsplit}      % \seqsplit breaks long unspaced strings
%\usepackage{opensans}     % Disabled: could not be made to work so far
\usepackage[normalem]{ulem} % \uline for underlined links; \emph left untouched
% Symbol support: rarely needed, but some cheat sheets rely on it
\usepackage{amsmath}
\usepackage{MnSymbol}
\usepackage{wasysym}
%\usepackage[english,german,french,spanish,italian]{babel} % Disabled: languages

% ------------------------------------------------------------------
% Document information
% ------------------------------------------------------------------
\author{Taissir Boukrouba (taissir2002)}
\pdfinfo{
  /Title (biopython-basics.pdf)
  /Creator (Cheatography)
  /Author (Taissir Boukrouba (taissir2002))
  /Subject (Biopython Basics Cheat Sheet)
}

% ------------------------------------------------------------------
% Page geometry
% ------------------------------------------------------------------
% Horizontal: widen the text block and shift it left
\addtolength{\textwidth}{6cm}
\addtolength{\hoffset}{-3cm}
% Vertical: shorten the text block and shift it up
\addtolength{\textheight}{-1cm}
\addtolength{\voffset}{-2cm}
% Header box: pull the content up towards the header
\setlength{\headsep}{-12pt}
\setlength{\headheight}{85pt} % If less, LaTeX automatically increases it
% No rules above the footer or below the header
\renewcommand{\headrulewidth}{0pt}
\renewcommand{\footrulewidth}{0pt}

% ------------------------------------------------------------------
% Table and line spacing
% ------------------------------------------------------------------
\setlength{\tabcolsep}{0.2cm} % Horizontal padding between table columns
% Break points inserted by \seqsplit: \allowbreak in math, \- in text
\renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi}
% These two settings together give roughly
% the right line height in the tables
\renewcommand{\arraystretch}{1.3}
\onehalfspacing

% Commands
% \SetRowColor{<colour>}: stores the colour name globally in \RowColorName
% and colours the current table row with it.
\newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}}
% \mymulticolumn{<n>}{<colspec>}{<text>}: \multicolumn whose cell is filled
% with the colour last stored by \SetRowColor.
\newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}}
% x{<width>}: ragged-right paragraph column of the given width.
\newcolumntype{x}[1]{>{\raggedright}p{#1}}
% \tn: row terminator; required because the x column type uses \raggedright.
\newcommand{\tn}{\tabularnewline}

% Font and Colours
\definecolor{HeadBackground}{HTML}{333333}
\definecolor{FootBackground}{HTML}{666666}
\definecolor{TextColor}{HTML}{333333}
\definecolor{DarkBackground}{HTML}{1138B8}
\definecolor{LightBackground}{HTML}{F0F2FA}
\renewcommand{\familydefault}{\sfdefault}
\color{TextColor}

% Header and Footer
% Font changes below use \textbf and bare size switches instead of the
% obsolete \bf and the argument-like \large{...} / \normalsize{...} forms.
\pagestyle{fancy}
\fancyhead{} % Set header to blank
\fancyfoot{} % Set footer to blank
\fancyhead[L]{
\noindent
\begin{multicols}{3}
% Column 1: logo
\begin{tabulary}{5.8cm}{C}
    \SetRowColor{DarkBackground}
    \vspace{-7pt}
    {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent
        \hspace*{-6pt}\includegraphics[width=5.8cm]{/web/www.cheatography.com/public/images/cheatography_logo.pdf}}
    }
\end{tabulary}
\columnbreak
% Columns 2-3: title and author line
\begin{tabulary}{11cm}{L}
    \vspace{-2pt}\large\textbf{\textcolor{DarkBackground}{\textrm{Biopython Basics Cheat Sheet}}} \\
    \normalsize by \textcolor{DarkBackground}{Taissir Boukrouba (taissir2002)} via \textcolor{DarkBackground}{\uline{cheatography.com/193976/cs/42971/}}
\end{tabulary}
\end{multicols}}

\fancyfoot[L]{ \footnotesize
\noindent
\begin{multicols}{3}
% Footer column 1: author
\begin{tabulary}{5.8cm}{LL}
  \SetRowColor{FootBackground}
  \mymulticolumn{2}{p{5.377cm}}{\textbf{\textcolor{white}{Cheatographer}}}  \\
  \vspace{-2pt}Taissir Boukrouba (taissir2002) \\
  \uline{cheatography.com/taissir2002} \\
\end{tabulary}
\vfill
\columnbreak
% Footer column 2: publication status and page counter
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\textbf{\textcolor{white}{Cheat Sheet}}}  \\
  \vspace{-2pt}Not Yet Published.\\
  Updated 7th April, 2024.\\
  Page {\thepage} of \pageref{LastPage}.
\end{tabulary}
\vfill
\columnbreak
% Footer column 3: sponsor
\begin{tabulary}{5.8cm}{L}
  \SetRowColor{FootBackground}
  \mymulticolumn{1}{p{5.377cm}}{\textbf{\textcolor{white}{Sponsor}}}  \\
  \SetRowColor{white}
  \vspace{-5pt}
  %\includegraphics[width=48px,height=48px]{dave.jpeg}
  Measure your website readability!\\
  www.readability-score.com
\end{tabulary}
\end{multicols}}

\begin{document}
\raggedright
\raggedcolumns

% Set font size to small. Switch to any value
% from this page to resize cheat sheet text:
% www.emerson.emory.edu/services/latex/latex_169.html
\footnotesize % Small font.

\begin{multicols*}{3}

% Each section is one single-column tabularx: a dark title row, one or more
% light content rows, and an \hhline rule in the dark colour under each row.
% Inline code in prose rows is set with \texttt (the source had Markdown
% back-ticks, which LaTeX prints as quote glyphs).

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\textbf{\textcolor{white}{Installing Biopython}}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{%
  pip install biopython \newline
  pip install -{}-upgrade biopython \newline
  \newline
  \# import the library \newline
  import Bio}  \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\textbf{\textcolor{white}{Creating Sequences}}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{%
  from Bio.Seq import Seq \newline
  my\_seq = Seq("AATGCACGTTG")}  \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{To create a sequence we use the \texttt{Seq} function from \texttt{Bio} library}  \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\textbf{\textcolor{white}{Filling Sequences}}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{%
  \# filling sequences \newline
  fragments = {[}Seq("GTAT"), Seq("TACT"){]} \newline
  filler = Seq("A"*3) \newline
  print(filler.join(fragments)) \newline
  \newline
  \#output : \newline
  \textgreater{}\textgreater{}\textgreater{} GTATAAATACT}  \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Remaining sections of the cheat sheet. Each one is a single-column
% tabularx: dark title row, light content rows, \hhline rule under each row.
% Titles use \textbf (not the obsolete \bf); inline code in prose rows uses
% \texttt (the source had Markdown back-ticks, printed by LaTeX as quotes).

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\textbf{\textcolor{white}{Slicing Sequences}}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{%
  \# defining sequences \newline
  my\_seq = Seq("AAGTCCAGTGT") \newline
  my\_seq\_2 = Seq("AAAA") \newline
  \newline
  \# slicing sequences \newline
  print(my\_seq{[}1:6{]}) \newline
  print(my\_seq{[}0::2{]}) \newline
  \newline
  \# output : \newline
  \textgreater{}\textgreater{}\textgreater{} AGTCC \newline
  \textgreater{}\textgreater{}\textgreater{} AGCATT}  \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{Slicing Sequences is the same as that of a python list ( we use \texttt{{[}{]}})}  \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\textbf{\textcolor{white}{Appending Sequences}}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{%
  my\_seq = Seq("AAGTCCAGTGT") \newline
  my\_seq\_2 = Seq("AAAA") \newline
  \newline
  \#appending sequences \newline
  print(my\_seq + my\_seq\_2) \newline
  \newline
  \# output \newline
  \textgreater{}\textgreater{}\textgreater{} AAGTCCAGTGTAAAA}  \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{Appending sequences is the same as appending strings in python}  \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\textbf{\textcolor{white}{Sequence Counting}}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{%
  from Bio.Seq import Seq \newline
  \# creating a sequence \newline
  seq\_example = Seq("AGTACACTGGT") \newline
  \newline
  seq\_length = len(seq\_example) \newline
  occ = seq\_example.count("C") \newline
  \newline
  print("The length of the sequence is", len(seq\_example)) \newline
  print("The number of occurrences for nucleotide C is ", occ ) \newline
  \newline
  \#output : \newline
  The length of the sequence is 11 \newline
  The number of occurrences for nucleotide C is 2}  \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{This provides how to get the length of a sequence and the number of occurrences of a specific nucleotide}  \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\textbf{\textcolor{white}{Finding Sub-sequence Index}}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{%
  my\_seq = Seq("AAGTCCAGTGT") \newline
  index = my\_seq.find("GTC") \newline
  print(f"GTC index is \{index\}") \newline
  \newline
  \# output : \newline
  GTC index is 2}  \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{This returns the start index of the selected sub-sequence}  \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\textbf{\textcolor{white}{Reading Sequence Files}}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{%
  from Bio import SeqIO \newline
  \newline
  records = \seqsplit{SeqIO.parse("sequence\_file.fasta"}, "fasta") \newline
  for record in records : \newline
  print(record.seq)}  \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{%
  We can also access other attributes from the records : \newline
  - \texttt{record.seq} : returns one sequence from list of records \newline
  - \texttt{record.id} : returns the identifier of the sequence \newline
  - \texttt{record.description} : returns the sequence description}  \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Fix: the sample used the undefined name "sequence1" and wrapped an existing
% SeqRecord in a second SeqRecord. It now builds one record from Seq and
% SeqRecord (imported from their own modules) and writes that record.
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\textbf{\textcolor{white}{Writing Sequences into a file}}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{%
  from Bio import SeqIO \newline
  from Bio.Seq import Seq \newline
  from Bio.SeqRecord import SeqRecord \newline
  \newline
  \# Define your sequence as a string \newline
  sequence = "ATCGATCGATCGATCGATC" \newline
  \newline
  \# Defining file name and format \newline
  filename = "my\_sequence.fasta" \newline
  format = "fasta" \newline
  \newline
  \# Create a SeqRecord object \newline
  record = \seqsplit{SeqRecord(Seq(sequence)}, \newline
  id="my\_id", description="My sequence description") \newline
  \newline
  \# Open the file for writing in text mode \newline
  with open(filename, "w") as file: \newline
  \# Write the record to the file using the specified format \newline
  SeqIO.write(record, file, format)}  \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Fix: SeqIO.convert's optional argument is molecule_type in the Biopython
% releases that also provide gc_fraction (used further down); the sheet
% showed the older "alphabet" name. Also fixes the typo "extention".
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\textbf{\textcolor{white}{Converting Files}}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{%
  \# syntax \newline
  SeqIO.convert(inp\_file, inp\_format, outp\_file, outp\_format, molecule\_type=None) \newline
  \newline
  \#example \newline
  SeqIO.convert("sequence.gbk", "genbank", \seqsplit{"sequence\_converted.fasta"}, "fasta")}  \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{%
  inp\_file : path to input file \newline
  inp\_format : input file format/extension \newline
  outp\_file : path to output file \newline
  outp\_format : output file format/extension \newline
  molecule\_type : specify the correct molecule type (DNA, RNA or protein) to avoid conversion confusion}  \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\textbf{\textcolor{white}{Sequence Molecular Weight}}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{%
  from Bio.SeqUtils import molecular\_weight \newline
  from Bio.Seq import Seq \newline
  \newline
  seq\_example = Seq("TGTACCCTGGT") \newline
  mw = \seqsplit{molecular\_weight(seq\_example)} \newline
  \newline
  print(mw) \newline
  \newline
  \#output : \newline
  \textgreater{}\textgreater{}\textgreater{} 3403.1577}  \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\textbf{Molecular weight} is a way to guess how heavy a tiny building block of life (like a protein or piece of DNA) is compared to a single carbon atom where the bigger the building blocks , the higher the molecular weight}  \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\textbf{\textcolor{white}{GC-Content}}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{%
  from Bio.SeqUtils import gc\_fraction \newline
  from Bio.Seq import Seq \newline
  \newline
  \# creating a sequence \newline
  seq\_example = Seq("AGATTCACTGGT") \newline
  gc\_content = \seqsplit{gc\_fraction(seq\_example)} \newline
  print(gc\_content) \newline
  \newline
  \# output : \newline
  \textgreater{}\textgreater{}\textgreater{} 0.41}  \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{\textbf{G-C content} refers to the percentage of \textbf{guanine (G)} and \textbf{cytosine (C)} molecules out of all the building blocks (called nucleotides) in a strand of DNA or RNA.}  \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Fix: the shown output now matches what the print() call emits, and the
% comment typo "compliment" is corrected.
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\textbf{\textcolor{white}{Reverse Complement}}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{%
  from Bio.Seq import Seq \newline
  \newline
  \#creating a sequence \newline
  seq\_example = Seq("AGTACACTGGT") \newline
  print("Sequence is :",seq\_example) \newline
  \newline
  \# getting the reverse complement \newline
  rev\_comp = \seqsplit{seq\_example.reverse\_complement()} \newline
  print("Reverse complement:", rev\_comp) \newline
  \newline
  \#output : \newline
  \textgreater{}\textgreater{}\textgreater{} Sequence is : AGTACACTGGT \newline
  \textgreater{}\textgreater{}\textgreater{} Reverse complement: ACCAGTGTACT}  \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{%
  \textbf{Reverse complement} of a DNA sequence is like a mirror image on the opposite strand. \newline
  \newline
  - \textbf{Reverse:} Flips the order of the DNA letters (A, C, G, T) from left to right to right to left. \newline
  - \textbf{Complement:} Swaps each letter according to its pair: A pairs with T, and C pairs with G.}  \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\textbf{\textcolor{white}{Transcription \& Translation}}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{%
  seq\_example = Seq("ATGAAGTTTTAG") \newline
  transc = \seqsplit{seq\_example.transcribe()} \newline
  print("Transcription:", transc) \newline
  \newline
  transl = seq\_example.translate() \newline
  print("Translation:", transl) \newline
  \newline
  \newline
  \#output : \newline
  \textgreater{}\textgreater{}\textgreater{} Transcription: AUGAAGUUUUAG \newline
  \textgreater{}\textgreater{}\textgreater{} Translation: MKF*}  \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{%
  \textbf{Transcription and translation} are the two main steps that turn the instructions in our genes (DNA) into the building blocks of life (proteins). \newline
  \newline
  - \emph{Transcription} : is going from DNA to RNA ( creating a copy ) \newline
  - \emph{Translation} : is going from RNA to Protein}  \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\textbf{\textcolor{white}{Accessing NCBI Database using esearch()}}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{%
  from Bio import Entrez \newline
  \newline
  handle = \seqsplit{Entrez.esearch(db="nucleotide"}, term="BRCA1 gene", retmax=20) \newline
  record = Entrez.read(handle)}  \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{%
  - \texttt{db} : The name of the Entrez database to search ("nucleotide", "protein"....) \newline
  - \texttt{term} : The search term (e.g., gene name, protein ID ....) \newline
  - \texttt{retmode (str, optional)}: The format (return mode) to return results in (default: "xml"). \newline
  - \texttt{retmax (int, optional)}: Maximum number of IDs to return (default: 10). \newline
  - \texttt{sort (str, optional)}: Sorting criteria for results (default: "relevance")}  \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% Fix: Entrez.read() parses XML, so the example requests retmode="xml"
% explicitly instead of relying on the server-side default.
\begin{tabularx}{5.377cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{5.377cm}}{\textbf{\textcolor{white}{Accessing NCBI Database using efetch()}}}  \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{%
  from Bio import Entrez \newline
  \newline
  id\_list = {[}"NM\_007294.3", "NM\_000546.5"{]} \newline
  handle = \seqsplit{Entrez.efetch(db="nucleotide"}, id=id\_list, rettype="gb", retmode="xml") \newline
  records = Entrez.read(handle)}  \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{5.377cm}}{%
  \texttt{db} : The name of the Entrez database to search ("nucleotide", "protein"....) \newline
  \texttt{id} (list or str): A single ID or a list of IDs to retrieve \newline
  \texttt{rettype} (str, optional): The type of information to return (default: "gb" for GenBank format) \newline
  \texttt{retmode} (str, optional): The format to return results in (default: "xml").}  \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% That's all folks
\end{multicols*}

\end{document}