% Beautiful Soup cheat sheet (Cheatography export).
%
% Layout: two-column A4 page built from a sequence of single-column
% tabularx "cards" (a dark title row, a light body row, a coloured rule).
% The header and footer are drawn with fancyhdr and repeat on every page.
%
% Source hygiene: keep ONE statement per line. A `%` comments out the rest
% of its physical line, so joining lines silently disables everything
% that follows the first comment on that line.
\documentclass[10pt,a4paper]{article}

% Packages
\usepackage{fancyhdr} % For header and footer
\usepackage{multicol} % Allows multicols in tables
\usepackage{tabularx} % Intelligent column widths
\usepackage{tabulary} % Used in header and footer
\usepackage{hhline} % Border under tables
\usepackage{graphicx} % For images
\usepackage{xcolor} % For hex colours
%\usepackage[utf8x]{inputenc} % For unicode character support
\usepackage[T1]{fontenc} % Without this we get weird character replacements
\usepackage{colortbl} % For coloured tables
\usepackage{setspace} % For line height
\usepackage{lastpage} % Needed for total page number
\usepackage{seqsplit} % Splits long words.
%\usepackage{opensans} % Can't make this work so far. Shame. Would be lovely.
\usepackage[normalem]{ulem} % For underlining links
% Most of the following are not required for the majority
% of cheat sheets but are needed for some symbol support.
\usepackage{amsmath} % Symbols
\usepackage{MnSymbol} % Symbols
\usepackage{wasysym} % Symbols
%\usepackage[english,german,french,spanish,italian]{babel} % Languages

% Document Info
\author{wangmz}
\pdfinfo{
  /Title (beautiful-soup.pdf)
  /Creator (Cheatography)
  /Author (wangmz)
  /Subject (Beautiful soup Cheat Sheet)
}

% Lengths and widths
\addtolength{\textwidth}{6cm}
\addtolength{\textheight}{-1cm}
\addtolength{\hoffset}{-3cm}
\addtolength{\voffset}{-2cm}
\setlength{\tabcolsep}{0.2cm} % Space between columns
\setlength{\headsep}{-12pt} % Reduce space between header and content
\setlength{\headheight}{85pt} % If less, LaTeX automatically increases it
\renewcommand{\footrulewidth}{0pt} % Remove footer line
\renewcommand{\headrulewidth}{0pt} % Remove header line
\renewcommand{\seqinsert}{\ifmmode\allowbreak\else\-\fi} % Hyphens in seqsplit
% These two commands together give roughly
% the right line height in the tables
\renewcommand{\arraystretch}{1.3}
\onehalfspacing

% Commands
% \SetRowColor{<colour>}: colours the current table row and stores the
% colour name globally in \RowColorName so \mymulticolumn can reuse it.
\newcommand{\SetRowColor}[1]{\noalign{\gdef\RowColorName{#1}}\rowcolor{\RowColorName}} % Shortcut for row colour
% \mymulticolumn{<cols>}{<column spec>}{<content>}: a \multicolumn whose
% background is the colour last set with \SetRowColor.
\newcommand{\mymulticolumn}[3]{\multicolumn{#1}{>{\columncolor{\RowColorName}}#2}{#3}} % For coloured multi-cols
\newcolumntype{x}[1]{>{\raggedright}p{#1}} % New column types for ragged-right paragraph columns
\newcommand{\tn}{\tabularnewline} % Required as custom column type in use
% Location of the header logo. This is a path on the Cheatography server;
% the header only includes the image when the file actually exists, so the
% document still compiles elsewhere (without the logo).
\newcommand*{\CheatographyLogo}{/web/www.cheatography.com/public/images/cheatography_logo.pdf}

% Font and Colours
\definecolor{HeadBackground}{HTML}{333333}
\definecolor{FootBackground}{HTML}{666666}
\definecolor{TextColor}{HTML}{333333}
\definecolor{DarkBackground}{HTML}{3973FA}
\definecolor{LightBackground}{HTML}{F2F6FE}
\renewcommand{\familydefault}{\sfdefault}
\color{TextColor}

% Header and Footer
\pagestyle{fancy}
\fancyhead{} % Set header to blank
\fancyfoot{} % Set footer to blank
% Header: logo on the left, title and author/link line next to it.
\fancyhead[L]{
  \noindent
  \begin{multicols}{3}
  \begin{tabulary}{5.8cm}{C}
    \SetRowColor{DarkBackground}
    \vspace{-7pt}
    {\parbox{\dimexpr\textwidth-2\fboxsep\relax}{\noindent
      \hspace*{-6pt}\IfFileExists{\CheatographyLogo}{\includegraphics[width=5.8cm]{\CheatographyLogo}}{}}
    }
  \end{tabulary}
  \columnbreak
  \begin{tabulary}{11cm}{L}
    \vspace{-2pt}\large\textbf{\textcolor{DarkBackground}{\textrm{Beautiful soup Cheat Sheet}}} \\
    \normalsize by \textcolor{DarkBackground}{wangmz} via \textcolor{DarkBackground}{\uline{cheatography.com/61051/cs/15823/}}
  \end{tabulary}
  \end{multicols}}
% Footer: three cards (author, publication status and page count, sponsor).
\fancyfoot[L]{ \footnotesize
  \noindent
  \begin{multicols}{3}
  \begin{tabulary}{5.8cm}{LL}
    \SetRowColor{FootBackground}
    \mymulticolumn{2}{p{5.377cm}}{\bfseries\textcolor{white}{Cheatographer}} \\
    \vspace{-2pt}wangmz \\
    \uline{cheatography.com/wangmz} \\
  \end{tabulary}
  \vfill
  \columnbreak
  \begin{tabulary}{5.8cm}{L}
    \SetRowColor{FootBackground}
    \mymulticolumn{1}{p{5.377cm}}{\bfseries\textcolor{white}{Cheat Sheet}} \\
    \vspace{-2pt}Not Yet Published.\\
    Updated 17th May, 2018.\\
    Page {\thepage} of \pageref{LastPage}.
  \end{tabulary}
  \vfill
  \columnbreak
  \begin{tabulary}{5.8cm}{L}
    \SetRowColor{FootBackground}
    \mymulticolumn{1}{p{5.377cm}}{\bfseries\textcolor{white}{Sponsor}} \\
    \SetRowColor{white}
    \vspace{-5pt}
    %\includegraphics[width=48px,height=48px]{dave.jpeg}
    Measure your website readability!\\
    www.readability-score.com
  \end{tabulary}
  \end{multicols}}

\begin{document}
\raggedright
\raggedcolumns

% Set font size to small. Switch to any value
% from this page to resize cheat sheet text:
% www.emerson.emory.edu/services/latex/latex_169.html
\footnotesize % Small font.

\begin{multicols*}{2}

% Inside the card bodies below, every source line ends in \newline (or %)
% and there are no blank lines: a blank line would start a new paragraph
% inside the cell.

% ---- Card: Basic ----
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Basic}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{%
  \# \seqsplit{https://www.crummy.com/software/BeautifulSoup/bs4/doc/} \newline
  from bs4 import BeautifulSoup \newline
  soup = BeautifulSoup(html\_doc, 'html.parser') \newline
  \newline
  \newline
  soup.title \# \textless{}title\textgreater{}The Dormouse's story\textless{}/title\textgreater{} \newline
  soup.title.name \# u'title' \newline
  soup.title.string \# u'The Dormouse's story' \newline
  soup.title.parent.name \# u'head' \newline
  \newline
  \#various finder \newline
  css\_soup.select("p.strikeout.body") \# css finder \newline
  soup.p \# \textless{}p class="title"\textgreater{}\textless{}b\textgreater{}The Dormouse's story\textless{}/b\textgreater{}\textless{}/p\textgreater{} \newline
  soup.p{[}'class'{]} \# u'title' \newline
  soup.a \# \textless{}a class="sister" \seqsplit{href="http://example.com/elsie"} id="link1"\textgreater{}Elsie\textless{}/a\textgreater{} \newline
  soup.find\_all('a') \# {[}\textless{}a ..\textgreater{}, ..{]} \newline
  soup.find(id="link3") \# \textless{}a class="sister" \seqsplit{href="http://example.com/tillie"} id="link3"\textgreater{}Tillie\textless{}/a\textgreater{} \newline
  for link in soup.find\_all('a'): \newline
  print(link.get('href')) \# http://example.com/elsie, \# http://example.com/lacie%
} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Card: Search ----
% NOTE(review): "search.py" is fused onto the URL in the first line of this
% card; it looks like a lost line break in the exported text -- confirm
% against the original cheat sheet before splitting it.
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Search}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{%
  \seqsplit{search.pyhttps://www.crummy.com/software/BeautifulSoup/bs4/doc/} \newline
  \#-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}- \newline
  \# css selector \newline
  \#-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}- \newline
  css\_soup.select("p.strikeout.body") \newline
  soup.select("p:nth-of-type(3)") \# 3rd child \newline
  soup.select("head \textgreater{} title") \newline
  soup.select("p \textgreater{} a:nth-of-type(2)") \newline
  soup.select("p \textgreater{} \#link1") \# direct child \newline
  soup.select("\#link1 \textasciitilde{} .sister") \# sibling \newline
  soup.select('a{[}href{]}') \# existence of an attribute \newline
  soup.select\_one(".sister") \newline
  \newline
  \# attribute value \newline
  soup.select('a{[}href="http://example.com/elsie"{]}') \# exact attribute \newline
  soup.select('a{[}href\textasciicircum{}="http://example.com/"{]}') \# start match \newline
  soup.select('a{[}href\$="tillie"{]}') \# end match \newline
  soup.select('a{[}href*=".com/el"{]}') \# middle match \newline
  \newline
  \newline
  \#-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}- \newline
  \# basic \newline
  \#-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}- \newline
  soup.find\_all('b') \# match by tag \newline
  soup.find\_all(re.compile("\textasciicircum{}b")) \# match by tag using regex \newline
  soup.find\_all({[}"a", "b"{]}) \# match by tag in list \newline
  \newline
  \# function (complex condition) \newline
  def \seqsplit{has\_class\_but\_no\_id(tag):} \newline
  return tag.has\_attr('class') and not tag.has\_attr('id') \newline
  soup.find\_all(has\_class\_but\_no\_id) \newline
  \newline
  \newline
  \#-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}- \newline
  \# find\_all\_api \newline
  \#-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}- \newline
  find\_all(name, attrs, recursive, string, limit, **kwargs) \newline
  \newline
  soup.find\_all("title") \# tag condition \newline
  soup.find\_all("p", "title") \# tag and attr \newline
  \# {[}\textless{}p class="title"\textgreater{}\textless{}b\textgreater{}The Dormouse's story\textless{}/b\textgreater{}\textless{}/p\textgreater{}{]} \newline
  soup.find\_all("a") \newline
  \newline
  \# keyword arguments \newline
  soup.find\_all(id="link2") \newline
  soup.find\_all(href=re.compile("elsie"), id='link1') \newline
  soup.find(string=re.compile("sisters")) \# text contain sisters \newline
  \newline
  \# css class (class is reserved keyword) \newline
  soup.find\_all("a", class\_="sister")%
} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Card: Make soup ----
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Make soup}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{%
  soup = \seqsplit{BeautifulSoup(open("index}.html")) \newline
  soup = BeautifulSoup("\textless{}html\textgreater{}data\textless{}/html\textgreater{}")%
} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Card: Output ----
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Output}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{%
  \# HTML \newline
  soup.prettify() \#pretty print \newline
  str(soup) \# non-pretty print \newline
  \newline
  \# String \newline
  soup.get\_text() \#all text under the element%
} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Card: Encoding ----
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Encoding}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{%
  \#output \newline
  soup.prettify("latin-1") \newline
  tag.encode("utf-8") \newline
  tag.encode("latin-1") \newline
  tag.encode("ascii")%
} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Card: Navigation ----
% The third entry of the find_all('a') listing was cut off after "http:" in
% the exported text; it is completed from the link3 entry shown in the
% Basic card.
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Navigation}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{%
  \#-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}- \newline
  \# going up/down/side \newline
  \#-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}- \newline
  \# -{}-{}-{}-{}- going down -{}-{}-{}-{}- \newline
  soup.head \# \textless{}head\textgreater{}\textless{}title\textgreater{}The Dormouse's story\textless{}/title\textgreater{}\textless{}/head\textgreater{} \newline
  soup.title \# \textless{}title\textgreater{}The Dormouse's story\textless{}/title\textgreater{} \newline
  soup.body.b \# \textless{}b\textgreater{}The Dormouse's story\textless{}/b\textgreater{} \newline
  soup.a \# \textless{}a class="sister" \seqsplit{href="http://example.com/elsie"} id="link1"\textgreater{}Elsie\textless{}/a\textgreater{} \newline
  soup.find\_all('a') \newline
  \# {[}\textless{}a class="sister" \seqsplit{href="http://example.com/elsie"} id="link1"\textgreater{}Elsie\textless{}/a\textgreater{}, \newline
  \# \textless{}a class="sister" \seqsplit{href="http://example.com/lacie"} id="link2"\textgreater{}Lacie\textless{}/a\textgreater{}, \newline
  \# \textless{}a class="sister" \seqsplit{href="http://example.com/tillie"} id="link3"\textgreater{}Tillie\textless{}/a\textgreater{}{]} \newline
  \newline
  \# children = contents \newline
  head\_tag.contents \# {[}\textless{}title\textgreater{}The Dormouse's story\textless{}/title\textgreater{}{]} \newline
  head\_tag.children \# {[}\textless{}title\textgreater{}The Dormouse's story\textless{}/title\textgreater{}{]} \newline
  \newline
  \# descendants (all of a tag's children, recursively) \newline
  for child in head\_tag.descendants: \newline
  print(child) \newline
  \newline
  \# .string is tricky \newline
  head\_tag.contents \# {[}\textless{}title\textgreater{}The Dormouse's story\textless{}/title\textgreater{}{]} \newline
  head\_tag.string \# u'The Dormouse's story' (because head tag has only one child) \newline
  print(soup.html.string) \# None (because html has many children) \newline
  \newline
  \# whitespace removed strings \newline
  for string in soup.stripped\_strings: \newline
  print(repr(string)) \newline
  \newline
  \newline
  \# -{}-{}-{}-{}- going up -{}-{}-{}-{}- \newline
  title\_tag.parent \# \textless{}head\textgreater{}\textless{}title\textgreater{}The Dormouse's story\textless{}/title\textgreater{}\textless{}/head\textgreater{} \newline
  \# going up recursively \newline
  link.parents \# {[} p, body, html, {[}document{]}, None{]} \newline
  \newline
  \newline
  \# -{}-{}-{}-{}- sideway -{}-{}-{}-{}- \newline
  \# sibling = include text node \newline
  sibling\_soup.b.next\_sibling \newline
  sibling\_soup.c.previous\_sibling \newline
  \newline
  \# multiple \newline
  sibling\_soup.b.next\_siblings \newline
  sibling\_soup.c.previous\_siblings \newline
  \newline
  \# element = not include text node \newline
  sibling\_soup.b.next\_element \newline
  sibling\_soup.c.previous\_element \newline
  sibling\_soup.b.next\_elements \newline
  sibling\_soup.c.previous\_elements%
} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% ---- Card: Navigation (second copy) ----
% NOTE(review): this card repeats the previous Navigation card verbatim.
% It is kept so the rendered sheet stays the same; remove one copy if the
% duplication is unintended.
\begin{tabularx}{8.4cm}{X}
\SetRowColor{DarkBackground}
\mymulticolumn{1}{x{8.4cm}}{\bfseries\textcolor{white}{Navigation}} \tn
\SetRowColor{LightBackground}
\mymulticolumn{1}{x{8.4cm}}{%
  \#-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}- \newline
  \# going up/down/side \newline
  \#-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}-{}- \newline
  \# -{}-{}-{}-{}- going down -{}-{}-{}-{}- \newline
  soup.head \# \textless{}head\textgreater{}\textless{}title\textgreater{}The Dormouse's story\textless{}/title\textgreater{}\textless{}/head\textgreater{} \newline
  soup.title \# \textless{}title\textgreater{}The Dormouse's story\textless{}/title\textgreater{} \newline
  soup.body.b \# \textless{}b\textgreater{}The Dormouse's story\textless{}/b\textgreater{} \newline
  soup.a \# \textless{}a class="sister" \seqsplit{href="http://example.com/elsie"} id="link1"\textgreater{}Elsie\textless{}/a\textgreater{} \newline
  soup.find\_all('a') \newline
  \# {[}\textless{}a class="sister" \seqsplit{href="http://example.com/elsie"} id="link1"\textgreater{}Elsie\textless{}/a\textgreater{}, \newline
  \# \textless{}a class="sister" \seqsplit{href="http://example.com/lacie"} id="link2"\textgreater{}Lacie\textless{}/a\textgreater{}, \newline
  \# \textless{}a class="sister" \seqsplit{href="http://example.com/tillie"} id="link3"\textgreater{}Tillie\textless{}/a\textgreater{}{]} \newline
  \newline
  \# children = contents \newline
  head\_tag.contents \# {[}\textless{}title\textgreater{}The Dormouse's story\textless{}/title\textgreater{}{]} \newline
  head\_tag.children \# {[}\textless{}title\textgreater{}The Dormouse's story\textless{}/title\textgreater{}{]} \newline
  \newline
  \# descendants (all of a tag's children, recursively) \newline
  for child in head\_tag.descendants: \newline
  print(child) \newline
  \newline
  \# .string is tricky \newline
  head\_tag.contents \# {[}\textless{}title\textgreater{}The Dormouse's story\textless{}/title\textgreater{}{]} \newline
  head\_tag.string \# u'The Dormouse's story' (because head tag has only one child) \newline
  print(soup.html.string) \# None (because html has many children) \newline
  \newline
  \# whitespace removed strings \newline
  for string in soup.stripped\_strings: \newline
  print(repr(string)) \newline
  \newline
  \newline
  \# -{}-{}-{}-{}- going up -{}-{}-{}-{}- \newline
  title\_tag.parent \# \textless{}head\textgreater{}\textless{}title\textgreater{}The Dormouse's story\textless{}/title\textgreater{}\textless{}/head\textgreater{} \newline
  \# going up recursively \newline
  link.parents \# {[} p, body, html, {[}document{]}, None{]} \newline
  \newline
  \newline
  \# -{}-{}-{}-{}- sideway -{}-{}-{}-{}- \newline
  \# sibling = include text node \newline
  sibling\_soup.b.next\_sibling \newline
  sibling\_soup.c.previous\_sibling \newline
  \newline
  \# multiple \newline
  sibling\_soup.b.next\_siblings \newline
  sibling\_soup.c.previous\_siblings \newline
  \newline
  \# element = not include text node \newline
  sibling\_soup.b.next\_element \newline
  sibling\_soup.c.previous\_element \newline
  sibling\_soup.b.next\_elements \newline
  sibling\_soup.c.previous\_elements%
} \tn
\hhline{>{\arrayrulecolor{DarkBackground}}-}
\end{tabularx}
\par\addvspace{1.3em}

% That's all folks
\end{multicols*}
\end{document}