
Python (Data Analysis) Cheat Sheet (DRAFT)


This is a draft cheat sheet. It is a work in progress and is not finished yet.

Pandas Data Frame

df_1 = pd.DataFrame({'A': [0, 1, 2], 
                     'B': [3, 4, 5]})
df_2 = pd.DataFrame([[0, 1, 2], [3, 4, 5]], 
                     columns=['A', 'B', 'C'])
-------------------------------------------------
df[:5] # first 5 rows
df.head()
df[df.key == 10] # rows where key == 10
df[df.key == 10][:5] # first 5 rows where key == 10
-------------------------------------------------
df_3.iloc[0]          # row by position
df_3.loc['row-A']     # row by label
df_3['column-A']      # column by name
df_3.iloc[1, 3]       # single element (row 1, column 3)
df_3.iloc[1:4]        # rows 1 to 3 by position
df_3[['column-A', 'column-D']]  # multiple columns
df.values.sum()       # sum of all elements
-------------------------------------------------
df_1 = pd.DataFrame({'A': [0, 1, 2], 
                     'B': [3, 4, 5]})

df_1.shift(axis=0)
     A    B
0  NaN  NaN
1  0.0  3.0
2  1.0  4.0

df_1.shift(axis=1)
    A    B
0 NaN  0.0
1 NaN  1.0
2 NaN  2.0

# To replace NaN with a value (e.g. 0)
df_1.shift(axis=0).fillna(0) 
-------------------------------------------------
# difference from the previous row / column
df_1.diff(axis=0)  # row-wise differences
df_1.diff(axis=1)  # column-wise differences
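
For the df_1 above, df_1.diff(axis=0) should give:

     A    B
0  NaN  NaN
1  1.0  1.0
2  1.0  1.0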
-------------------------------------------------
df = pd.DataFrame({
    'a': [1, 2, 3],
    'b': [10, 20, 30],
    'c': [5, 10, 15]
})
def add_one(x):
    return x + 1
df.applymap(add_one)  # renamed to df.map() in pandas >= 2.1

   a   b   c
0  2  11   6
1  3  21  11
2  4  31  16

# applymap works element by element;
# apply works column by column (or row by row)
def standardize_column(column):
    return (column - column.mean())/column.std()	
df.apply(standardize_column)
-------------------------------------------------
# sort_values() with no arguments works on a single Series
# (one column or row); to sort every column of a DataFrame,
# use apply()
df.iloc[:, 0].sort_values()
-------------------------------------------------
# column-wise vs row-wise operations, respectively
df - df.mean(axis=0)             # subtract column means
df.sub(df.mean(axis=1), axis=0)  # subtract row means
-------------------------------------------------
 # group by 
df.groupby('column').groups
df.groupby(['column1', 'column2']).groups
df.groupby('column').sum() 
df.groupby('column').get_group('value1')
list(df.groupby('column'))
df.groupby('c4')[['c1','c2']].apply(func)

http://wesmckinney.com/blog/groupby-fu-improvements-in-grouping-and-aggregating-data-in-pandas/
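
A tiny worked example (column names and data are illustrative):

df = pd.DataFrame({'c4': ['x', 'x', 'y'], 'c1': [1, 2, 3]})
df.groupby('c4').sum()
#     c1
# c4
# x    3
# y    3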
-------------------------------------------------
 # merge (join) 
dfLeft.merge(dfRight,
    left_on=['A','B','C'],
    right_on=['A','BB','CC'],
    how='inner')

Numpy Functions

a.max()
Max value
a.argmax()
Index of max value
a.mean(axis=0)
Mean of each column (2D arrays); pandas alias: axis='index'
a.mean(axis=1)
Mean of each row (2D arrays); pandas alias: axis='columns'

Important - Numpy Arrays

import numpy as np

a = np.array([1, 2, 3, 4, 5])
slice = a[:3]   # a slice is a view, not a copy
slice[0] = 100  # modifies the original array
print(a)

 [100   2   3   4   5]
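
To get an independent copy instead of a view, a minimal sketch:

b = a[:3].copy()  # independent copy; writes to b do not touch a
b[0] = -1
print(a)  # unchanged by writes to b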

RegEx

import re

pattern = r"[0-9]{2}"  # two digits
prog = re.compile(pattern)
match = prog.match(date_str)  # anchored at the start;
                              # use prog.search() to scan anywhere

if not match:
    print(date_str)

PyMongo

from pymongo import MongoClient
client = MongoClient("mongodb://localhost:27017")
db = client.examples

#what do you want to find
query = { "abc" : "xyz", "ghi": "mop" }
query = { "tuv" : { "$gt" : 1, "$lte" : 100 } }
query = { "abc" : { "$ne" : "xyz" } }
#what information you want to see, removing id
projection = { "_id": 0, "name": 1 }
autos = db.autos.find(query, projection)
#find
db.autos.find().count()  # count all; PyMongo >= 4: db.autos.count_documents({})
#insert
a = {}  # dict
db.autos.insert(a)  # PyMongo >= 4: db.autos.insert_one(a)
#check if doc with field exists & count
db.cities.find({"abc":{"$exists":1}}).count()
#pretty print
db.cities.find().pretty()
#regex
db.cities.find({"abc":{"$regex":"[0-9]"}})
#in - return docs with at least one of the values
db.cities.find({"abc":{"$in":[1,2,3]}})
#all - return docs with all of the values
db.cities.find({"abc":{"$all":[1,2,3]}})
# nested documents: query with dot notation
"n1" : { "n2" : { "n3" : [4] } }
db.city.find( {"n1.n2.n3" : 4} )
#update V1 (fetch, modify, write back)
doc = db.data.find_one({"a":"b"})
doc["new"] = "value"
db.data.save(doc)  # PyMongo >= 4: db.data.replace_one({"_id": doc["_id"]}, doc)
#update V2 (adding a field)
db.data.update({"a":"b"}, {"$set": {"c":"d"}})  # PyMongo >= 4: update_one
#update V3 (removing a field) across multiple docs
db.data.update({"a":"b"},
    {"$unset": {"c":""}}, multi=True)  # PyMongo >= 4: update_many
#remove
db.data.remove({"a":"b"})  # PyMongo >= 4: delete_many
<command line>
mongoimport -d examples -c myautos2 --file autos.json
 

Load Data

# Read from csv file
import csv
def parse_file(datafile):
    data = []
    with open(datafile, 'r', newline='') as f:
        reader = csv.reader(f)
        for row in reader:
            data.append(row)  # list of lists
    return data

with open(input_file, "r") as f:
    reader = csv.DictReader(f)
    header = reader.fieldnames
    rows = list(reader)  # list of dicts, + header

import unicodecsv
with open('file.csv', 'rb') as f:  # unicodecsv expects binary mode
    reader = unicodecsv.DictReader(f)
    file_dic = list(reader)  # list of dicts

import pandas as pd
file_df = pd.read_csv('file.csv', index_col=False, header=0)  # DataFrame

Data Types

Dictionary
dictionary = {}
key = 'abc'
value = '123'
dictionary[key] = value
for key in dictionary:
    print(dictionary[key])
 123

for key, value in dictionary.items():
    print(key)
    print(value)
 abc
 123

Set
uniq_dataset = set()
uniq_dataset.add('A')
uniq_dataset.add('A')
uniq_dataset.add('B')
print(uniq_dataset)
 {'A', 'B'}

Pandas Series

import pandas as pd
-------------------------------------------------
s1 = pd.Series([1, 2, 3, 4], 
               index=['a', 'b', 'c', 'd'])
s2 = pd.Series([10, 20, 30, 40],
               index=['b', 'd', 'a', 'c'])
print(s1 + s2)

a    31
b    12
c    43
d    24
dtype: int64
-------------------------------------------------
Note: the result is NaN for any index that is not present in both Series
res = s1 + s2
res.dropna()  # drop the NaN entries
# or treat missing entries as 0 instead:
s1.add(s2, fill_value=0)
-------------------------------------------------
s = pd.Series([1, 2, 3, 4, 5])
def add_three(x):
    return x + 3
print(s.apply(add_three))

0    4
1    5
2    6
3    7
4    8
dtype: int64

Indexes and Slices

Indexes and slices of a = [0,1,2,3,4,5]:

len(a)
6
a[0]
0
a[5]
5
a[-1]
5
a[-2]
4
a[1:]
[1,2,3,4,5]
a[:5]
[0,1,2,3,4]
a[:-2]
[0,1,2,3]
a[1:3]
[1,2]
a[1:-1]
[1,2,3,4]
b = a[:]
Shallow copy of a

XML

import xml.etree.ElementTree as ET
tree = ET.parse('country_data.xml')
root = tree.getroot() # First tag

for child in root:
    print(child.tag, child.attrib)

print(root[0][1].text)

for country in root.findall('country'):
    rank = country.find('rank').text
    name = country.get('name')
    print(name, rank)

<au id="A1" ca="yes">
    <snm>Mei-Dan</snm>
    <fnm>Omer</fnm>
    <insr iid="I1"/>
    <insr iid="I2"/>
    <email>omer@extremegate.com</email>
</au>	

for author in root.findall('./fm/bibl/aug/au'):
    insr = []
    for i in author.findall('./insr'):
        insr.append(i.get('iid'))
        # equivalent: insr.append(i.attrib["iid"])
			
    data = {
        "fnm": author.find('./fnm').text,
        "snm": author.find('./snm').text,
        "email": author.find('./email').text,
        "insr": insr
    }

HTML Requests

from bs4 import BeautifulSoup
import requests

html_page = "page_source.html"

with open(html_page) as fp:
    soup = BeautifulSoup(fp, "lxml")

r = requests.post("http://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
    data={'AirportList': "BOS",
          'CarrierList': "VX",
          'Submit': 'Submit',
          "__EVENTTARGET": "",
          "__EVENTARGUMENT": "",
          # find() returns a Tag; the hidden field's value is its "value" attribute
          "__EVENTVALIDATION": soup.find(id="__EVENTVALIDATION")["value"],
          "__VIEWSTATE": soup.find(id="__VIEWSTATE")["value"]
          })

print(r.text)
 

Write to CSV

import csv

with open("output.csv", "wb") as f:
    writer = csv.writer(f)
    writer.writerows(your_list_of_lists)

Dates

from datetime import datetime as dt

date_str = '2017-04-19'
date_dt = dt.strptime(date_str, '%Y-%m-%d')

print(date_dt)
 2017-04-19 00:00:00

dt.strptime(date_str, '%Y-%m-%dT%H:%M:%S%z')

dt(2000, 1, 1)  # construct a datetime directly
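
Going the other way (datetime to string) uses strftime; the format string here is illustrative:

print(date_dt.strftime('%d/%m/%Y'))
 19/04/2017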

If

if x not in dictionary and y != z:
   print()

if not x or y > 7:
   print()

if x <= y <= z:
   print()

if "a" in "abc":
   print()

Loops

for i in range(len(list)):

for e in list:

# Iterate through two lists in parallel
# zip stops when the shorter of foo or bar stops
for a, b in zip(foo, bar):
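
To exhaust the longer list instead, itertools.zip_longest pads the shorter one (a minimal sketch):

from itertools import zip_longest
for a, b in zip_longest(foo, bar, fillvalue=None):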

Lists - General

list = []
new list
list.append('shemp')
append at end
list.insert(0, 'xxx')
insert at index 0
print(list.index('curly'))
2
list.extend(['yyy', 'zzz'])
list of elems at end
list.remove('curly')
search and remove
list.pop(1)
remove and return

List of Dictionaries

people = [
    {'name': "Tom", 'age': 10},
    {'name': "Mark", 'age': 5},
    {'name': "Pam", 'age': 7}
]

list(filter(lambda person: person['name'] == 'Pam', people))
 [{'age': 7, 'name': 'Pam'}]

people[1]['name']
 Mark

Dictionary of Lists

from collections import defaultdict

s = [('yellow', 1), ('blue', 2), ('yellow', 3), ('blue', 4), ('red', 1)]
d = defaultdict(list)
for k, v in s:
    d[k].append(v)

sorted(d.items())
 [('blue', [2, 4]), ('red', [1]), ('yellow', [1, 3])]
The idea is to group the values by key.

Matplotlib

data = [1, 2, 1, 3, 3, 1, 4, 2]

import matplotlib.pyplot as plt
plt.hist(data)

To show it:
%matplotlib inline (for notebooks)
or
plt.show()
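
Common finishing touches (a minimal sketch; label text is illustrative):

plt.hist(data, bins=4)
plt.title('Distribution of data')
plt.xlabel('value')
plt.ylabel('count')
plt.show()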

Statistics

#Correlation
std_x = (x - x.mean()) / x.std(ddof=0)
std_y = (y - y.mean()) / y.std(ddof=0)
correlation = (std_x * std_y).mean()
# By default, pandas' std() uses Bessel's correction;
# std(ddof=0) turns it off so the result matches NumPy's default.
# NumPy's corrcoef() computes Pearson's r (the correlation
# coefficient) directly.
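
For comparison, the same Pearson's r via NumPy (assuming x and y are equal-length arrays):

import numpy as np
r = np.corrcoef(x, y)[0, 1]  # 2x2 correlation matrix; off-diagonal entry is Pearson's r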

Excel Files

import xlrd
datafile = "file.xls"

workbook = xlrd.open_workbook(datafile)
sheet = workbook.sheet_by_index(0)

sheet_data = [[sheet.cell_value(r, col) for col in range(sheet.ncols)] for r in range(sheet.nrows)]

# (row, column)
sheet.cell_type(3, 2)
sheet.cell_value(3, 2)
# (column, start_row, end_row)
sheet.col_values(3, start_rowx=1, end_rowx=4)

sheet.ncols

# second arg is the date mode (use workbook.datemode)
xlrd.xldate_as_tuple(cell_in_exceltime, 0)
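
The returned (year, month, day, hour, minute, second) tuple converts straight into a datetime; a minimal sketch (the cell position is illustrative):

from datetime import datetime
exceltime = sheet.cell_value(1, 3)
date = datetime(*xlrd.xldate_as_tuple(exceltime, workbook.datemode))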

Pretty Print
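
A minimal sketch using the standard-library pprint module (the sample dict is illustrative):

from pprint import pprint
data = {'a': [1, 2, 3], 'b': {'c': 4, 'd': 5}}
pprint(data, width=30)  # indents nested structures for readability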

Map, Filter and Reduce

#Map
map(function_to_apply, list_of_inputs)
list(map(float, lst))  # convert every element to float
                       # (map returns an iterator in Python 3)
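
Minimal sketches for the other two (reduce moved to functools in Python 3):

#Filter
list(filter(lambda x: x > 0, [-1, 0, 2, 5]))  # [2, 5]
#Reduce
from functools import reduce
reduce(lambda acc, x: acc + x, [1, 2, 3, 4])  # 10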

JSON requests (WS)

import json
import requests

BASE_URL = "http://musicbrainz.org/ws/2/"
ARTIST_URL = BASE_URL + "artist/"

params["fmt"] = "json"
params["query"] = "artist:Avril"

r = requests.get(url + uid, params)
print r.json()

Pkg

!pip install <link>