Show Menu
Cheatography

Data Wrangling (Python) Cheat Sheet (DRAFT) by

This is a draft cheat sheet. It is a work in progress and is not finished yet.

Read / Write .csv

# csv
df = pd.read_csv('file.csv', nrows = 5) 
pd.to_csv('file.csv')
# excel 
df = pd.read_excel('file.xlsx')
df = pd.read_excel(pd.ExcelFile('file.xlsx'), 'Sheet1')
pd.to_excel('file.xslx', sheet_name='Sheet1')

Meta Data

df.info()
df.col­umn­s.v­alues
 df.shape() 
df.ind­ex.v­alues
 len(df) 
len(df.co­lumns) 

Arrange

df.rename(columns = {'col1': 'rename1', 
                     'col2': 'rename2'})
df[['col1', 'col2', ... ]] # order cols
df.sort_values([col1', 'col2'], 
               ascending = [True, False])

Filter

df[(df.col1 > 1000) | (df.col2 != "A")]
df[collist] # collist = ['col1', ...]
df.iloc[0:5, :] / df.head(5) # by position
df.loc[(df.col1 > 5) & (df.col2 == "A"), 
       ['col1', 'col2']] # by label
df.drop(['col1', ...], axis = 1)
df.drop_duplicates()
df.sample(frac=0.5 / n=10)
df.filter(regex = 'regex')

Useful Functions

.count() # non-NA
.min()
/
.max()
.sum()
.descr­ibe()
.cumsum()
.mean()
/
.median()
.quant­ile­([0.25, 0.75])
.var()
/
.std()
.apply­(fu­nction)
df['co­l'].nu­nique()
df['co­l'].va­lue­_co­unts()
np.nan
pd.isn­ull­(obj)
pd.not­nul­l(obj)
pd.to_­dat­eti­me(var) # others similar
.abs()
.clip(­low­er=-10, upper=10)
df.col­nam­e.i­sin­(list)
.corr()
 

Write Functions

def function_name(var1, var2, ...):
        for i in 1:10
                if line1:
                       do this
                elif line2:
                       do that
                else:
                       do none
        return(result)

Applying functions

f = lambda x: x*2
df.apply(f)
df.applymap(f) # element wise

Summarise

df.groupby(by='col').function() # function = sum() ...
df.groupby(by='col').agg(function) # function = sum() ...
df.groupby(by='col').size()

Join

pd.merge(df1, df2, how = 'left/right/inner/outer', on='col')
df[~df['col'].isin(df2['col'])] # anti-join

Method Chaining

df = (pd.melt(df)
         .rename(columns={
                       'variable' : 'var',
                       'value' : 'val'})
         .query('val >= 200')
        )

New variable / column

df['Volume'] = df.Length  df.Height  df.Depth
pd.qcut(df.col, n, labels=False) # binning

Reshape Data

pd.melt(df) # cols into rows
pd.pivot(columns='var', values='val') # rows into cols
pd.concat([df1, df2]) # stack two dfs