Pandas
import pandas as pd |
df.iloc[:5,:] |
return slice of data: first 5 rows, all columns |
type(df) |
DataFrame |
df.shape |
(# of rows, # of columns) |
df.columns |
name of cols |
df.index |
return index column |
df.head(3) |
return first 3 rows |
df.iloc[-5:,:] |
return last 5 rows |
df.tail() |
return last 5 rows |
df.info() |
return index, column dtypes, # of rows, # of non-null values per column |
type(df['low']) |
Series |
type(df['low'].values) |
numpy.ndarray |
np.log10(df['low']) |
return a Series (log10 of each value) |
np.log10(df['low'].values) |
return a numpy array |
Each column in pandas is a Series.
You can apply NumPy functions to a whole DataFrame or to a single column.
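
A minimal sketch of the inspection calls above, using a small hand-built DataFrame in place of a real CSV (names and values are made up):

import numpy as np
import pandas as pd

# hypothetical stock data standing in for a real file
df = pd.DataFrame({'low': [10.1, 9.8, 10.4], 'high': [11.0, 10.9, 11.2]})

print(type(df))                    # <class 'pandas.core.frame.DataFrame'>
print(df.shape)                    # (3, 2) -> (# of rows, # of columns)
print(df.columns)                  # Index(['low', 'high'], dtype='object')
df.info()                          # index, dtypes, non-null counts per column
print(type(df['low']))             # a single column is a Series
print(np.log10(df['low'].values))  # NumPy works on the underlying ndarray
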
Statistical Data Analysis
df.describe() |
count, mean, std, min, max, quartiles for each numeric column (non-null rows only) |
df['low'].count() |
return # of non-null rows |
df[cols].count() |
return a series |
df['low'].mean() |
return mean ignoring nulls |
df.std() |
df.median() |
df.quantile(q) |
q=.5: median; q=[.25, .75]: interquartile range |
df['low'].min() |
alphabetic order for non-numerics |
df['low'].max() |
alphabetic order for non-numerics |
df.mean(axis='columns') |
mean of all columns for each row |
df.low |
df['low'] |
both select a single column |
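
A minimal sketch of these summary statistics on a small hand-built DataFrame (values are made up):

import pandas as pd

df = pd.DataFrame({'low': [10.1, None, 9.8, 10.4],
                   'high': [11.0, 10.9, 11.2, 11.5]})

print(df.describe())              # count, mean, std, min, quartiles, max per numeric column
print(df['low'].count())          # 3 -> NaN rows are excluded
print(df['low'].mean())           # mean, ignoring NaN
print(df.quantile([0.25, 0.75]))  # interquartile range for each column
print(df.mean(axis='columns'))    # row-wise mean across columns
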
Time series
index_col='Date', parse_dates=True |
read_csv options: use the Date column as a DatetimeIndex |
df.loc['2015-2'] |
return all rows in Feb 2015 |
df.loc['2015-2-20'] |
return all rows with this date |
df.loc['2015-2-20': '2015-3'] |
range |
newD = pd.to_datetime(date_list) |
convert a list/Series of date strings to datetimes (y-m-d h:m:s) |
df.reindex(newD) |
reindex on matching dates; dates missing from df become new rows with NaN |
df.reindex(newD,method='ffill') |
forward fill: empty values take the previous row's value |
method='bfill' |
backward fill: value of later rows |
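
A reindexing sketch on a made-up sparse daily Series:

import pandas as pd

ts = pd.Series([1.0, 3.0], index=pd.to_datetime(['2015-02-01', '2015-02-04']))
daily = pd.date_range('2015-02-01', '2015-02-05', freq='D')

print(ts.reindex(daily))                  # unmatched dates become NaN
print(ts.reindex(daily, method='ffill'))  # forward fill from the previous value
print(ts.reindex(daily, method='bfill'))  # backward fill from the next value
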
df.resample('D').mean() |
daily mean |
'H', 'min', '2W' |
hour, minute, 2 weeks |
'Y', 'Q', 'M', 'B' |
year, quarter, month, business day |
df.resample('W').sum().max() |
max of weekly sum |
df.resample('4h').ffill() |
every 4 hours; fill NaN with previous values |
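
A resampling sketch on a made-up hourly Series:

import numpy as np
import pandas as pd

idx = pd.date_range('2015-02-01', periods=96, freq='h')   # 4 days of hourly data
ts = pd.Series(np.arange(96, dtype=float), index=idx)

print(ts.resample('D').mean())           # daily mean
print(ts.resample('W').sum().max())      # max of the weekly sums
print(ts.resample('4h').ffill().head())  # every 4 hours, forward filled
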
df1+df2 |
element-wise addition aligned on index/columns; labels present in only one frame become NaN |
df['Temperature']['2010-august'] |
select Temperature column for August 2010 |
df['Temperature']['2010-2'] |
select Temperature column for February 2010 |
unsmooth.rolling(window=24).mean() |
moving average 24h |
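
A rolling-window sketch on made-up hourly data:

import numpy as np
import pandas as pd

idx = pd.date_range('2015-02-01', periods=72, freq='h')
unsmooth = pd.Series(np.random.randn(72).cumsum(), index=idx)

smoothed = unsmooth.rolling(window=24).mean()  # 24-hour moving average
print(smoothed.tail())                         # first 23 values are NaN (window not yet full)
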
df['type'].str.upper() |
return a column converted to uppercase |
df['product'].str.contains('ware') |
return a boolean Series: True where the substring 'ware' exists |
True+True |
2 |
False + False |
0 |
df['product'].str.contains('ware').sum() |
# of rows contains substring 'ware' |
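
A sketch of the string methods above on a hypothetical 'product' column:

import pandas as pd

df = pd.DataFrame({'product': ['Hardware', 'Software', 'Service']})

print(df['product'].str.upper())           # uppercase copy of each value
mask = df['product'].str.contains('ware')  # boolean Series
print(mask.sum())                          # 2 -> True counts as 1, False as 0
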
df['date'].dt.hour |
return hour of each row 0-23 |
df['date'].dt.tz_localize('US/Central') |
set timezone |
df['date'].dt.tz_convert('US/Eastern') |
convert to another timezone |
df['date'].resample('A').first() |
yearly from the initial date in data (1960-12-31) |
df['date'].resample('A').first().interpolate('linear') |
replace nan with interpolation |
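
A sketch of the .dt accessor and timezone handling on made-up timestamps:

import pandas as pd

dates = pd.Series(pd.to_datetime(['2015-02-20 08:30', '2015-02-20 17:45']))

print(dates.dt.hour)                          # 8 and 17
central = dates.dt.tz_localize('US/Central')  # attach a timezone to naive datetimes
print(central.dt.tz_convert('US/Eastern'))    # convert to another timezone
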
df.columns.str.strip() |
strip whitespace from column names |
df.set_index('Date', inplace=True) |
newD = pd.to_datetime(date_list, format='%Y-%m-%d %H:%M') |
pd.Series(Columns_list, index=newD) |
construct a pandas Series with newD as its DatetimeIndex |
ts2_interp = ts2.reindex(ts1.index).interpolate(method='linear') |
reindex ts2 onto ts1's index, then use linear interpolation to fill in the NaNs |
timezone.dt.tz_localize('US/Central') |
localize naive local times to 'US/Central' |
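
A sketch of aligning one series to another's index and interpolating; ts1 and ts2 are made up:

import pandas as pd

ts1 = pd.Series([0.0, 1.0, 2.0, 3.0, 4.0],
                index=pd.date_range('2016-07-01', periods=5, freq='D'))
ts2 = pd.Series([10.0, 40.0], index=pd.to_datetime(['2016-07-01', '2016-07-04']))

# put ts2 on ts1's index, then fill the resulting NaNs by linear interpolation
ts2_interp = ts2.reindex(ts1.index).interpolate(method='linear')
print(ts2_interp)
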
|
|
Build DF
df=pd.read_csv("filepath", index_col=0) |
use the first column of the file as the index |
index_col='nameofacolumn' |
df.index=['A', 'B', ...] |
assign index to df. len(index)==len(df) |
pd.DataFrame({'id':[1,2,3], 'gen':'M'}) |
keys: column names, values: column data (a scalar is broadcast to every row) |
pd.DataFrame(dict_of_lists) |
zipped=list(zip(list_labels, list_values)) |
pd.DataFrame(dict(zipped)) |
list_labels: column names; list_values: list of value lists, one per column |
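
A sketch of both construction routes above (values are made up):

import pandas as pd

# dict of lists: keys become columns, the scalar 'M' is broadcast to every row
df = pd.DataFrame({'id': [1, 2, 3], 'gen': 'M'})

# same table built from parallel lists of labels and values
list_labels = ['id', 'gen']
list_values = [[1, 2, 3], ['M', 'M', 'M']]
zipped = list(zip(list_labels, list_values))
df2 = pd.DataFrame(dict(zipped))

df2.index = ['A', 'B', 'C']  # custom index; length must equal len(df2)
print(df2)
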
pd.read_csv("filepath", header=None) |
no header |
pd.read_csv("filepath", options) |
col_n:list of column names |
header=0, names=col_n |
rename the header |
header=None, names=col_n |
no header in the file; use col_n as the column names |
na_values='-1' |
convert specific value (-1) to a nan |
na_values={'colname':['-1', '']} |
define a dict of NA values per column |
parse_dates=[[0,1,2]] |
combine 3 date/time columns into one datetime column |
parse_dates=True |
convert column with date to dateformat |
delimiter=' ' |
fields separated by spaces |
header=3 |
header is in index 3 |
comment='#' |
ignore all lines starting with '#' in the input |
index_col = 'dates' |
set a column as index |
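
A read_csv sketch combining several of these options; StringIO stands in for a real file path and the data is made up:

import pandas as pd
from io import StringIO

raw = StringIO(
    "# source: example\n"
    "dates,temp,wind\n"
    "2015-02-20,-1,3\n"
    "2015-02-21,7,5\n"
)

df = pd.read_csv(raw,
                 comment='#',                 # ignore lines starting with '#'
                 header=0,                    # first non-comment line is the header
                 na_values={'temp': ['-1']},  # treat -1 in the temp column as NaN
                 parse_dates=True,
                 index_col='dates')           # use the dates column as the index
print(df)
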
df[cols] |
take specific columns |
df.to_csv('outputpath') |
df.to_excel('outputpath') |
pd.DataFrame({'smoothed':smoothed, 'unsmoothed':unsmoothed}) |
create a DataFrame from Series; if they have indexes, they are aligned on the index |
Categorical
df['type'].describe() |
count of non-null, # of unique, top item, freq. of top |
df['type'].unique() |
array of unique values (use .nunique() for the count) |
df.loc[df['type']==x,:] |
df[df['type']==x] |
del df['type'] |
delete a column |
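
A sketch on a made-up categorical 'type' column:

import pandas as pd

df = pd.DataFrame({'type': ['gold', 'silver', 'gold', 'bronze']})

print(df['type'].describe())     # count, unique, top, freq for a non-numeric column
print(df['type'].unique())       # array of distinct values
print(df[df['type'] == 'gold'])  # keep only rows of one category
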
Numpy+Df
df.values |
Create array of DataFrame values |
df[colname]=0 |
create a new column filled with zeros |
Cleaning
df_dropped = df.drop(list_to_drop, axis='columns') |
remove the columns listed in list_to_drop |
df.set_index(colname) |
Set colname as the index |
pd.to_numeric() |
convert a Series to numeric (float) values; with errors='coerce', strings like 'M' become NaN |
df.reset_index()[colname] |
move the index back into a column, then extract colname |
df.loc[df[colname]=='sth'] |
select the rows of df where df[colname] == 'sth' |
df.loc[df[colname].str.contains('sth')] |
select the rows of df where df[colname] contains 'sth' |
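
A small cleaning sketch tying these steps together (column names and values are made up):

import pandas as pd

df = pd.DataFrame({'Date ': ['2015-02-20', '2015-02-21'],
                   ' Temp': ['7', 'M'],       # 'M' marks a missing reading
                   'note': ['warehouse', 'office']})

df.columns = df.columns.str.strip()                       # drop stray spaces in column names
df = df.set_index('Date')                                 # use Date as the index
df['Temp'] = pd.to_numeric(df['Temp'], errors='coerce')   # 'M' becomes NaN
print(df.loc[df['note'].str.contains('ware')])            # rows whose note contains 'ware'
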
|
|
Plot
import matplotlib.pyplot as plt |
plt.plot(df['low'].values) |
x axis= index of value |
plt.show() |
show the image |
plt.plot(df['low']) |
x axis is index of df (eg date) |
df['low'].plot() |
plot the Series directly; also labels the x axis |
df.plot() |
show all columns in df with legend |
plt.yscale('log') |
log scale on vertical axis |
df['low'].plot(color='b',style='.-', legend=True) |
plt.axis((minx, maxx,miny,maxy)) |
zoom |
plt.title('title') |
plt.ylabel('ylabel') |
plt.xlabel('xlabel') |
plt.savefig('a.pdf') |
plt.savefig('a.jpg') |
df.plot(subplots=True) |
draw each column in its own subplot |
df.plot(x='colname',y='colname',kind='scatter') |
plot 2 columns |
kind = 'box' |
box plot |
kind = 'hist' |
histogram |
kind='area' |
bins=30 |
integer:#of bins |
range=(4,8) |
tuple (min,max) |
normed=True |
boolean. normalize to one for hist |
cumulative=True |
boolean for hist |
alpha=0.3 |
transparency when overlaying several histograms |
s=sizes |
sizes= array of size of each circle in scatter plot |
fig, axes = plt.subplots(nrows=2, ncols=1) |
df['low'].plot(ax=axes[0], ...) |
...: kind, bins, normed,cumulative |
df.plot(y='colname',kind='box') |
style='k.-' |
color,marker,line type |
plt.clf() |
clears the current figure and all its axes, but leaves the window open so it can be reused for other plots |
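
A plotting sketch on a made-up DataFrame; file names are illustrative:

import matplotlib.pyplot as plt
import pandas as pd

df = pd.DataFrame({'low': [10.1, 9.8, 10.4, 10.0],
                   'high': [11.0, 10.9, 11.2, 11.1]})

df.plot(subplots=True)                      # one subplot per column, with legends
plt.savefig('columns.png')

df.plot(x='low', y='high', kind='scatter')  # scatter of two columns
plt.title('low vs high')
plt.ylabel('high')
plt.show()
plt.clf()                                   # clear the figure for the next plot
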
Indexing
df['colname']['rowname'] |
rowname is index_col |
df.colname['rowname'] |
df.loc['rowname','colname'] |
df.loc['rowstart':'rowend', :] |
row labels; both endpoints are inclusive |
df[['low']] |
returns a single column data frame |
df['low'] |
returns a series with index of df |
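
An indexing sketch on a made-up DataFrame with labeled rows:

import pandas as pd

df = pd.DataFrame({'low': [10.1, 9.8, 10.4], 'high': [11.0, 10.9, 11.2]},
                  index=['a', 'b', 'c'])

print(df.loc['b', 'low'])   # single value by row label and column name
print(df.loc['a':'b', :])   # label slices include both endpoints
print(type(df['low']))      # Series
print(type(df[['low']]))    # single-column DataFrame
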
|