Show Menu
Cheatography

SciKit Learn for Machine Learning Cheat Sheet (DRAFT) by

Scikit-learn is an open source Python library used to implement a range of machine learning, pre-processing, cross-validation and visualization algorithms using a unified interface.

This is a draft cheat sheet. It is a work in progress and is not finished yet.

Loading the data

>>> import numpy as np 
>>> X = np.random.random((10,5)) 
>>> y = np.array(['M','M','F','F','M','F','M','M','F','F'])
>>> X[X < 0.7] = 0

Training and Test data

>>> from sklearn.model_selection import train_test_split 
>>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

Prediction

Supervised Estimators
>>> y_pred = svc.predict(np.random.random((2,5)))
>>> y_pred = lr.predict(X_test)
>>> y_pred = knn.predict_proba(X_test)
Unsupervised Estimators
>>> y_pred = k_means.predict(X_test)

Pre-processing the data

Standardization
>>> from sklearn.preprocessing import StandardScaler
>>> scaler = StandardScaler().fit(X_train)
>>> standardized_X = scaler.transform(X_train)
>>> standardized_X_test = scaler.transform(X_test)
Normalization
>>> from sklearn.preprocessing import Normalizer
>>> scaler = Normalizer().fit(X_train)
>>> normalized_X = scaler.transform(X_train)
>>> normalized_X_test = scaler.transform(X_test)
Encoding Categorical Features
>>> from sklearn.preprocessing import LabelEncoder
>>> enc = LabelEncoder()
>>> y = enc.fit_transform(y)
Imputing Missing Values
>>> from sklearn.preprocessing import Imputer
>>> imp = Imputer(missing_values=0, strategy='mean', axis=0)
>>> imp.fit_transform(X_train)

Model Fitting

Supervised Learning
>>> lr.fit(X, y)
>>> knn.fit(X_train, y_train)
>>> svc.fit(X_train, y_train)
Unsupervised Learning
>>> k_means.fit(X_train)
>>> pca_model = pca.fit_transform(X_train)
 

Create model

Supervised Learning Estimators
Linear Regression
>>> from sklearn.linear_model import LinearRegression
>>> lr = LinearRegression(normalize=True)

Support Vector Machines (SVM)
>>> from sklearn.svm import SVC
>>> svc = SVC(kernel='linear')

Naive Bayes
>>> from sklearn.naive_bayes import GaussianNB
>>> gnb = GaussianNB()

KNN
>>> from sklearn import neighbors
>>> knn = neighbors.KNeighborsClassifier(n_neighbors=5)
Unsupervised Learning Estimators
Principal Component Analysis (PCA)
>>> from sklearn.decomposition import PCA
>>> pca = PCA(n_components=0.95)

K Means
>>> from sklearn.cluster import KMeans
>>> k_means = KMeans(n_clusters=3, random_state=0)

Tune Your Model

Grid Search
>>> from sklearn.model_selection import GridSearchCV
>>> params = {"n_neighbors": np.arange(1,3), "metric": ["euclidean", "cityblock"]}
>>> grid = GridSearchCV(estimator=knn, param_grid=params)
>>> grid.fit(X_train, y_train)
>>> print(grid.best_score_)
>>> print(grid.best_estimator_.n_neighbors)
Randomized Parameter Optimization
>>> from sklearn.model_selection import RandomizedSearchCV
>>> params = {"n_neighbors": range(1,5), "weights": ["uniform", "distance"]}
>>> rsearch = RandomizedSearchCV(estimator=knn, param_distributions=params, cv=4, n_iter=8, random_state=5)
>>> rsearch.fit(X_train, y_train)
>>> print(rsearch.best_score_)
 

Evaluate Your Model’s Performance

Classification Metrics
Accuracy Score
>>> knn.score(X_test, y_test)
>>> from sklearn.metrics import accuracy_score
>>> accuracy_score(y_test, y_pred)

Classification Report
>>> from sklearn.metrics import classification_report
>>> print(classification_report(y_test, y_pred))

Confusion Matrix
>>> from sklearn.metrics import confusion_matrix
>>> print(confusion_matrix(y_test, y_pred))
Regression Metrics
Mean Absolute Error
>>> from sklearn.metrics import mean_absolute_error
>>> y_true = [3, -0.5, 2]
>>> mean_absolute_error(y_true, y_pred)

Mean Squared Error
>>> from sklearn.metrics import mean_squared_error
>>> mean_squared_error(y_test, y_pred)

R² Score
>>> from sklearn.metrics import r2_score
>>> r2_score(y_true, y_pred)
Clustering Metrics
Adjusted Rand Index
>>> from sklearn.metrics import adjusted_rand_score
>>> adjusted_rand_score(y_true, y_pred)

Homogeneity
>>> from sklearn.metrics import homogeneity_score
>>> homogeneity_score(y_true, y_pred)

V-measure
>>> from sklearn.metrics import v_measure_score
>>> v_measure_score(y_true, y_pred)
Cross-Validation
>>> from sklearn.model_selection import cross_val_score
>>> print(cross_val_score(knn, X_train, y_train, cv=4))
>>> print(cross_val_score(lr, X, y, cv=2))