In this tutorial we demonstrate a multivariate analysis with the machine-learning toolkit scikit-learn. After a warm-up with a Decision Tree on a toy sample, we train a Random Forest to discriminate continuum from BBbar events.
# This notebook assumes the pylab environment (e.g. the %pylab inline magic), so that numpy and
# matplotlib names such as figure, scatter and hist are available directly; otherwise prefix them with plt.
from root_numpy import *
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(12345)
import sklearn ## This is just for printing the version number below. It's not needed for the rest of the tutorial
print 'This tutorial is tested with scikit-learn version', sklearn.__version__
Training data. Signal is a correlated 2D Gaussian; background is flat
# np.random.multivariate_normal generates n-dim Gaussian distributions with given mean and covariance matrix
X_sig= np.random.multivariate_normal(mean=[0.0, 0.0], cov=[ [1,0.5], [0.5,1] ], size= 1000)
# np.random.rand generates flat random numbers between 0 and 1 and fill them in an array of a given shape
X_bkg= np.random.rand(1000,2)
X_bkg= (X_bkg-0.5) * 10 # Re-center and re-scale
fig1= figure(figsize=(6,6))
scatter(X_bkg[:,0], X_bkg[:,1], c='b', alpha=0.4)
scatter(X_sig[:,0], X_sig[:,1], c='g', alpha=0.4)
# Confirm that the shape of the arrays is (n_samples, n_features)
# Here we have two features (or variables) and 1000 events each.
print X_sig.shape
print X_bkg.shape
# Merge signal and background samples, and create an array to hold their class indices
X_train= np.concatenate([X_bkg, X_sig])
c_train= np.array( [0]*len(X_bkg) + [1]*len(X_sig) )
print X_train.shape
print c_train.shape, c_train
Get a Decision Tree, and train it.
from sklearn import tree
# Hitting [Tab] after typing tree.DecisionTreeClassifier( brings up the documentation with the available options, like this:
# tree.DecisionTreeClassifier(self, criterion='gini', max_depth=None, min_samples_split=1, min_samples_leaf=1,
# min_density=0.1, max_features=None, compute_importances=False, random_state=None)
# A decision tree classifier.
# ...
clf= tree.DecisionTreeClassifier(min_samples_split= 20, min_samples_leaf= 10)
# Train or fit to the training sample
clf.fit(X_train, c_train)
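A quick sanity check is the mean accuracy of the fitted tree on its own training sample (not a measure of generalization, just a confirmation that the fit ran):
# Mean accuracy on the training events themselves
print clf.score(X_train, c_train)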
After the training, the model can be used to predict new events:
# The input must be a 2D array of shape (n_samples, n_features), so a single event is wrapped in an extra pair of brackets
print clf.predict([[3.0, 0.0]])
print clf.predict([[1.0, 1.0]])
Or calculate the probability of belonging to each class:
# Can take an array of variables (x,y)
print clf.predict_proba( [ (2.0, -1.0), (0.0, 1.0) ] )
# First column is the probability of being class 0, and second is class 1.
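The column ordering follows the classes_ attribute of the classifier, which can be printed explicitly:
# classes_ lists the class labels in the column order used by predict_proba
print clf.classes_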
Visualize the result
plot_step= 0.05
# Create a meshgrid as our "test sample"
xmin, xmax= X_train[:,0].min(), X_train[:,0].max()
ymin, ymax= X_train[:,1].min(), X_train[:,1].max()
xx, yy = np.meshgrid(np.arange(xmin, xmax, plot_step), np.arange(ymin, ymax, plot_step) )
# (xx[i][j], yy[i][j]) makes up the coordinates of grid point (i,j)
print xx.shape, yy.shape
# Combine xx and yy and reshape it to (n_samples, n_features)
# xx.ravel() flattens the xx array.
# np.c_[] translates slice objects to concatenation along the second axis.
X_test= np.c_[xx.ravel(), yy.ravel()]
print X_test.shape
# Predict the class
Z = clf.predict( X_test )
print Z.shape
# Reshape Z to be the same as xx and yy
Z= Z.reshape( xx.shape )
print Z.shape
# Plot the result
fig2= figure(figsize=(6,6))
cs = contourf(xx, yy, Z, cmap= plt.cm.winter, alpha=0.3)
# overlay training samples
scatter(X_bkg[:,0], X_bkg[:,1], c='b', alpha=0.5)
scatter(X_sig[:,0], X_sig[:,1], c='g', alpha=0.5)
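As an aside, the structure of the trained tree can be dumped in Graphviz dot format for inspection; rendering the file requires Graphviz, and the output file name below is just an example:
# Write the tree structure to a .dot file; render it with e.g.  dot -Tpng mytree.dot -o mytree.png
tree.export_graphviz(clf, out_file='mytree.dot')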
Now we turn to the real analysis: read continuum (ccbar) and BBbar samples from ROOT files with root_numpy, and train a Random Forest on event-shape variables.
from sklearn.ensemble import RandomForestClassifier
# List the input files (IPython shell shortcut)
ls data
bb = root2rec('data/B0B0bar-BtoDpi-all.root')
cc = root2rec('data/cc-BtoDpi-all.root')
# Look at the content of the data, variable names and their types
print bb.dtype
Plot all the variables we are interested in. They all belong to the "B" block in the original ROOT file.
# Shape variables that we are using to distinguish continuum and BBbar events
shapevars1= ['BCosSphr','BCosThetaT','BCosThrust','BLegendreP0','BLegendreP2','BR2ROE','BThrust','BThrustROE','BSphr','BSphrROE']
shapevars2= [ 'BPFlow'+str(i) for i in range(9)]
shapevars= shapevars1+shapevars2 # concatenate two lists
kinvars= ['BpostFitMes', 'BpostFitDeltaE']
# Stretch out (flatten) arrays in each entry but still keep the variable structure
# stretch is a utility function in root_numpy
fbb = stretch(bb, kinvars+shapevars)
fcc = stretch(cc, kinvars+shapevars)
print fbb.dtype
fig= figure(figsize=(18,12))
datalabels= [r'$c\overline{c}$',r'$B\overline{B}$']
for j,vname in enumerate(shapevars):
    subplot(4,5,j+1)
    h,b,p= hist([fcc[vname], fbb[vname]], bins=40, alpha=0.4, label=datalabels, histtype='stepfilled')
    legend().get_frame().set_alpha(0.5)
    title(vname)
Prepare training and testing samples
# Randomize the event order
ibb= np.random.permutation(len(fbb)) # A random sequence of index [0... len(bb)-1]
icc= np.random.permutation(len(fcc))
rbb= fbb[ibb]
rcc= fcc[icc]
# The classifier doesn't take a structured recarray; it only takes a plain 2D array with shape (n_entries, n_features)
# See http://www.scipy.org/Cookbook/Recarray
arbb= rbb[shapevars].view(('<f4',len(shapevars)))
arcc= rcc[shapevars].view(('<f4',len(shapevars)))
print arbb.shape, arcc.shape
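The view trick above requires all the selected fields to share the same dtype ('<f4' here). If that is not the case, an equivalent though slower conversion is to stack the columns explicitly; a minimal sketch:
# Build the (n_entries, n_features) array column by column; the content is the same as arbb above
arbb_alt= np.vstack([ rbb[v] for v in shapevars ]).T
print arbb_alt.shape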
# Divide each data set to training and testing sets
bb_train, bb_test = arbb[:len(arbb)/2], arbb[len(arbb)/2:]
cc_train, cc_test = arcc[:len(arcc)/2], arcc[len(arcc)/2:]
print bb_train.shape, bb_test.shape, cc_train.shape, cc_test.shape
# Concatenate background and signal data sets
X_train = np.concatenate([cc_train, bb_train])
X_test = np.concatenate([cc_test, bb_test])
# Create the corresponding arrays for classes of events
y_train= np.array([0]*len(cc_train)+[1]* len(bb_train))
y_test= np.array([0]*len(cc_test)+[1]* len(bb_test))
print len(X_train), len(X_test)
print len(y_train), y_train
print len(y_test), y_test
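Scikit-learn also provides a train_test_split helper that shuffles and splits in one call. Its module changed between versions (sklearn.cross_validation in older releases, sklearn.model_selection in newer ones), so the import below is version-dependent; it is left commented out because we keep the manual split above:
# from sklearn.cross_validation import train_test_split  # or sklearn.model_selection in newer versions
# X_train, X_test, y_train, y_test = train_test_split(
#     np.concatenate([arcc, arbb]), np.array([0]*len(arcc)+[1]*len(arbb)), test_size=0.5)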
Get the classifier and train it
def RF_train(X_train, y_train, X_test, y_test, labels, mytitle=None, **rfargs):
    '''
    Random Forest training. Make probability histogram plots and return
    (classifier, histograms of the probability of being class 1 for the class-0 and class-1 test events, histogram binning)
    *X_train* : training data set
    *y_train* : classes of training data set
    *X_test* : test data set
    *y_test* : classes of test data set
    *labels* : list of labels for the legend
    Other keyword arguments are passed to RandomForestClassifier()
    '''
    # Classifier
    clf = RandomForestClassifier(**rfargs)
    # Train it!
    clf.fit(X_train, y_train)
    # Calculate the probabilities of being class 1: ys is a list of arrays of classifier output for test data.
    # clf.predict_proba(X) returns a 2D array with shape (n_events, n_classes). Here we only pick the slice
    # for the probability of being class 1 with [:,1].
    # ys is a list of two (1D) arrays, one for the class-0 and one for the class-1 test events
    ys= [ clf.predict_proba(X)[:,1] for X in [ X_test[y_test==0], X_test[y_test==1] ] ]
    # Plot histograms
    hists, bins, patches= plt.hist( ys, bins=linspace(0,1,51), histtype='stepfilled', alpha=0.5, label=labels, normed=True)
    legend(loc='upper center')
    if mytitle:
        title(mytitle)
    return clf, hists, bins
fig1= figure(figsize=(16,4))
datalabels= [r'$c\overline{c}$',r'$B\overline{B}$']
# hc0/hc1: store the classifier output histograms of test data for class 0/1, one entry per classifier configuration
hc0=[]
hc1=[]
clflabels=[]
plt.subplot(131)
clf1, hists1, bins= RF_train(X_train, y_train, X_test, y_test, labels=datalabels, mytitle='n5-20', n_estimators=5, min_samples_leaf=20)
hc0.append(hists1[0])
hc1.append(hists1[1])
clflabels.append('n5-20')
plt.subplot(132)
clf2, hists2, bins= RF_train(X_train, y_train, X_test, y_test, labels=datalabels, mytitle='n10-40', n_estimators=10, min_samples_leaf=40)
hc0.append(hists2[0])
hc1.append(hists2[1])
clflabels.append('n10-40')
plt.subplot(133)
clf3, hists3, bins= RF_train(X_train, y_train, X_test, y_test, labels=datalabels, mytitle='n15-80', n_estimators=15, min_samples_leaf=80)
hc0.append(hists3[0])
hc1.append(hists3[1])
clflabels.append('n15-80')
Compare performances
# Print the scores (the mean accuracy) on the test data set (and, for what it's worth, the training set)
print ' config. test train'
print '--------+-------+--------'
for c,b in zip([clf1,clf2,clf3], clflabels):
    print '%7s : %6.4f %6.4f' % ( b, c.score(X_test, y_test), c.score(X_train, y_train) )
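Random forests also estimate how much each input variable contributes to the classification, via the feature_importances_ attribute; the entries are ordered like the columns of X_train, i.e. like shapevars:
# Variable importances of one of the trained forests, from most to least important
for imp, vname in sorted(zip(clf2.feature_importances_, shapevars), reverse=True):
    print '%12s : %.3f' % (vname, imp)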
Plot receiver operating characteristic (ROC) curves: for each cut on the classifier output we plot the efficiency for selecting events with output above the cut, continuum on the x axis and BBbar on the y axis.
figure(figsize=(6,6))
for x,y,b in zip(hc0,hc1,clflabels):
    plot(1-x.cumsum()/x.sum(), 1-y.cumsum()/y.sum(), label=b)
xlim(0,1)
ylim(0,1)
grid()
legend(loc='center right')
xlabel(r'$c\overline{c}$ efficiency', fontsize= 'x-large')
ylabel(r'$B\overline{B}$ efficiency', fontsize= 'x-large')
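As a cross-check, scikit-learn's metrics module can compute the ROC curve and its area directly from the classifier output, without going through our histograms:
from sklearn.metrics import roc_curve, auc
# False positive rate (ccbar efficiency) and true positive rate (BBbar efficiency) as the cut is varied
fpr, tpr, thresholds = roc_curve(y_test, clf2.predict_proba(X_test)[:,1])
print 'Area under the ROC curve:', auc(fpr, tpr)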
Save the classifier to a file using pickle for later use.
import pickle
# Just save one of the classifiers we used.
pickle.dump(clf2, open('random-forest.p','wb'))
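For forests with many trees, joblib is usually more efficient than plain pickle. Depending on the scikit-learn version it is bundled as sklearn.externals.joblib or installed as a standalone joblib package, so the import below tries both:
try:
    from sklearn.externals import joblib   # bundled with older scikit-learn versions
except ImportError:
    import joblib                           # standalone package in newer setups
joblib.dump(clf2, 'random-forest.joblib')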
Load the classifier back and apply it to the test data
myclf= pickle.load( open('random-forest.p', 'rb') )
# array of classifier output on testing data set
bb_clfout= myclf.predict_proba(bb_test)[:,1]
cc_clfout= myclf.predict_proba(cc_test)[:,1]
both_clfout= np.concatenate((bb_clfout,cc_clfout))
# test data set including B candidate kinematic variables
rbb_test= rbb[len(rbb)/2:]
rcc_test= rcc[len(rcc)/2:]
rboth_test= np.concatenate((rbb_test, rcc_test)).view(np.recarray)
figure(figsize=(16,8))
dlabs= [r'$B\overline{B}$', r'$c\overline{c}$']
for j,cut in enumerate([0, 0.3, 0.6]):
    subplot(2,3,j+1)
    histde= [ rboth_test.BpostFitDeltaE[both_clfout>cut], rcc_test.BpostFitDeltaE[cc_clfout>cut] ]
    hist( histde, bins=50, histtype='stepfilled', label=dlabs, color=['g','b'] )
    legend(loc='upper left')
    xlabel(r'$\Delta E$', fontsize='x-large')
    title( 'clf > '+str(cut) )
    subplot(2,3,j+4)
    histmes= [ rboth_test.BpostFitMes[both_clfout>cut], rcc_test.BpostFitMes[cc_clfout>cut] ]
    hist( histmes, bins=50, histtype='stepfilled', label=dlabs, color=['g','b'] )
    legend(loc='upper left')
    xlabel(r'$m_{\rm ES}$', fontsize='x-large')
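The corresponding selection efficiencies on the test samples can also be printed directly:
# Fraction of BBbar and continuum test events passing each cut on the classifier output
for cut in [0, 0.3, 0.6]:
    print 'clf > %.1f : BBbar eff = %.3f, ccbar eff = %.3f' % ( cut, (bb_clfout>cut).mean(), (cc_clfout>cut).mean() )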