TRDclf.py

from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.models import Sequential
from tensorflow import keras
from keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.optimizers import SGD
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.utils import np_utils
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import ROOT
import matplotlib.pyplot as plt
import numpy as np
import random
from keras.callbacks import Callback
import math
from sklearn.preprocessing import StandardScaler
import pandas as pd
import seaborn as sn
import os
import joblib
from keras.utils import np_utils
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_auc_score, roc_curve, accuracy_score
import matplotlib.transforms as mtransforms
from keras.utils.vis_utils import plot_model
import matplotlib.image as mpimg
from sklearn.metrics import roc_curve, roc_auc_score
from PIDclass import *
import pickle
import xgboost as xgb
from hipe4ml.model_handler import ModelHandler
from matplotlib.ticker import NullFormatter, MaxNLocator
from itertools import compress, product

from matplotlib.colors import ListedColormap

from sklearn.preprocessing import StandardScaler

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis


######################################Load DATA CLASSES####################################

# Reload = False
# TrainVar = TrainVar = ['TRDe1','TRDe2','TRDe3','TRDe4'] #'NoTrdHit'
# if Reload == False:
#     Like = PID( test_size = 0.5, electrons=50000,  hadrons=50000,train_var = TrainVar, cut = [['NoTrdHit',4,99]])
#     with open('PID.pkl', 'wb') as file:
#         pickle.dump(Like, file)
# if Reload == True:
#     with open('PID.pkl', 'rb') as file:
#         Like = pickle.load(file)


# train_test_data = [pd.DataFrame(Like.X_train, columns=TrainVar)  ,Like.y_train, pd.DataFrame(Like.X_test, columns=TrainVar)  ,Like.y_test ]
# np.save("Y",np.array(Like.y_test) )
# y_predict_Likelihood = np.array(list(Like.X_testTotal))[:,list(np.load("PID_BIB.npy")).index('Likelihood')] #here the data is already squeezed between 0 and 1
# np.save("y_predict_Likelihood" ,np.array(y_predict_Likelihood) )


###########################################################################################
###RUN Models#############


def runANN():
    ann_model = Sequential([
        Dense(18, input_shape=(len(TrainVar),), activation="sigmoid"),
        Dense(9, activation="relu"),
        Dense(2, activation='softmax')
    ])

    ann_model.compile(SGD(lr = .05), "binary_crossentropy", metrics=["accuracy"])
    ann_model.summary()
    run_history = ann_model.fit(np.array(list(Like.X_train)),np_utils.to_categorical(Like.y_train), validation_data=(np.array(list(Like.X_test)), np_utils.to_categorical(Like.y_test)), epochs=30, batch_size=30)
    joblib.dump(ann_model, "./RichANN.joblib")
    y_predict_ANN = ann_model.predict(np.array(list(Like.X_test)))[:,1]
    np.save("y_predict_ANN",np.array(y_predict_ANN) )
    return ann_model, run_history


def Run_RandomForest():
    rf_model = RandomForestClassifier(n_estimators=500, verbose=True, n_jobs = -1)
    rf_model.fit(Like.X_train[:10000], Like.y_train[:10000])
    joblib.dump(rf_model, "./RichRFClassifier.joblib")
    y_predict_rf = rf_model.predict_proba(Like.X_test)[:,1]
    np.save("y_predict_rf",np.array(y_predict_rf) )
    return rf_model


def Run_GraadientBoost():
    model_clf = xgb.XGBClassifier()
    model_hdl = ModelHandler(model_clf, TrainVar)


    hyper_pars_ranges = {'n_estimators': (200, 1000), 'max_depth': (
        2, 4), 'learning_rate': (0.01, 0.1)}
    model_hdl.optimize_params_optuna(train_test_data, hyper_pars_ranges, cross_val_scoring='roc_auc', timeout=120,
                                     n_jobs=-1, n_trials=100, direction='maximize')

    model_hdl.train_test_model(train_test_data)
    y_predict_gb = model_hdl.predict(train_test_data[2], False)
    joblib.dump(model_hdl.model, "./RichGradientBoostingClassifier.joblib")
    np.save("y_predict_gb",np.array(y_predict_gb) )
    return model_hdl.model


def Run_Nearest():
    clf = KNeighborsClassifier(20)
    clf.fit(Like.X_train, Like.y_train)
    y_predict = clf.predict_proba(Like.X_test)[:,1]
    np.save("y_predict_nn",np.array(y_predict))


def Run_SVC():
    clf = SVC(gamma=2, C=1, verbose=True)
    clf.fit(Like.X_train, Like.y_train)
    y_predict = clf.predict_proba(Like.X_test)[:,1]
    np.save("y_predict_svc",np.array(y_predict))


def Run_MLP():
    clf = MLPClassifier(alpha=1, max_iter=1000, verbose=True)
    clf.fit(Like.X_train, Like.y_train)
    y_predict = clf.predict_proba(Like.X_test)[:,1]
    np.save("y_predict_mlp",np.array(y_predict))

def Run_GNB():
    clf = GaussianNB()
    clf.fit(Like.X_train, Like.y_train)
    y_predict = clf.predict_proba(Like.X_test)[:,1]
    np.save("y_predict_gnb",np.array(y_predict))

def Run_QDA():
    clf = QuadraticDiscriminantAnalysis()
    clf.fit(Like.X_train, Like.y_train)
    y_predict = clf.predict_proba(Like.X_test)[:,1]
    np.save("y_predict_qda",np.array(y_predict))

# GaussianProcessClassifier(1.0 * RBF(1.0)),

# MLPClassifier(alpha=1, max_iter=1000),
# AdaBoostClassifier(),
# GaussianNB(),
# QuadraticDiscriminantAnalysis()


######################################################################


def PlotALL():

    fig, axs = plt.subplot_mosaic([['a)']], constrained_layout=True, figsize=(4,4))

    for proba, labelz in  zip([y_predict_ANN, y_predict_rf, y_predict_gb, -y_predict_RICHANN, y_predict_nn, y_predict_gnb, y_predict_mlp , y_predict_qda],['ANN','Random Forest','XGBoost','ANN in cbmroot','Nearest nabours', "GaussianNB", 'MLP', 'QDA']):
        PionSupression, ElectronEfficency, _ = roc_curve(Y, proba)
        axs['a)'].plot(PionSupression,ElectronEfficency,label = labelz, lw = 2)


    axs['a)'].set_xlabel("Pion Impurity")
    axs['a)'].set_ylabel("Electron Efficiency")
    axs['a)'].set_ylim([0,1])
    axs['a)'].set_xlim([0,1])
    axs['a)'].grid()
    axs['a)'].legend(loc='lower right')
    plt.savefig("ROCsRichAll.pdf")
    plt.show()


def PlotElectronEffvsPionSupQuality():
    xlim = 0.05
    fig, axs = plt.subplot_mosaic([['a)','b)']], constrained_layout=True, figsize=(9,4))


    for proba, labelz in  zip([ y_predict_rf, y_predict_gb, y_predict_Likelihood],['Random Forest','XGBoost','Likelihood']):
        PionSupression, ElectronEfficency, _ = roc_curve(Y, proba)
        axs['a)'].plot(PionSupression,ElectronEfficency,label = labelz)
        axs['b)'].plot(PionSupression,ElectronEfficency)
        QualityList = [Quality(e,p) for e,p in zip(ElectronEfficency,PionSupression)]
        MaxIndx = QualityList.index(max(QualityList))
        axs['b)'].scatter([PionSupression[MaxIndx]],[ElectronEfficency[MaxIndx]] ,label = "max. Significance in " + labelz)
        print('the Best cut vaule is s=',MaxIndx*0.01)
        print('The Best Electron Efficiency is',ElectronEfficency[MaxIndx])
        print('With an Pion suppression of',PionSupression[MaxIndx])


    x = np.linspace(0, xlim, 100)
    y = np.linspace(0, 1, 100)
    z = np.array([Quality(e,p) for e in y for p in x])
    Z = z.reshape(100, 100)
    X1, Y1 = np.meshgrid(x, y)
    c = axs['b)'].pcolormesh(X1, Y1, Z,alpha=0.5)
    cbar = fig.colorbar(c, ax=axs['b)'])
    cbar.set_label('Significance', rotation=270 , labelpad=15)


    axs['a)'].set_xlabel("Hadron Impurity")
    axs['a)'].set_ylabel("Electron Efficiency")
    axs['a)'].set_ylim([0,1])
    axs['a)'].set_xlim([0,1])
    axs['a)'].grid()
    axs['a)'].legend(loc='lower right')
    axs['b)'].legend(loc='lower right')
    axs['b)'].set_xlabel("Hadron Impurity")
    axs['b)'].set_ylabel("Electron Efficiency")
    axs['b)'].set_ylim([0,1])
    axs['b)'].set_xlim([0,xlim])
    axs['b)'].grid()
    plt.savefig("ROCsTRD1.pdf")
    plt.show()


def PlotReport():
    fig, axs = plt.subplot_mosaic([['a)']], constrained_layout=True, figsize=(5,3))

    for proba, labelz in  zip([y_predict_ANN, y_predict_rf, y_predict_gb, -y_predict_RICHANN],['ANN','Random Forest','GradientBoost','ANN implemented in cbmroot']):
        PionSupression, ElectronEfficency, _ = roc_curve(Y, proba)
        axs['a)'].plot(PionSupression,ElectronEfficency,label = labelz)


    axs['a)'].set_xlabel("Pion Impurity")
    axs['a)'].set_ylabel("Electron Efficiency")
    axs['a)'].set_ylim([0,1])
    axs['a)'].set_xlim([0,1])
    axs['a)'].grid()
    axs['a)'].legend(loc='lower right')

    plt.savefig("ROCReport.pdf")
    plt.show()


def PlotANNoutputDesitys():
    fig, axs = plt.subplot_mosaic([['a)','b)']], constrained_layout=True, figsize=(9,4))


    for predict,label in  zip([y_predict_Likelihood, y_predict_rf, y_predict_gb ],['Likelihood','Random Forest','XGBoost']):
        pions = [predict[i] for i in range(len(Y)) if Y[i]==0]
        electrons =  [predict[i] for i in range(len(Y)) if Y[i]==1]
        axs['b)'].hist(pions, 80, density=True, histtype='step', facecolor='g',
                   alpha=0.75, label = label,linewidth=2)
        axs['a)'].hist(electrons, 80, density=True, histtype='step', facecolor='g',
                   alpha=0.75, label= label, linewidth=2)
    axs['b)'].set_title("Pions")
    axs['a)'].set_title("Electrons")
    axs['a)'].set_xlabel("Electron Probability")
    axs['b)'].set_xlabel("Electron Probability")
    axs['a)'].set_ylabel("Denesty")
    axs['a)'].grid()
    axs['b)'].grid()
    plt.legend()
    plt.savefig("DensTRD.pdf")
    plt.show()


def PlotEtinne126():
    p = train_test_data[2]['P']
    y = y_predict_gb
    tf = train_test_data[3].astype(np.bool)
    fig, axs = plt.subplot_mosaic([['a)', 'b)' ,'c)']], constrained_layout=True, figsize=(9,3.3))
    nullfmt = NullFormatter()
    axs['c)'].yaxis.set_major_formatter(nullfmt)
    axs['b)'].yaxis.set_major_formatter(nullfmt)
    axs['a)'].locator_params(axis='x', nbins=4)
    axs['a)'].set_ylim([0,1])
    H, xedges, yedges = np.histogram2d(list(compress(p, tf)), list(compress(y, tf)), normed=1 ,range = [[0, 10], [0, 1]], bins=50)
    H = H.T
    H = H / H.max(axis=0)
    axs['a)'].pcolormesh(xedges,yedges,H)
    axs['a)'].set_xlabel('Momentum in GeV/c')
    axs['a)'].set_ylabel("XGBoost Output")
    axs['a)'].set_title("Electrons")
    axs['b)'].set_ylim([0,1])
    H, xedges, yedges = np.histogram2d(list(compress(p, ~tf)),  list(compress(y, ~tf)) , normed=1 ,range = [[0, 10], [0, 1]], bins=50)
    H = H.T
    H = H / H.max(axis=0)
    axs['b)'].pcolormesh(xedges,yedges,H)
    axs['b)'].set_xlabel('Momentum in GeV/c')
    axs['b)'].set_title("Pions")

    axs['c)'].hist(list(compress(y, tf)), 80,range=[0, 1],density=True, histtype='step', facecolor='g',
               alpha=0.75, label = "Electrons", orientation='horizontal',linewidth=2)
    axs['c)'].hist( list(compress(y, ~tf)) , 80,  range=[0, 1],density=True, histtype='step', facecolor='g',
               alpha=0.75, label = "Pions", orientation='horizontal',linewidth=2)
    axs['c)'].set_xlabel('Density')

    axs['c)'].legend(loc='center right')
    axs['c)'].set_ylim([0,1])
    axs['c)'].set_xlim([0,30])
    plt.savefig("Etienne126RF.pdf")
    plt.show()


def Plot_History():
    fig, axs = plt.subplot_mosaic([['a)','b)']], constrained_layout=True, figsize=(9,4))

    for label, ax in axs.items():
        # label physical distance in and down:
        trans = mtransforms.ScaledTranslation(10/72, -5/72, fig.dpi_scale_trans)
        ax.text(0.0, 1.0, label, transform=ax.transAxes + trans,
                fontsize=12, verticalalignment='top', fontfamily='serif',
                bbox=dict(facecolor='0.7', edgecolor='none', pad=6.0))

    axs['a)'].plot(run_history.history["loss"],'r', marker='.', label="Train Loss")
    axs['a)'].plot(run_history.history["val_loss"],'b', marker='.', label="Validation Loss")
    axs['a)'].set_xlabel("Epochs")
    axs['a)'].set_ylabel('loss')
    axs['a)'].legend(loc='upper right')
    plot_model(ann_model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)
    axs['b)'].imshow(mpimg.imread('model_plot.png'))
    axs['b)'].set_axis_off()
    plt.savefig("RichHist.pdf")
    plt.show()


##########################Main######################

Quality = lambda e,p: e**2/np.sqrt((13.5*p+1*e)**2) if (e!=0 or p!=0) else 0

# rf_model = Run_RandomForest()
# clf = Run_GraadientBoost()
# ann_model, run_history = runANN()
# Run_Nearest()
# Run_QDA()
# Run_MLP()
# Run_GNB()

Y = np.load("Y.npy")
y_predict_gnb = np.load("y_predict_gnb.npy")
y_predict_mlp = np.load("y_predict_mlp.npy")
y_predict_qda = np.load("y_predict_qda.npy")
y_predict_nn = np.load("y_predict_nn.npy")
y_predict_rf = np.load("y_predict_rf.npy")
y_predict_gb = np.load("y_predict_gb.npy")
y_predict_ANN = np.load("y_predict_ANN.npy")
y_predict_Likelihood= np.load("y_predict_Likelihood.npy" )


PlotElectronEffvsPionSupQuality()
PlotANNoutputDesitys()
PlotEtinne126()
Plot_History()

PlotReport()