In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
from sklearn import model_selection
from imblearn.under_sampling import RandomUnderSampler
from xgboost import XGBClassifier

# Function to cross-validate one XGBoost model per group and return the results as a table

In [3]:
def XGBoost_model(pos_addr, neg_addr, groups, max_neg_pos_ratio=5, n_folds=10, random_state=42):
    """Cross-validate one XGBoost classifier per group and tabulate the mean scores.

    For every name in ``groups`` the positive and negative feature tables are
    read from ``pos_addr + name + ".txt"`` and ``neg_addr + name + ".txt"``
    (comma-separated, no header row), labelled 1 and 0 respectively, and
    evaluated with stratified k-fold cross-validation.

    Parameters
    ----------
    pos_addr, neg_addr : str
        Path prefixes of the positive / negative feature files. They are
        concatenated verbatim with the group name, so a directory prefix
        must end with a path separator.
    groups : iterable of str
        Group names; each one is the stem of a ``.txt`` file under both prefixes.
    max_neg_pos_ratio : int or float, default 5
        When a group has more than this many negatives per positive, the
        negatives are randomly under-sampled down to this ratio.
    n_folds : int, default 10
        Number of cross-validation folds.
        NOTE(review): scikit-learn is expected to raise if a class has fewer
        members than ``n_folds`` - confirm for very small groups.
    random_state : int or None, default 42
        Seed for the under-sampler and the classifier so that repeated runs
        give the same numbers. Pass ``None`` for unseeded sampling.

    Returns
    -------
    pandas.DataFrame
        One row per group with the original site counts, the neg/pos ratio
        actually used for training, and the fold-averaged accuracy, weighted
        F1 / precision / recall and ROC AUC.

    Raises
    ------
    ValueError
        If ``max_neg_pos_ratio`` is smaller than 1.
    """
    if max_neg_pos_ratio < 1:
        raise ValueError("max_neg_pos_ratio must be >= 1, got %r" % (max_neg_pos_ratio,))

    # The class label is stored under integer column 300.
    # NOTE(review): assumes the feature files hold columns 0..299 - confirm.
    label_col = 300
    scoring = ['accuracy', 'f1_weighted', 'precision_weighted', 'recall_weighted', 'roc_auc']
    columns = [
        "Model",
        "No. of positive sites",
        "No. of negative sites",
        "Neg/pos",
        "Accuracy",
        "Weighted f1 score",
        "Weighted precision",
        "Weighted recall",
        "ROC_AUC",
    ]
    rows = []

    for group in groups:
        df_positive = pd.read_csv(pos_addr + group + ".txt", sep=",", index_col=False, header=None)
        df_negative = pd.read_csv(neg_addr + group + ".txt", sep=",", index_col=False, header=None)

        df_positive[label_col] = 1
        df_negative[label_col] = 0

        # ignore_index avoids duplicate row labels in the stacked table.
        labelled = pd.concat([df_positive, df_negative], axis=0, ignore_index=True)
        X = labelled.drop(label_col, axis=1)
        y = labelled[label_col]

        n_pos = len(df_positive)
        n_neg = len(df_negative)
        neg_pos_ratio = n_neg / n_pos

        # Under-sample the negatives down to `max_neg_pos_ratio` times the
        # positives when the raw imbalance exceeds that ratio.
        if neg_pos_ratio > max_neg_pos_ratio:
            undersample = RandomUnderSampler(
                sampling_strategy=1.0 / max_neg_pos_ratio,
                random_state=random_state,
            )
            X, y = undersample.fit_resample(X, y)
            ratio_used = max_neg_pos_ratio
        else:
            ratio_used = neg_pos_ratio

        model = XGBClassifier(eval_metric='logloss', use_label_encoder=False,
                              random_state=random_state)
        metrics = model_selection.cross_validate(model, X, y, scoring=scoring, cv=n_folds)

        # Site counts are the original (pre-sampling) counts; "Neg/pos" is the
        # ratio the model was actually trained on.
        rows.append({
            "Model": group,
            "No. of positive sites": n_pos,
            "No. of negative sites": n_neg,
            "Neg/pos": ratio_used,
            "Accuracy": metrics["test_accuracy"].mean(),
            "Weighted f1 score": metrics["test_f1_weighted"].mean(),
            "Weighted precision": metrics["test_precision_weighted"].mean(),
            "Weighted recall": metrics["test_recall_weighted"].mean(),
            "ROC_AUC": metrics["test_roc_auc"].mean(),
        })

    # Build the frame once from the collected records; `columns` keeps the
    # header stable even when `groups` is empty.
    return pd.DataFrame(rows, columns=columns)

# Kinase group level results

In [None]:
# Kinase groups to evaluate; XGBoost_model appends ".txt" to each name to
# locate its feature file under the two prefixes below.
groups = [
    'TK', 'CMGC', 'AGC', 'STE', 'CK1',
    'CAMK', 'Other', 'TKL', 'Atypical', 'PKL',
]

pos_addr, neg_addr = (
    "./features/group_positive_features/",
    "./features/group_negative_features/",
)

# Cross-validate every group and save the summary table without the row index.
results = XGBoost_model(pos_addr, neg_addr, groups)
results.to_csv(
    "./results/Group STY results with 10-fold cross validation.csv",
    index=False,
)

# Kinase family level results

In [None]:
# Load the kinase family name arrays (all sites, S/T only, Y only).
# NOTE: allow_pickle=True unpickles objects stored in the file, which can
# execute arbitrary code - only load .npy files from a trusted source.
families, families_st, families_y = (
    np.load(npy_name, allow_pickle=True)
    for npy_name in ("family 15.npy", "family_st 15.npy", "family_y 15.npy")
)

##### S/T/Y sites

In [None]:
# Family-level models trained on the "family_all" feature files.
pos_addr, neg_addr = (
    "./features/family_all_positive_features/",
    "./features/family_all_negative_features/",
)

results = XGBoost_model(pos_addr, neg_addr, families)
results.to_csv(
    "./results/Family STY results with 10-fold cross validation.csv",
    index=False,
)

##### S/T sites

In [None]:
# Family-level models trained on the "family_st" feature files.
pos_addr, neg_addr = (
    "./features/family_st_positive_features/",
    "./features/family_st_negative_features/",
)

results = XGBoost_model(pos_addr, neg_addr, families_st)
results.to_csv(
    "./results/Family ST results with 10-fold cross validation.csv",
    index=False,
)

##### Y sites

In [None]:
# Family-level models trained on the "family_y" feature files.
pos_addr, neg_addr = (
    "./features/family_y_positive_features/",
    "./features/family_y_negative_features/",
)

results = XGBoost_model(pos_addr, neg_addr, families_y)
results.to_csv(
    "./results/Family Y results with 10-fold cross validation.csv",
    index=False,
)

# Individual kinase level results

In [None]:
# Load the individual kinase name arrays (all sites, S/T only, Y only).
# NOTE: allow_pickle=True unpickles objects stored in the file, which can
# execute arbitrary code - only load .npy files from a trusted source.
kinases, kinases_st, kinases_y = (
    np.load(npy_name, allow_pickle=True)
    for npy_name in ("kinase 15.npy", "kinase_st 15.npy", "kinase_y 15.npy")
)

##### S/T/Y sites

In [None]:
# Kinase-level models trained on the "kinase_all" feature files.
pos_addr, neg_addr = (
    "./features/kinase_all_positive_features/",
    "./features/kinase_all_negative_features/",
)

results = XGBoost_model(pos_addr, neg_addr, kinases)
results.to_csv(
    "./results/kinase STY results with 10-fold cross validation.csv",
    index=False,
)

##### S/T sites

In [None]:
# Kinase-level models trained on the "kinase_st" feature files.
pos_addr, neg_addr = (
    "./features/kinase_st_positive_features/",
    "./features/kinase_st_negative_features/",
)

results = XGBoost_model(pos_addr, neg_addr, kinases_st)
results.to_csv(
    "./results/kinase ST results with 10-fold cross validation.csv",
    index=False,
)

##### Y sites

In [None]:
# Kinase-level models trained on the "kinase_y" feature files.
pos_addr, neg_addr = (
    "./features/kinase_y_positive_features/",
    "./features/kinase_y_negative_features/",
)

results = XGBoost_model(pos_addr, neg_addr, kinases_y)
results.to_csv(
    "./results/kinase Y results with 10-fold cross validation.csv",
    index=False,
)