Thalassemia screening data: cleaning, exploratory analysis, and alpha-carrier classification
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
%matplotlib inline
import seaborn as sns
# Load the cleaned thalassemia datasets and do basic cleanup.
thal = pd.read_csv('thalcleaned_outliers.csv')
thal2 = pd.read_csv('thalcleaned_outliers2.csv')
thal.describe()
thal2.describe()
# Drop the redundant row-number column carried over from the CSV.
thal.drop(columns='no', inplace=True)
thal.describe()
# thal.drop?  # IPython help magic -- commented out so the file is valid Python
thal.info()
# Impute missing rbc/mch with the per-phenotype mean, so imputation does not
# pull a diagnosis group's values toward the other groups.
thal['rbc'] = thal['rbc'].fillna(thal.groupby('phenotype')['rbc'].transform('mean'))
thal['mch'] = thal['mch'].fillna(thal.groupby('phenotype')['mch'].transform('mean'))
thal.info()
# astype() returns a new frame -- the result must be assigned (an earlier
# un-assigned astype call on these same columns was a no-op and is removed).
thal = thal.astype({"phenotype" : "category", "sex" : "category"})
thal.info()
- 243 out of 288 samples were retained, after removing records for which there was no diagnosis.
thal.sex
# Select male rows by comparing the COLUMN to the string.
# The original thal.loc['sex' == 'male'] evaluated the constant expression
# 'sex' == 'male' (always False) and then looked up a row labelled False.
thal.loc[thal['sex'] == 'male']
See this link for why the line above is wrong: https://pandas.pydata.org/docs/getting_started/intro_tutorials/03_subset_data.html
# Basic demographics of the full cleaned dataset.
female_pct = (thal['sex'] == 'female').mean()*100
female_pct
thal.age.mean()
d_list = list(thal.phenotype.unique())
d_list
thal.phenotype.value_counts()
# Figure size must be configured BEFORE the plot is drawn for it to apply
# (it was previously set after plt.pie, affecting only the next figure).
from pylab import rcParams
rcParams['figure.figsize'] = 16,10
plt.style.use('ggplot')
plt.rcParams['font.size'] = '18'
plt.pie(thal.phenotype.value_counts(),counterclock=False, startangle=90, autopct = '%.2f%%')
# NOTE(review): legend labels are hard-coded and assume this exact
# value_counts() ordering -- verify against thal.phenotype.value_counts().
plt.legend(labels=['Alpha trait', 'Silent carrier', 'Normal', 'Iron deficiency', 'Beta trait'])
# Pie chart of the two-class subset (alpha-thalassemia carriers vs normals).
# NOTE(review): 'alphanorm' is only defined further down (the pd.read_csv
# cell below) -- this notebook cell works only after that one has run;
# executed top-to-bottom this raises NameError. Confirm intended cell order.
plt.style.use('ggplot')
plt.rcParams['font.size'] = '18'
plt.pie(alphanorm.phenotype.value_counts(),counterclock=False, startangle=90, autopct = '%.2f%%')
#plt.figure(figsize=(16, 10))from pylab import rcParams
from pylab import rcParams
rcParams['figure.figsize'] = 16,10
# NOTE(review): hard-coded labels assume this value_counts() ordering.
plt.legend(labels=['Alpha-thalassemia carriers', 'Normal'])
# Percentage of female samples in the two-class subset.
female_pct = (alphanorm['sex'] == 'female').mean()*100
female_pct
The final dataset, after removing samples without a specific diagnosis, consisted of 243 samples. Basic demographics are depicted in Figure 1. Males represented just over half of the dataset, and the mean age was [TODO: fill in from thal.age.mean()].
# Two-class modelling subset: alpha-thalassemia carriers and normals only.
alphanorm = pd.read_csv('alphanorm.csv', index_col = False)
alphanorm.info()
alphanorm.describe()
alphanorm.head()
- only 203 samples remain when considering alpha carriers and normals.
# Impute missing rbc/mch with the per-phenotype mean, mirroring the
# imputation performed on the full dataset above.
for _col in ('rbc', 'mch'):
    _group_means = alphanorm.groupby('phenotype')[_col].transform('mean')
    alphanorm[_col] = alphanorm[_col].fillna(_group_means)
alphanorm.phenotype.value_counts()
alphanorm.info()
Changing the font of each element of a plot: https://stackoverflow.com/questions/12444716/how-do-i-set-the-figure-title-and-axes-labels-font-size-in-matplotlib
# Drop hbf/wbc/neut/lymph from the modelling set.
# NOTE(review): the original comment said these variables "will be included
# as well", which contradicts the drop below -- confirm intent.
alphanorm = alphanorm.drop(['hbf', 'wbc', 'neut', 'lymph'], axis=1)
alphanorm.dtypes
alphanorm.describe()
# Winsorize the numeric columns: cap values outside Tukey's fences
# (Q1 - 1.5*IQR, Q3 + 1.5*IQR) at the fence instead of dropping rows.
# (Indentation restored -- the pasted source had lost the loop structure.)
for col in alphanorm.columns:
    if alphanorm[col].dtype != object:  # skip the string columns (sex, phenotype)
        Q1 = alphanorm[col].quantile(0.25)
        Q3 = alphanorm[col].quantile(0.75)
        IQR = Q3 - Q1
        S = 1.5*IQR
        LB = Q1 - S
        UB = Q3 + S
        print(UB)  # debug: upper fence per numeric column
        alphanorm.loc[alphanorm[col] > UB,col] = UB
        alphanorm.loc[alphanorm[col] < LB,col] = LB
alphanorm.describe()
alphanorm.info()
# Encode the target: phenotype -> 1 for alpha carriers, 0 for normals.
alphanorm = alphanorm.astype({'sex' : 'category', 'phenotype' : 'category'})
alphanorm.phenotype.value_counts()
# Bool -> int in one step; the original did the comparison and then a
# replace({True: 1, False: 0}) -- .astype(int) is the idiomatic equivalent.
alphanorm['phenotype'] = (alphanorm['phenotype'] == 'alpha carrier').astype(int)
alphanorm.head(200)
Converting a boolean column to int: https://stackoverflow.com/questions/17383094/how-can-i-map-true-false-to-1-0-in-a-pandas-dataframe
# Feature matrix / target split.
X = alphanorm.drop('phenotype', axis=1)
y = alphanorm['phenotype']
y.value_counts()
# Only 'sex' remains categorical here (phenotype is now the int target).
categorical_vars = list((X.select_dtypes(include=['category'])).columns)
categorical_vars
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot', one_hot, categorical_vars)],
                                remainder='passthrough')
# The transform must actually run before its result is wrapped in a
# DataFrame -- it was commented out, leaving transformed_alpha undefined
# (NameError on a fresh run).
transformed_alpha = transformer.fit_transform(alphanorm)
df_transformed = pd.DataFrame(transformed_alpha)
df_transformed.head()
from sklearn.model_selection import StratifiedShuffleSplit
# Single stratified 80/20 split so both classes keep their proportions.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=42)
for train_index, test_index in split.split(alphanorm, alphanorm["phenotype"]):
    # split() yields POSITIONAL indices -- use iloc, not loc (loc only
    # coincides with iloc while the index is the default RangeIndex).
    strat_train = alphanorm.iloc[train_index]
    strat_test = alphanorm.iloc[test_index]
strat_train.head(100)
train_X = strat_train.drop('phenotype', axis=1)
train_y = strat_train['phenotype']
test_x = strat_test.drop('phenotype', axis=1)
test_y = strat_test['phenotype']
# Fit the one-hot encoder on the TRAINING data only, then apply the SAME
# fitted transformer to the test split.  Fitting a second encoder on the
# test set (as before) leaks test information and can produce mismatched
# columns when a category is missing from one of the splits.
one_hot1 = OneHotEncoder()
transformer = ColumnTransformer([('one_hot', one_hot1, categorical_vars)],
                                remainder='passthrough')
trans_trainX = transformer.fit_transform(train_X)
trans_trainX_df = pd.DataFrame(trans_trainX)
trans_testX = transformer.transform(test_x)
trans_testX_df = pd.DataFrame(trans_testX)
trans_testX_df.head()
trans_trainX_df.head()
# unique is a method -- it must be called (the bare attribute just
# echoes the bound-method object).
train_y.unique()
Using cross_val_score with stratified folds: https://stackoverflow.com/questions/59002684/forcing-sklearn-cross-val-score-to-use-stratified-k-fold
# results  # NOTE(review): leftover reference to an undefined name; commented out.
from sklearn.ensemble import RandomForestClassifier
# Baseline random forest; oob_score gives a free out-of-bag estimate.
model = RandomForestClassifier(verbose=True, oob_score=True, n_estimators=500, max_features='log2', n_jobs=-1, min_samples_leaf=4)
model.fit(trans_trainX_df, train_y)
y_scores = model.predict(trans_trainX_df)
from sklearn.metrics import classification_report
# classification_report takes (y_true, y_pred) in that order -- passing
# them swapped mislabels precision as recall and vice versa.
print(classification_report(train_y, y_scores))
y_scores_valid = model.predict(trans_testX_df)
print(classification_report(test_y, y_scores_valid))
from sklearn.model_selection import GridSearchCV
# Hyperparameter search for the random forest.  n_jobs is a parallelism
# setting, not a hyperparameter -- set it once on the estimator instead of
# "searching" a single value for it.
param_grid = [
    {'n_estimators': [100, 150, 200, 250, 300, 350, 400, 450, 500, 600],
     'max_features': [0.5, 'sqrt', 'log2'], 'min_samples_leaf': [4]},
    {'n_estimators': [100, 150, 200, 250, 300, 350, 400, 450, 500],
     'max_features': [0.5, 'sqrt', 'log2']},
]
rf = RandomForestClassifier(n_jobs=-1)
grid_search = GridSearchCV(rf, param_grid, cv=5,
                           scoring='accuracy',
                           return_train_score=True)
grid_search.fit(trans_trainX_df, train_y)
grid_search.best_params_
# Refit with the parameters chosen by the grid search.
model = RandomForestClassifier(verbose=True, oob_score=True, n_estimators=200, max_features='log2', n_jobs=-1, min_samples_leaf=4)
model.fit(trans_trainX_df, train_y)
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
# All sklearn metrics take (y_true, y_pred) -- ground truth first.
f1_score(test_y, model.predict(trans_testX_df))
precision_score(test_y, model.predict(trans_testX_df))
recall_score(test_y, model.predict(trans_testX_df))
y_scores = model.predict(trans_trainX_df)
print(classification_report(train_y, y_scores))
y_scores_valid = model.predict(trans_testX_df)
print(classification_report(test_y, y_scores_valid))
from sklearn.metrics import precision_recall_curve
# Class probabilities; column order follows model.classes_.
y_scores = model.predict_proba(trans_testX_df)
model.classes_  # was model1.classes_ -- model1 is never defined anywhere
y_scores
# Keep only the positive-class (label 1) probability column.
y_scores = y_scores[:, 1]
precisions, recalls, thresholds = precision_recall_curve(test_y, y_scores)
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall as functions of the decision threshold.

    precision_recall_curve returns one more precision/recall value than
    thresholds, hence the [:-1] slices.  (Indentation restored; axis
    label and grid added as the original TODO comment requested.)
    """
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
    plt.legend(loc="upper left")
    plt.xlabel("Threshold")
    plt.grid(True)

plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(test_y, y_scores)

def plot_roc_curve(fpr, tpr, label=None, figure=(12,10)):
    """Plot a ROC curve: FPR on the x axis, TPR (recall) on the y axis.

    ``figure`` is currently unused (figsize would need to be applied via
    plt.figure before plotting); kept for interface compatibility.
    """
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')  # dashed diagonal = random classifier
    # The axis labels were swapped in the original: x carries fpr and
    # y carries tpr, so the labels below now match the plotted data.
    plt.xlabel('False Positive Rate(1-specificity)')
    plt.ylabel("True Positive Rate(Recall)")
    plt.legend()

plot_roc_curve(fpr, tpr)
from sklearn.metrics import roc_auc_score
roc_auc_score(test_y, y_scores)
import joblib
# Persist the final model (was model1, which is never defined).
joblib.dump(model, "thal_rf200.pkl")
# and later...
#my_model_loaded = joblib.load("my_model.pkl")
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
# confusion_matrix takes (y_true, y_pred); swapping the two transposes
# the matrix, exchanging false positives and false negatives.
confused = confusion_matrix(test_y, model.predict(trans_testX_df))
confused
fig, ax = plt.subplots(figsize=(10, 10))
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
# on newer versions use ConfusionMatrixDisplay.from_estimator instead.
plot_confusion_matrix (model, trans_testX_df, test_y, cmap=plt.cm.Blues, display_labels=['normals', 'alpha carriers'], ax=ax)
from imblearn.over_sampling import SMOTE
# Oversample the minority class -- on the TRAINING split only, so the
# test set stays untouched.
sm = SMOTE(random_state = 2)
# fit_sample was removed in imbalanced-learn 0.8; the method is fit_resample.
X_train_res, y_train_res = sm.fit_resample(trans_trainX_df, train_y)
y_train_res.value_counts()
model = RandomForestClassifier(verbose=True, oob_score=True, n_estimators=400, max_features=0.5, n_jobs=-1, min_samples_leaf=4)
model.fit(X_train_res, y_train_res)
# (y_true, y_pred) order for classification_report.
print(classification_report(y_train_res, model.predict(X_train_res)))
print(classification_report(test_y, model.predict(trans_testX_df)))
fig, ax = plt.subplots(figsize=(10, 10))
plot_confusion_matrix (model, trans_testX_df, test_y, cmap=plt.cm.Blues, display_labels=['normals', 'alpha carriers'], ax=ax)
from xgboost import XGBClassifier as xgb
# Gradient-boosted trees on the SMOTE-balanced training set.
xgbmodel = xgb()
xgbmodel.fit(X_train_res, y_train_res)
# (y_true, y_pred) order for classification_report.
print(classification_report(y_train_res, xgbmodel.predict(X_train_res)))
print(classification_report(test_y, xgbmodel.predict(trans_testX_df)))
# Tune the XGBoost model.  The previous grid mixed XGBoost parameters
# (learning_rate) with random-forest ones (max_features, min_samples_leaf,
# n_jobs) and then searched a RandomForestClassifier, which rejects
# learning_rate with an "invalid parameter" error.
param_grid = [
    {'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2, 0.25]},
    {'n_estimators': [100, 150, 200, 250, 300, 350, 400, 450, 500]},
]
xgb_clf = xgb(n_jobs=-1)
grid_search = GridSearchCV(xgb_clf, param_grid, cv=5,
                           scoring='accuracy',
                           return_train_score=True)
grid_search.fit(trans_trainX_df, train_y)
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
# define model
model2 = RandomForestClassifier()
# evaluate with repeated stratified cross-validation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# NOTE(review): X here still contains the raw 'sex' category column (it is
# not one-hot encoded); the forest may fail on string values -- consider a
# Pipeline wrapping the ColumnTransformer and the model.
scores = cross_val_score(model2, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
# 'mean' was never imported -- use the returned ndarray's own mean().
print('Mean ROC AUC: %.3f' % scores.mean())