import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
%matplotlib inline
import seaborn as sns

thal = pd.read_csv('thalcleaned_outliers.csv')
thal2 = pd.read_csv('thalcleaned_outliers2.csv')

thal.describe()

thal2.describe()

thal.drop(columns='no', inplace=True)

thal.describe()

thal.drop?

thal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243 entries, 0 to 242
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   sex        243 non-null    object 
 1   hb         243 non-null    float64
 2   pcv        243 non-null    float64
 3   rbc        242 non-null    float64
 4   mcv        243 non-null    float64
 5   mch        241 non-null    float64
 6   mchc       243 non-null    float64
 7   rdw        243 non-null    float64
 8   wbc        243 non-null    float64
 9   neut       243 non-null    float64
 10  lymph      243 non-null    float64
 11  plt        243 non-null    float64
 12  hba        243 non-null    float64
 13  hba2       243 non-null    float64
 14  hbf        243 non-null    float64
 15  phenotype  243 non-null    object 
dtypes: float64(14), object(2)
memory usage: 30.5+ KB

thal.astype({'phenotype' : 'category', 'sex' : 'category'})

thal['rbc'] = thal['rbc'].fillna(thal.groupby('phenotype')['rbc'].transform('mean'))
thal['mch'] = thal['mch'].fillna(thal.groupby('phenotype')['mch'].transform('mean'))

thal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243 entries, 0 to 242
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   sex        243 non-null    object 
 1   hb         243 non-null    float64
 2   pcv        243 non-null    float64
 3   rbc        243 non-null    float64
 4   mcv        243 non-null    float64
 5   mch        243 non-null    float64
 6   mchc       243 non-null    float64
 7   rdw        243 non-null    float64
 8   wbc        243 non-null    float64
 9   neut       243 non-null    float64
 10  lymph      243 non-null    float64
 11  plt        243 non-null    float64
 12  hba        243 non-null    float64
 13  hba2       243 non-null    float64
 14  hbf        243 non-null    float64
 15  phenotype  243 non-null    object 
dtypes: float64(14), object(2)
memory usage: 30.5+ KB

thal = thal.astype({"phenotype" : "category", "sex" : "category"})

thal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243 entries, 0 to 242
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   sex        243 non-null    category
 1   hb         243 non-null    float64 
 2   pcv        243 non-null    float64 
 3   rbc        243 non-null    float64 
 4   mcv        243 non-null    float64 
 5   mch        243 non-null    float64 
 6   mchc       243 non-null    float64 
 7   rdw        243 non-null    float64 
 8   wbc        243 non-null    float64 
 9   neut       243 non-null    float64 
 10  lymph      243 non-null    float64 
 11  plt        243 non-null    float64 
 12  hba        243 non-null    float64 
 13  hba2       243 non-null    float64 
 14  hbf        243 non-null    float64 
 15  phenotype  243 non-null    category
dtypes: category(2), float64(14)
memory usage: 27.5 KB

243 out 288 was obtained - after removing elements where there was no diagnosis.

thal.sex

0      female
1        male
2      female
3        male
4        male
        ...  
238      male
239    female
240      male
241      male
242      male
Name: sex, Length: 243, dtype: category
Categories (2, object): ['female', 'male']

thal.loc['sex' == 'male'] #this is WRONG

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-15-f64bf44e4e9a> in <module>
      1 #calculating the male percentage
----> 2 thal.loc['sex' == 'male'] #this is WRONG

~\Anaconda3\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
    893 
    894             maybe_callable = com.apply_if_callable(key, self.obj)
--> 895             return self._getitem_axis(maybe_callable, axis=axis)
    896 
    897     def _is_scalar_access(self, key: Tuple):

~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_axis(self, key, axis)
   1122         # fall thru to straight lookup
   1123         self._validate_key(key, axis)
-> 1124         return self._get_label(key, axis=axis)
   1125 
   1126     def _get_slice_axis(self, slice_obj: slice, axis: int):

~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _get_label(self, label, axis)
   1071     def _get_label(self, label, axis: int):
   1072         # GH#5667 this will fail if the label is not present in the axis.
-> 1073         return self.obj.xs(label, axis=axis)
   1074 
   1075     def _handle_lowerdim_multi_index_axis0(self, tup: Tuple):

~\Anaconda3\lib\site-packages\pandas\core\generic.py in xs(self, key, axis, level, drop_level)
   3737                 raise TypeError(f"Expected label or tuple of labels, got {key}") from e
   3738         else:
-> 3739             loc = index.get_loc(key)
   3740 
   3741             if isinstance(loc, np.ndarray):

~\Anaconda3\lib\site-packages\pandas\core\indexes\range.py in get_loc(self, key, method, tolerance)
    352                 except ValueError as err:
    353                     raise KeyError(key) from err
--> 354             raise KeyError(key)
    355         return super().get_loc(key, method=method, tolerance=tolerance)
    356 

KeyError: False

SEE THIS LINK TO KNOW WHY ABOVE IS WRONG - https://pandas.pydata.org/docs/getting_started/intro_tutorials/03_subset_data.html

female_pct = (thal['sex'] == 'female').mean()*100
female_pct

45.67901234567901

thal.age.mean()

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-17-9ca328a0bd78> in <module>
----> 1 thal.age.mean()

~\Anaconda3\lib\site-packages\pandas\core\generic.py in __getattr__(self, name)
   5463             if self._info_axis._can_hold_identifiers_and_holds_name(name):
   5464                 return self[name]
-> 5465             return object.__getattribute__(self, name)
   5466 
   5467     def __setattr__(self, name: str, value) -> None:

AttributeError: 'DataFrame' object has no attribute 'age'

d_list = list(thal.phenotype.unique())
d_list

['alpha trait', 'silent carrier', 'normal', 'iron deficiency', 'beta trait']

thal.phenotype.value_counts()

silent carrier     105
normal              58
alpha trait         42
iron deficiency     21
beta trait          17
Name: phenotype, dtype: int64

plt.style.use('ggplot')
plt.rcParams['font.size'] = '18'
plt.pie(thal.phenotype.value_counts(),counterclock=False, startangle=90, autopct = '%.2f%%')
#plt.figure(figsize=(16, 10))from pylab import rcParams
from pylab import rcParams
rcParams['figure.figsize'] = 16,10
plt.legend(labels=['Alpha trait', 'Silent carrier', 'Normal', 'Iron deficiency', 'Beta trait'])

<matplotlib.legend.Legend at 0x28169e47488>

plt.style.use('ggplot')
plt.rcParams['font.size'] = '18'
plt.pie(alphanorm.phenotype.value_counts(),counterclock=False, startangle=90, autopct = '%.2f%%')
#plt.figure(figsize=(16, 10))from pylab import rcParams
from pylab import rcParams
rcParams['figure.figsize'] = 16,10
plt.legend(labels=['Alpha-thalassemia carriers', 'Normal'])

<matplotlib.legend.Legend at 0x2816a097148>

female_pct = (alphanorm['sex'] == 'female').mean()*100
female_pct

44.827586206896555

Final dataset, after removing samples without a specific diagnosis, consisted of 243 samples. Basic demographics are depicted in the Figure 1. Males represented just over half of the dataset, and the mean age was......

Model 01 Analysis - AlphaThal vs Normal

alphanorm = pd.read_csv('alphanorm.csv', index_col = False)

alphanorm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203 entries, 0 to 202
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   sex        203 non-null    object 
 1   hb         203 non-null    float64
 2   pcv        203 non-null    float64
 3   rbc        202 non-null    float64
 4   mcv        203 non-null    float64
 5   mch        201 non-null    float64
 6   mchc       203 non-null    float64
 7   rdw        203 non-null    float64
 8   wbc        203 non-null    float64
 9   neut       203 non-null    float64
 10  lymph      203 non-null    float64
 11  plt        203 non-null    float64
 12  hba        203 non-null    float64
 13  hba2       203 non-null    float64
 14  hbf        203 non-null    float64
 15  phenotype  203 non-null    object 
dtypes: float64(14), object(2)
memory usage: 25.5+ KB

alphanorm.describe()

alphanorm.head()

only 203 when considering alpha carries and normals

alphanorm['rbc'] = alphanorm['rbc'].fillna(alphanorm.groupby('phenotype')['rbc'].transform('mean'))
alphanorm['mch'] = alphanorm['mch'].fillna(alphanorm.groupby('phenotype')['mch'].transform('mean'))

alphanorm.phenotype.value_counts()

alpha carrier    148
normal            55
Name: phenotype, dtype: int64

alphanorm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203 entries, 0 to 202
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   sex        203 non-null    object 
 1   hb         203 non-null    float64
 2   pcv        203 non-null    float64
 3   rbc        203 non-null    float64
 4   mcv        203 non-null    float64
 5   mch        203 non-null    float64
 6   mchc       203 non-null    float64
 7   rdw        203 non-null    float64
 8   wbc        203 non-null    float64
 9   neut       203 non-null    float64
 10  lymph      203 non-null    float64
 11  plt        203 non-null    float64
 12  hba        203 non-null    float64
 13  hba2       203 non-null    float64
 14  hbf        203 non-null    float64
 15  phenotype  203 non-null    object 
dtypes: float64(14), object(2)
memory usage: 25.5+ KB

Changing fonts in each elements of a plot : https://stackoverflow.com/questions/12444716/how-do-i-set-the-figure-title-and-axes-labels-font-size-in-matplotlib

#will include these variables as well
alphanorm = alphanorm.drop(['hbf', 'wbc', 'neut', 'lymph'], axis=1)

alphanorm.dtypes

sex           object
hb           float64
pcv          float64
rbc          float64
mcv          float64
mch          float64
mchc         float64
rdw          float64
plt          float64
hba          float64
hba2         float64
phenotype     object
dtype: object

alphanorm.describe()

for col in alphanorm.columns:
    if alphanorm[col].dtype != object:
        Q1 = alphanorm[col].quantile(0.25)
        Q3 = alphanorm[col].quantile(0.75)
        IQR = Q3 - Q1
        S = 1.5*IQR
        LB = Q1 - S
        UB = Q3 + S
        print(UB)
        alphanorm.loc[alphanorm[col] > UB,col] = UB
        alphanorm.loc[alphanorm[col] < LB,col] = LB

17.025000000000002
47.92500000000002
6.537500000000001
104.32500000000003
35.125
36.29074075
19.924999999999997
564.75
90.61428571249999
3.0000000000000004

alphanorm.describe()

alphanorm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203 entries, 0 to 202
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   sex        203 non-null    object 
 1   hb         203 non-null    float64
 2   pcv        203 non-null    float64
 3   rbc        203 non-null    float64
 4   mcv        203 non-null    float64
 5   mch        203 non-null    float64
 6   mchc       203 non-null    float64
 7   rdw        203 non-null    float64
 8   plt        203 non-null    float64
 9   hba        203 non-null    float64
 10  hba2       203 non-null    float64
 11  phenotype  203 non-null    object 
dtypes: float64(10), object(2)
memory usage: 19.2+ KB

alphanorm = alphanorm.astype({'sex' : 'category', 'phenotype' : 'category'})

alphanorm.phenotype.value_counts()

alpha carrier    148
normal            55
Name: phenotype, dtype: int64

alphanorm['phenotype'] = alphanorm['phenotype'] == 'alpha carrier'
alphanorm['phenotype'] = alphanorm['phenotype'].replace({True:1, False:0})

alphanorm.head(200)

Converting boolean column to int : https://stackoverflow.com/questions/17383094/how-can-i-map-true-false-to-1-0-in-a-pandas-dataframe

X = alphanorm.drop('phenotype', axis=1)
y = alphanorm['phenotype']

y.value_counts()

1    148
0     55
Name: phenotype, dtype: int64

categorical_vars = list((X.select_dtypes(include=['category'])).columns)
categorical_vars

['sex']

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot', one_hot, categorical_vars)], 
                                remainder= 'passthrough')
#transformed_alpha = transformer.fit_transform(alphanorm)
#transformed_alpha

df_transformed = pd.DataFrame(transformed_alpha)
df_transformed.head()

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-24-82979c2eac64> in <module>
----> 1 df_transformed = pd.DataFrame(transformed_alpha)
      2 df_transformed.head()

NameError: name 'transformed_alpha' is not defined

from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=42)
for train_index, test_index in split.split(alphanorm, alphanorm["phenotype"]):
    strat_train = alphanorm.loc[train_index]
    strat_test = alphanorm.loc[test_index]

strat_train.head(100)

train_X = strat_train.drop('phenotype', axis=1)
train_y = strat_train['phenotype']
test_x = strat_test.drop('phenotype', axis=1)
test_y = strat_test['phenotype']

one_hot1 = OneHotEncoder()
transformer = ColumnTransformer([('one_hot', one_hot1, categorical_vars)], 
                                remainder= 'passthrough')
trans_trainX = transformer.fit_transform(train_X)
trans_trainX_df = pd.DataFrame(trans_trainX)

one_hot2 = OneHotEncoder()
transformer = ColumnTransformer([('one_hot', one_hot2, categorical_vars)], 
                                remainder= 'passthrough')
trans_testX = transformer.fit_transform(test_x)
trans_testX_df = pd.DataFrame(trans_testX)

trans_testX_df.head()

trans_trainX_df.head()

train_y.unique

<bound method Series.unique of 181    0
171    0
95     1
41     1
54     1
      ..
93     1
20     1
85     1
77     1
143    1
Name: phenotype, Length: 162, dtype: int64>

using cross_val_score with stratified folds - https://stackoverflow.com/questions/59002684/forcing-sklearn-cross-val-score-to-use-stratified-k-fold

results

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-213-100f62972f2f> in <module>
----> 1 results

NameError: name 'results' is not defined

from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(verbose=True, oob_score=True, n_estimators=500, max_features='log2', n_jobs=-1, min_samples_leaf=4)
model.fit(trans_trainX_df, train_y)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    2.1s finished

RandomForestClassifier(max_features='log2', min_samples_leaf=4,
                       n_estimators=500, n_jobs=-1, oob_score=True,
                       verbose=True)

y_scores = model.predict(trans_trainX_df)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 500 out of 500 | elapsed:    0.1s finished

from sklearn.metrics import classification_report
print(classification_report(y_scores, train_y))

              precision    recall  f1-score   support

           0       0.57      1.00      0.72        25
           1       1.00      0.86      0.93       137

    accuracy                           0.88       162
   macro avg       0.78      0.93      0.83       162
weighted avg       0.93      0.88      0.89       162

y_scores_valid = model.predict(trans_testX_df)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 500 out of 500 | elapsed:    0.1s finished

print(classification_report(y_scores_valid,test_y))

              precision    recall  f1-score   support

           0       0.18      0.67      0.29         3
           1       0.97      0.76      0.85        38

    accuracy                           0.76        41
   macro avg       0.57      0.71      0.57        41
weighted avg       0.91      0.76      0.81        41

from sklearn.model_selection import GridSearchCV

param_grid = [
{'n_estimators': [100, 150, 200, 250, 300, 350, 400, 450, 500, 600], 'max_features': [0.5, 'sqrt', 'log2'], 'min_samples_leaf':[4], 
 'n_jobs':[-1]},
{'n_estimators': [100, 150, 200, 250, 300, 350, 400, 450, 500], 'max_features': [0.5, 'sqrt', 'log2'],  
 'n_jobs':[-1]}, 
   ]

rf = RandomForestClassifier()
grid_search = GridSearchCV(rf, param_grid, cv=5,
scoring='accuracy',
return_train_score=True)
grid_search.fit(trans_trainX_df, train_y)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid=[{'max_features': [0.5, 'sqrt', 'log2'],
                          'min_samples_leaf': [4],
                          'n_estimators': [100, 150, 200, 250, 300, 350, 400,
                                           450, 500, 600],
                          'n_jobs': [-1]},
                         {'max_features': [0.5, 'sqrt', 'log2'],
                          'n_estimators': [100, 150, 200, 250, 300, 350, 400,
                                           450, 500],
                          'n_jobs': [-1]}],
             return_train_score=True, scoring='accuracy')

grid_search.best_params_

{'max_features': 'log2',
 'min_samples_leaf': 4,
 'n_estimators': 200,
 'n_jobs': -1}

model = RandomForestClassifier(verbose=True, oob_score=True, n_estimators=200, max_features='log2', n_jobs=-1, min_samples_leaf=4)
model.fit(trans_trainX_df, train_y)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    0.7s finished

RandomForestClassifier(max_features='log2', min_samples_leaf=4,
                       n_estimators=200, n_jobs=-1, oob_score=True,
                       verbose=True)

from sklearn.metrics import f1_score

f1_score(model.predict(trans_testX_df), test_y)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 200 out of 200 | elapsed:    0.0s finished

0.8695652173913044

from sklearn.metrics import precision_score
precision_score(model.predict(trans_testX_df), test_y)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 200 out of 200 | elapsed:    0.0s finished

1.0

from sklearn.metrics import recall_score
recall_score(model.predict(trans_testX_df), test_y)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 200 out of 200 | elapsed:    0.0s finished

0.7692307692307693

y_scores = model.predict(trans_trainX_df)
print(classification_report(y_scores, train_y))

              precision    recall  f1-score   support

           0       0.61      1.00      0.76        27
           1       1.00      0.87      0.93       135

    accuracy                           0.90       162
   macro avg       0.81      0.94      0.85       162
weighted avg       0.94      0.90      0.90       162

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 200 out of 200 | elapsed:    0.0s finished

y_scores_valid = model.predict(trans_testX_df)
print(classification_report(y_scores_valid,test_y))

              precision    recall  f1-score   support

           0       0.18      1.00      0.31         2
           1       1.00      0.77      0.87        39

    accuracy                           0.78        41
   macro avg       0.59      0.88      0.59        41
weighted avg       0.96      0.78      0.84        41

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 200 out of 200 | elapsed:    0.0s finished

from sklearn.metrics import precision_recall_curve
y_scores = model.predict_proba(trans_testX_df)
#precision_recall_curve(test_y, y_scores)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 200 out of 200 | elapsed:    0.0s finished

model1.classes_

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-140-b81240a189d5> in <module>
----> 1 model1.classes_

NameError: name 'model1' is not defined

y_scores

array([[0.39250572, 0.60749428],
       [0.25320079, 0.74679921],
       [0.1713831 , 0.8286169 ],
       [0.17947367, 0.82052633],
       [0.489447  , 0.510553  ],
       [0.56838878, 0.43161122],
       [0.20219407, 0.79780593],
       [0.35415521, 0.64584479],
       [0.31536785, 0.68463215],
       [0.2188261 , 0.7811739 ],
       [0.3774125 , 0.6225875 ],
       [0.37452597, 0.62547403],
       [0.37466037, 0.62533963],
       [0.46889146, 0.53110854],
       [0.27090818, 0.72909182],
       [0.36832506, 0.63167494],
       [0.26114827, 0.73885173],
       [0.48415451, 0.51584549],
       [0.17540571, 0.82459429],
       [0.49270883, 0.50729117],
       [0.23196986, 0.76803014],
       [0.08120887, 0.91879113],
       [0.30504464, 0.69495536],
       [0.22847752, 0.77152248],
       [0.29418737, 0.70581263],
       [0.41770902, 0.58229098],
       [0.2760021 , 0.7239979 ],
       [0.40566932, 0.59433068],
       [0.45339157, 0.54660843],
       [0.3564696 , 0.6435304 ],
       [0.4670073 , 0.5329927 ],
       [0.34411017, 0.65588983],
       [0.16982323, 0.83017677],
       [0.19704983, 0.80295017],
       [0.35555874, 0.64444126],
       [0.37553992, 0.62446008],
       [0.38274799, 0.61725201],
       [0.38877291, 0.61122709],
       [0.50700195, 0.49299805],
       [0.27680689, 0.72319311],
       [0.45167561, 0.54832439]])

y_scores = y_scores[:, 1]

precisions, recalls, thresholds = precision_recall_curve(test_y, y_scores)

def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
    plt.legend(loc="upper left")
 # highlight the threshold and add the legend, axis label, and grid

plot_precision_recall_vs_threshold(precisions, recalls, thresholds)

from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(test_y, y_scores)

def plot_roc_curve(fpr, tpr, label=None, figure=(12,10)):
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--') # Dashed diagonal # Add axis labels and grid
    #plt.figure(figsize=figure)
    plt.xlabel('True Positive Rate(Recall)')
    plt.ylabel("False Positive Rate(1-specificity)")
    plt.legend()
plot_roc_curve(fpr, tpr)

No handles with labels found to put in legend.

from sklearn.metrics import roc_auc_score
roc_auc_score(test_y, y_scores)

0.7303030303030303

import joblib
joblib.dump(model1, "thal_rf200.pkl")
# and later...
#my_model_loaded = joblib.load("my_model.pkl")

['thal_rf200.pkl']

from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
confused = confusion_matrix(model.predict(trans_testX_df), test_y)
confused

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 200 out of 200 | elapsed:    0.0s finished

array([[ 2,  0],
       [ 9, 30]], dtype=int64)

fig, ax = plt.subplots(figsize=(10, 10))
plot_confusion_matrix (model, trans_testX_df, test_y, cmap=plt.cm.Blues, display_labels=['normals', 'alpha carriers'], ax=ax)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 200 out of 200 | elapsed:    0.0s finished

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x20d3a29a308>

from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 2)
X_train_res, y_train_res = sm.fit_sample(trans_trainX_df, train_y)

y_train_res.value_counts()

0    118
1    118
Name: phenotype, dtype: int64

model = RandomForestClassifier(verbose=True, oob_score=True, n_estimators=400, max_features=0.5, n_jobs=-1, min_samples_leaf=4)
model.fit(X_train_res, y_train_res)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    1.3s finished

RandomForestClassifier(max_features=0.5, min_samples_leaf=4, n_estimators=400,
                       n_jobs=-1, oob_score=True, verbose=True)

print(classification_report(model.predict(X_train_res), y_train_res))

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.0s

              precision    recall  f1-score   support

           0       0.97      0.97      0.97       117
           1       0.97      0.97      0.97       119

    accuracy                           0.97       236
   macro avg       0.97      0.97      0.97       236
weighted avg       0.97      0.97      0.97       236

[Parallel(n_jobs=12)]: Done 400 out of 400 | elapsed:    0.2s finished

print(classification_report(model.predict(trans_testX_df), test_y))

              precision    recall  f1-score   support

           0       0.73      0.47      0.57        17
           1       0.70      0.88      0.78        24

    accuracy                           0.71        41
   macro avg       0.71      0.67      0.67        41
weighted avg       0.71      0.71      0.69        41

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 400 out of 400 | elapsed:    0.0s finished

fig, ax = plt.subplots(figsize=(10, 10))
plot_confusion_matrix (model, trans_testX_df, test_y, cmap=plt.cm.Blues, display_labels=['normals', 'alpha carriers'], ax=ax)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 400 out of 400 | elapsed:    0.0s finished

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x20d3bbe2388>

from xgboost import XGBClassifier as xgb

xgbmodel = xgb()
xgbmodel.fit(X_train_res, y_train_res)

XGBClassifier()

print(classification_report(xgbmodel.predict(X_train_res), y_train_res))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00       117
           1       1.00      0.99      1.00       119

    accuracy                           1.00       236
   macro avg       1.00      1.00      1.00       236
weighted avg       1.00      1.00      1.00       236

print(classification_report(xgbmodel.predict(trans_testX_df), test_y))

              precision    recall  f1-score   support

           0       0.64      0.54      0.58        13
           1       0.80      0.86      0.83        28

    accuracy                           0.76        41
   macro avg       0.72      0.70      0.71        41
weighted avg       0.75      0.76      0.75        41

param_grid = [
{'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2, 0.25], 'max_features': [0.5, 'sqrt', 'log2'], 'min_samples_leaf':[4], 
 'n_jobs':[-1]},
{'n_estimators': [100, 150, 200, 250, 300, 350, 400, 450, 500], 'max_features': [0.5, 'sqrt', 'log2'],  
 'n_jobs':[-1]}, 
   ]

rf = RandomForestClassifier()
grid_search = GridSearchCV(rf, param_grid, cv=5,
scoring='accuracy',
return_train_score=True)
grid_search.fit(trans_trainX_df, train_y)

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

# define model
model2 = RandomForestClassifier()
# evaluate pipeline
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model2, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
print('Mean ROC AUC: %.3f' % mean(scores))

	no	hb	pcv	rbc	mcv	mch	mchc	rdw	wbc	neut	lymph	plt	hba	hba2	hbf
count	243.000000	243.000000	243.000000	242.000000	243.000000	241.000000	243.000000	243.000000	243.000000	243.000000	243.000000	243.000000	243.000000	243.000000	243.000000
mean	143.514403	11.799712	36.109301	5.086123	72.251140	23.452697	32.209671	15.208232	101.878340	46.986866	42.840218	329.805780	86.098491	2.751195	0.664832
std	83.962622	1.864599	4.979802	0.651812	9.785879	3.982755	2.123092	2.498933	1343.898711	12.194046	12.057044	122.601262	4.736257	0.712732	0.703808
min	0.000000	6.000000	19.870000	2.410000	47.700000	11.100000	19.400000	10.800000	2.300000	6.200000	10.300000	8.000000	25.000000	0.300000	0.000000
25%	72.000000	10.650000	33.110000	4.690000	65.400000	20.700000	31.200000	13.500000	7.375000	43.989189	36.135000	258.000000	84.864586	2.500000	0.300000
50%	143.000000	11.700000	35.876404	5.080000	70.500000	23.000000	32.300000	14.827660	8.900000	47.565000	41.537931	309.000000	86.523291	2.600000	0.537931
75%	217.500000	13.000000	38.900000	5.500000	80.450000	26.400000	33.400000	16.300000	10.450000	54.600000	45.527027	386.000000	87.331429	2.750000	0.769231
max	287.000000	16.700000	51.100000	7.470000	91.700000	35.600000	40.800000	28.800000	20900.000000	77.500000	87.000000	1107.000000	98.400000	5.800000	5.800000

	Unnamed: 0	hb	pcv	rbc	mcv	mch	mchc	rdw	wbc	neut	lymph	plt	hba	hba2	hbf
count	243.000000	243.000000	243.000000	242.000000	243.000000	242.000000	243.000000	243.000000	243.000000	243.000000	243.000000	243.000000	243.000000	243.000000	243.000000
mean	121.000000	11.799712	36.109301	5.086123	72.251140	23.446438	32.209671	15.208232	101.878340	46.986866	42.840218	329.805780	86.098491	2.751195	0.664832
std	70.292247	1.864599	4.979802	0.651812	9.785879	3.975676	2.123092	2.498933	1343.898711	12.194046	12.057044	122.601262	4.736257	0.712732	0.703808
min	0.000000	6.000000	19.870000	2.410000	47.700000	11.100000	19.400000	10.800000	2.300000	6.200000	10.300000	8.000000	25.000000	0.300000	0.000000
25%	60.500000	10.650000	33.110000	4.690000	65.400000	20.725000	31.200000	13.500000	7.375000	43.989189	36.135000	258.000000	84.864586	2.500000	0.300000
50%	121.000000	11.700000	35.876404	5.080000	70.500000	23.000000	32.300000	14.827660	8.900000	47.565000	41.537931	309.000000	86.523291	2.600000	0.537931
75%	181.500000	13.000000	38.900000	5.500000	80.450000	26.400000	33.400000	16.300000	10.450000	54.600000	45.527027	386.000000	87.331429	2.750000	0.769231
max	242.000000	16.700000	51.100000	7.470000	91.700000	35.600000	40.800000	28.800000	20900.000000	77.500000	87.000000	1107.000000	98.400000	5.800000	5.800000

	hb	pcv	rbc	mcv	mch	mchc	rdw	wbc	neut	lymph	plt	hba	hba2	hbf
count	243.000000	243.000000	242.000000	243.000000	241.000000	243.000000	243.000000	243.000000	243.000000	243.000000	243.000000	243.000000	243.000000	243.000000
mean	11.799712	36.109301	5.086123	72.251140	23.452697	32.209671	15.208232	101.878340	46.986866	42.840218	329.805780	86.098491	2.751195	0.664832
std	1.864599	4.979802	0.651812	9.785879	3.982755	2.123092	2.498933	1343.898711	12.194046	12.057044	122.601262	4.736257	0.712732	0.703808
min	6.000000	19.870000	2.410000	47.700000	11.100000	19.400000	10.800000	2.300000	6.200000	10.300000	8.000000	25.000000	0.300000	0.000000
25%	10.650000	33.110000	4.690000	65.400000	20.700000	31.200000	13.500000	7.375000	43.989189	36.135000	258.000000	84.864586	2.500000	0.300000
50%	11.700000	35.876404	5.080000	70.500000	23.000000	32.300000	14.827660	8.900000	47.565000	41.537931	309.000000	86.523291	2.600000	0.537931
75%	13.000000	38.900000	5.500000	80.450000	26.400000	33.400000	16.300000	10.450000	54.600000	45.527027	386.000000	87.331429	2.750000	0.769231
max	16.700000	51.100000	7.470000	91.700000	35.600000	40.800000	28.800000	20900.000000	77.500000	87.000000	1107.000000	98.400000	5.800000	5.800000

	sex	hb	pcv	rbc	mcv	mch	mchc	rdw	wbc	neut	lymph	plt	hba	hba2	hbf	phenotype
0	female	10.8	35.2	5.12	68.7	21.2	30.8	13.4	9.6	53.000000	33.000000	309.0	88.500000	2.600000	0.110000	alpha trait
1	male	10.8	26.6	4.28	62.1	25.3	40.8	19.8	10.3	49.400000	43.100000	687.0	87.800000	2.400000	0.900000	alpha trait
2	female	10.8	35.2	5.12	68.7	21.2	30.8	13.4	9.6	53.000000	33.000000	309.0	88.500000	2.600000	0.100000	silent carrier
3	male	14.5	43.5	5.17	84.0	28.0	33.4	12.1	11.9	31.000000	50.000000	334.0	86.800000	2.800000	0.300000	silent carrier
4	male	11.5	34.4	5.02	68.7	22.9	33.4	15.7	20.4	67.000000	30.000000	596.0	86.300000	2.400000	1.300000	silent carrier
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
238	male	15.5	45.9	5.19	88.4	29.9	33.8	12.6	8.8	47.565000	40.975000	177.0	88.600000	3.200000	0.400000	normal
239	female	10.4	33.3	4.93	67.6	21.1	31.2	14.8	8.9	44.478378	45.527027	295.0	88.000000	2.400000	0.500000	silent carrier
240	male	9.8	29.8	4.75	62.7	19.0	30.4	14.7	7.2	48.234483	41.537931	262.0	85.100000	2.400000	1.100000	alpha trait
241	male	11.2	37.2	5.43	68.5	20.6	30.1	15.1	12.0	13.500000	76.800000	277.0	86.523291	2.588608	0.769231	silent carrier
242	male	14.4	44.5	5.70	78.0	25.3	31.2	15.0	7.2	36.000000	59.000000	224.0	86.523291	2.588608	0.769231	silent carrier

	hb	pcv	rbc	mcv	mch	mchc	rdw	wbc	neut	lymph	plt	hba	hba2	hbf
count	203.000000	203.000000	202.000000	203.000000	201.000000	203.000000	203.000000	203.000000	203.00000	203.000000	203.000000	203.000000	203.000000	203.000000
mean	12.111823	36.676757	5.057432	74.167128	24.200498	32.497322	14.848380	9.188423	46.08398	43.516162	328.265663	86.515666	2.579554	0.664367
std	1.757800	4.821295	0.585929	9.280344	3.805844	1.979277	2.381027	2.591782	11.79773	11.992417	114.284337	2.436432	0.312889	0.739903
min	7.600000	22.100000	2.410000	47.700000	11.100000	21.100000	10.800000	2.300000	6.20000	10.300000	100.000000	68.000000	0.300000	0.000000
25%	10.900000	33.300000	4.700000	66.950000	21.200000	31.550000	13.300000	7.500000	43.00000	37.000000	256.000000	85.200000	2.500000	0.300000
50%	12.000000	36.000000	5.025000	73.800000	24.100000	32.500000	14.700000	8.915278	47.56500	41.537931	310.000000	86.523291	2.600000	0.537931
75%	13.350000	39.150000	5.437500	81.900000	26.800000	33.446296	15.950000	10.550000	52.15000	46.000000	379.500000	87.365714	2.700000	0.769231
max	16.700000	51.100000	6.770000	91.700000	35.600000	40.800000	28.800000	20.400000	77.50000	87.000000	1107.000000	98.400000	3.300000	5.800000

	sex	hb	pcv	rbc	mcv	mch	mchc	rdw	plt	hba	hba2	phenotype
181	female	11.3	35.900000	4.71	76.0	24.0	31.5	14.0	268.00000	86.523291	2.588608	0
171	female	11.6	36.000000	4.69	77.0	25.0	32.2	13.3	249.00000	86.600000	2.500000	0
95	female	13.8	35.876404	5.81	74.0	23.7	31.6	11.8	312.00000	87.100000	2.600000	1
41	male	11.8	36.000000	5.27	68.3	22.4	32.5	14.8	311.00000	86.600000	2.700000	1
54	male	13.4	42.600000	5.13	83.0	26.2	31.6	12.0	235.00000	84.864586	2.679310	1
...	...	...	...	...	...	...	...	...	...	...	...	...
33	female	9.1	29.200000	4.38	61.9	19.3	31.1	16.1	410.00000	81.951429	2.200000	1
152	female	11.0	33.000000	5.29	62.4	20.8	33.3	16.0	335.00000	86.400000	3.000000	0
0	female	10.8	35.200000	5.12	68.7	21.2	30.8	13.4	309.00000	88.500000	2.600000	1
157	male	15.2	35.876404	6.09	72.6	25.0	34.4	12.2	332.27957	87.300000	2.600000	0
98	female	8.8	26.500000	3.81	69.6	23.0	33.0	14.7	180.00000	86.523291	2.588608	1

	0	1	2	3	4	5	6	7	8	9	10	11
0	0.0	1.0	14.7	45.0	5.50	81.0	26.7	32.6	11.7	228.0	84.864586	2.67931
1	0.0	1.0	12.1	41.5	5.50	75.5	23.8	31.6	14.1	266.0	84.864586	2.67931
2	0.0	1.0	13.5	40.0	4.60	86.9	29.4	33.8	10.8	180.0	84.864586	2.67931
3	0.0	1.0	14.3	43.6	4.92	88.7	29.0	32.7	13.5	265.0	84.864586	2.67931
4	1.0	0.0	10.2	34.4	5.25	65.5	19.5	29.7	15.2	562.0	86.000000	2.90000