import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
%matplotlib inline
import seaborn as sns
thal = pd.read_csv('thalcleaned_outliers.csv')
thal2 = pd.read_csv('thalcleaned_outliers2.csv')
thal.describe()
no hb pcv rbc mcv mch mchc rdw wbc neut lymph plt hba hba2 hbf
count 243.000000 243.000000 243.000000 242.000000 243.000000 241.000000 243.000000 243.000000 243.000000 243.000000 243.000000 243.000000 243.000000 243.000000 243.000000
mean 143.514403 11.799712 36.109301 5.086123 72.251140 23.452697 32.209671 15.208232 101.878340 46.986866 42.840218 329.805780 86.098491 2.751195 0.664832
std 83.962622 1.864599 4.979802 0.651812 9.785879 3.982755 2.123092 2.498933 1343.898711 12.194046 12.057044 122.601262 4.736257 0.712732 0.703808
min 0.000000 6.000000 19.870000 2.410000 47.700000 11.100000 19.400000 10.800000 2.300000 6.200000 10.300000 8.000000 25.000000 0.300000 0.000000
25% 72.000000 10.650000 33.110000 4.690000 65.400000 20.700000 31.200000 13.500000 7.375000 43.989189 36.135000 258.000000 84.864586 2.500000 0.300000
50% 143.000000 11.700000 35.876404 5.080000 70.500000 23.000000 32.300000 14.827660 8.900000 47.565000 41.537931 309.000000 86.523291 2.600000 0.537931
75% 217.500000 13.000000 38.900000 5.500000 80.450000 26.400000 33.400000 16.300000 10.450000 54.600000 45.527027 386.000000 87.331429 2.750000 0.769231
max 287.000000 16.700000 51.100000 7.470000 91.700000 35.600000 40.800000 28.800000 20900.000000 77.500000 87.000000 1107.000000 98.400000 5.800000 5.800000
thal2.describe()
Unnamed: 0 hb pcv rbc mcv mch mchc rdw wbc neut lymph plt hba hba2 hbf
count 243.000000 243.000000 243.000000 242.000000 243.000000 242.000000 243.000000 243.000000 243.000000 243.000000 243.000000 243.000000 243.000000 243.000000 243.000000
mean 121.000000 11.799712 36.109301 5.086123 72.251140 23.446438 32.209671 15.208232 101.878340 46.986866 42.840218 329.805780 86.098491 2.751195 0.664832
std 70.292247 1.864599 4.979802 0.651812 9.785879 3.975676 2.123092 2.498933 1343.898711 12.194046 12.057044 122.601262 4.736257 0.712732 0.703808
min 0.000000 6.000000 19.870000 2.410000 47.700000 11.100000 19.400000 10.800000 2.300000 6.200000 10.300000 8.000000 25.000000 0.300000 0.000000
25% 60.500000 10.650000 33.110000 4.690000 65.400000 20.725000 31.200000 13.500000 7.375000 43.989189 36.135000 258.000000 84.864586 2.500000 0.300000
50% 121.000000 11.700000 35.876404 5.080000 70.500000 23.000000 32.300000 14.827660 8.900000 47.565000 41.537931 309.000000 86.523291 2.600000 0.537931
75% 181.500000 13.000000 38.900000 5.500000 80.450000 26.400000 33.400000 16.300000 10.450000 54.600000 45.527027 386.000000 87.331429 2.750000 0.769231
max 242.000000 16.700000 51.100000 7.470000 91.700000 35.600000 40.800000 28.800000 20900.000000 77.500000 87.000000 1107.000000 98.400000 5.800000 5.800000
thal.drop(columns='no', inplace=True)
thal.describe()
hb pcv rbc mcv mch mchc rdw wbc neut lymph plt hba hba2 hbf
count 243.000000 243.000000 242.000000 243.000000 241.000000 243.000000 243.000000 243.000000 243.000000 243.000000 243.000000 243.000000 243.000000 243.000000
mean 11.799712 36.109301 5.086123 72.251140 23.452697 32.209671 15.208232 101.878340 46.986866 42.840218 329.805780 86.098491 2.751195 0.664832
std 1.864599 4.979802 0.651812 9.785879 3.982755 2.123092 2.498933 1343.898711 12.194046 12.057044 122.601262 4.736257 0.712732 0.703808
min 6.000000 19.870000 2.410000 47.700000 11.100000 19.400000 10.800000 2.300000 6.200000 10.300000 8.000000 25.000000 0.300000 0.000000
25% 10.650000 33.110000 4.690000 65.400000 20.700000 31.200000 13.500000 7.375000 43.989189 36.135000 258.000000 84.864586 2.500000 0.300000
50% 11.700000 35.876404 5.080000 70.500000 23.000000 32.300000 14.827660 8.900000 47.565000 41.537931 309.000000 86.523291 2.600000 0.537931
75% 13.000000 38.900000 5.500000 80.450000 26.400000 33.400000 16.300000 10.450000 54.600000 45.527027 386.000000 87.331429 2.750000 0.769231
max 16.700000 51.100000 7.470000 91.700000 35.600000 40.800000 28.800000 20900.000000 77.500000 87.000000 1107.000000 98.400000 5.800000 5.800000
thal.drop?
thal.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243 entries, 0 to 242
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   sex        243 non-null    object 
 1   hb         243 non-null    float64
 2   pcv        243 non-null    float64
 3   rbc        242 non-null    float64
 4   mcv        243 non-null    float64
 5   mch        241 non-null    float64
 6   mchc       243 non-null    float64
 7   rdw        243 non-null    float64
 8   wbc        243 non-null    float64
 9   neut       243 non-null    float64
 10  lymph      243 non-null    float64
 11  plt        243 non-null    float64
 12  hba        243 non-null    float64
 13  hba2       243 non-null    float64
 14  hbf        243 non-null    float64
 15  phenotype  243 non-null    object 
dtypes: float64(14), object(2)
memory usage: 30.5+ KB
thal.astype({'phenotype' : 'category', 'sex' : 'category'})
sex hb pcv rbc mcv mch mchc rdw wbc neut lymph plt hba hba2 hbf phenotype
0 female 10.8 35.2 5.12 68.7 21.2 30.8 13.4 9.6 53.000000 33.000000 309.0 88.500000 2.600000 0.110000 alpha trait
1 male 10.8 26.6 4.28 62.1 25.3 40.8 19.8 10.3 49.400000 43.100000 687.0 87.800000 2.400000 0.900000 alpha trait
2 female 10.8 35.2 5.12 68.7 21.2 30.8 13.4 9.6 53.000000 33.000000 309.0 88.500000 2.600000 0.100000 silent carrier
3 male 14.5 43.5 5.17 84.0 28.0 33.4 12.1 11.9 31.000000 50.000000 334.0 86.800000 2.800000 0.300000 silent carrier
4 male 11.5 34.4 5.02 68.7 22.9 33.4 15.7 20.4 67.000000 30.000000 596.0 86.300000 2.400000 1.300000 silent carrier
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
238 male 15.5 45.9 5.19 88.4 29.9 33.8 12.6 8.8 47.565000 40.975000 177.0 88.600000 3.200000 0.400000 normal
239 female 10.4 33.3 4.93 67.6 21.1 31.2 14.8 8.9 44.478378 45.527027 295.0 88.000000 2.400000 0.500000 silent carrier
240 male 9.8 29.8 4.75 62.7 19.0 30.4 14.7 7.2 48.234483 41.537931 262.0 85.100000 2.400000 1.100000 alpha trait
241 male 11.2 37.2 5.43 68.5 20.6 30.1 15.1 12.0 13.500000 76.800000 277.0 86.523291 2.588608 0.769231 silent carrier
242 male 14.4 44.5 5.70 78.0 25.3 31.2 15.0 7.2 36.000000 59.000000 224.0 86.523291 2.588608 0.769231 silent carrier

243 rows × 16 columns

thal['rbc'] = thal['rbc'].fillna(thal.groupby('phenotype')['rbc'].transform('mean'))
thal['mch'] = thal['mch'].fillna(thal.groupby('phenotype')['mch'].transform('mean'))
thal.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243 entries, 0 to 242
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   sex        243 non-null    object 
 1   hb         243 non-null    float64
 2   pcv        243 non-null    float64
 3   rbc        243 non-null    float64
 4   mcv        243 non-null    float64
 5   mch        243 non-null    float64
 6   mchc       243 non-null    float64
 7   rdw        243 non-null    float64
 8   wbc        243 non-null    float64
 9   neut       243 non-null    float64
 10  lymph      243 non-null    float64
 11  plt        243 non-null    float64
 12  hba        243 non-null    float64
 13  hba2       243 non-null    float64
 14  hbf        243 non-null    float64
 15  phenotype  243 non-null    object 
dtypes: float64(14), object(2)
memory usage: 30.5+ KB
thal = thal.astype({"phenotype" : "category", "sex" : "category"})
thal.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243 entries, 0 to 242
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   sex        243 non-null    category
 1   hb         243 non-null    float64 
 2   pcv        243 non-null    float64 
 3   rbc        243 non-null    float64 
 4   mcv        243 non-null    float64 
 5   mch        243 non-null    float64 
 6   mchc       243 non-null    float64 
 7   rdw        243 non-null    float64 
 8   wbc        243 non-null    float64 
 9   neut       243 non-null    float64 
 10  lymph      243 non-null    float64 
 11  plt        243 non-null    float64 
 12  hba        243 non-null    float64 
 13  hba2       243 non-null    float64 
 14  hbf        243 non-null    float64 
 15  phenotype  243 non-null    category
dtypes: category(2), float64(14)
memory usage: 27.5 KB
  • 243 out 288 was obtained - after removing elements where there was no diagnosis.
thal.sex
0      female
1        male
2      female
3        male
4        male
        ...  
238      male
239    female
240      male
241      male
242      male
Name: sex, Length: 243, dtype: category
Categories (2, object): ['female', 'male']
thal.loc['sex' == 'male'] #this is WRONG
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-15-f64bf44e4e9a> in <module>
      1 #calculating the male percentage
----> 2 thal.loc['sex' == 'male'] #this is WRONG

~\Anaconda3\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
    893 
    894             maybe_callable = com.apply_if_callable(key, self.obj)
--> 895             return self._getitem_axis(maybe_callable, axis=axis)
    896 
    897     def _is_scalar_access(self, key: Tuple):

~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_axis(self, key, axis)
   1122         # fall thru to straight lookup
   1123         self._validate_key(key, axis)
-> 1124         return self._get_label(key, axis=axis)
   1125 
   1126     def _get_slice_axis(self, slice_obj: slice, axis: int):

~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _get_label(self, label, axis)
   1071     def _get_label(self, label, axis: int):
   1072         # GH#5667 this will fail if the label is not present in the axis.
-> 1073         return self.obj.xs(label, axis=axis)
   1074 
   1075     def _handle_lowerdim_multi_index_axis0(self, tup: Tuple):

~\Anaconda3\lib\site-packages\pandas\core\generic.py in xs(self, key, axis, level, drop_level)
   3737                 raise TypeError(f"Expected label or tuple of labels, got {key}") from e
   3738         else:
-> 3739             loc = index.get_loc(key)
   3740 
   3741             if isinstance(loc, np.ndarray):

~\Anaconda3\lib\site-packages\pandas\core\indexes\range.py in get_loc(self, key, method, tolerance)
    352                 except ValueError as err:
    353                     raise KeyError(key) from err
--> 354             raise KeyError(key)
    355         return super().get_loc(key, method=method, tolerance=tolerance)
    356 

KeyError: False
female_pct = (thal['sex'] == 'female').mean()*100
female_pct
45.67901234567901
thal.age.mean()
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-17-9ca328a0bd78> in <module>
----> 1 thal.age.mean()

~\Anaconda3\lib\site-packages\pandas\core\generic.py in __getattr__(self, name)
   5463             if self._info_axis._can_hold_identifiers_and_holds_name(name):
   5464                 return self[name]
-> 5465             return object.__getattribute__(self, name)
   5466 
   5467     def __setattr__(self, name: str, value) -> None:

AttributeError: 'DataFrame' object has no attribute 'age'
d_list = list(thal.phenotype.unique())
d_list
['alpha trait', 'silent carrier', 'normal', 'iron deficiency', 'beta trait']
thal.phenotype.value_counts()
silent carrier     105
normal              58
alpha trait         42
iron deficiency     21
beta trait          17
Name: phenotype, dtype: int64
plt.style.use('ggplot')
plt.rcParams['font.size'] = '18'
plt.pie(thal.phenotype.value_counts(),counterclock=False, startangle=90, autopct = '%.2f%%')
#plt.figure(figsize=(16, 10))from pylab import rcParams
from pylab import rcParams
rcParams['figure.figsize'] = 16,10
plt.legend(labels=['Alpha trait', 'Silent carrier', 'Normal', 'Iron deficiency', 'Beta trait'])
<matplotlib.legend.Legend at 0x28169e47488>
plt.style.use('ggplot')
plt.rcParams['font.size'] = '18'
plt.pie(alphanorm.phenotype.value_counts(),counterclock=False, startangle=90, autopct = '%.2f%%')
#plt.figure(figsize=(16, 10))from pylab import rcParams
from pylab import rcParams
rcParams['figure.figsize'] = 16,10
plt.legend(labels=['Alpha-thalassemia carriers', 'Normal'])
<matplotlib.legend.Legend at 0x2816a097148>
female_pct = (alphanorm['sex'] == 'female').mean()*100
female_pct
44.827586206896555

Final dataset, after removing samples without a specific diagnosis, consisted of 243 samples. Basic demographics are depicted in the Figure 1. Males represented just over half of the dataset, and the mean age was......

Model 01 Analysis - AlphaThal vs Normal

 
alphanorm = pd.read_csv('alphanorm.csv', index_col = False)
alphanorm.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203 entries, 0 to 202
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   sex        203 non-null    object 
 1   hb         203 non-null    float64
 2   pcv        203 non-null    float64
 3   rbc        202 non-null    float64
 4   mcv        203 non-null    float64
 5   mch        201 non-null    float64
 6   mchc       203 non-null    float64
 7   rdw        203 non-null    float64
 8   wbc        203 non-null    float64
 9   neut       203 non-null    float64
 10  lymph      203 non-null    float64
 11  plt        203 non-null    float64
 12  hba        203 non-null    float64
 13  hba2       203 non-null    float64
 14  hbf        203 non-null    float64
 15  phenotype  203 non-null    object 
dtypes: float64(14), object(2)
memory usage: 25.5+ KB
alphanorm.describe()
hb pcv rbc mcv mch mchc rdw wbc neut lymph plt hba hba2 hbf
count 203.000000 203.000000 202.000000 203.000000 201.000000 203.000000 203.000000 203.000000 203.00000 203.000000 203.000000 203.000000 203.000000 203.000000
mean 12.111823 36.676757 5.057432 74.167128 24.200498 32.497322 14.848380 9.188423 46.08398 43.516162 328.265663 86.515666 2.579554 0.664367
std 1.757800 4.821295 0.585929 9.280344 3.805844 1.979277 2.381027 2.591782 11.79773 11.992417 114.284337 2.436432 0.312889 0.739903
min 7.600000 22.100000 2.410000 47.700000 11.100000 21.100000 10.800000 2.300000 6.20000 10.300000 100.000000 68.000000 0.300000 0.000000
25% 10.900000 33.300000 4.700000 66.950000 21.200000 31.550000 13.300000 7.500000 43.00000 37.000000 256.000000 85.200000 2.500000 0.300000
50% 12.000000 36.000000 5.025000 73.800000 24.100000 32.500000 14.700000 8.915278 47.56500 41.537931 310.000000 86.523291 2.600000 0.537931
75% 13.350000 39.150000 5.437500 81.900000 26.800000 33.446296 15.950000 10.550000 52.15000 46.000000 379.500000 87.365714 2.700000 0.769231
max 16.700000 51.100000 6.770000 91.700000 35.600000 40.800000 28.800000 20.400000 77.50000 87.000000 1107.000000 98.400000 3.300000 5.800000
alphanorm.head()
sex hb pcv rbc mcv mch mchc rdw wbc neut lymph plt hba hba2 hbf phenotype
0 female 10.8 35.2 5.12 68.7 21.2 30.8 13.4 9.6 53.0 33.0 309.0 88.5 2.6 0.11 alpha carrier
1 male 10.8 26.6 4.28 62.1 25.3 40.8 19.8 10.3 49.4 43.1 687.0 87.8 2.4 0.90 alpha carrier
2 female 10.8 35.2 5.12 68.7 21.2 30.8 13.4 9.6 53.0 33.0 309.0 88.5 2.6 0.10 alpha carrier
3 male 14.5 43.5 5.17 84.0 28.0 33.4 12.1 11.9 31.0 50.0 334.0 86.8 2.8 0.30 alpha carrier
4 male 11.5 34.4 5.02 68.7 22.9 33.4 15.7 20.4 67.0 30.0 596.0 86.3 2.4 1.30 alpha carrier
  • only 203 when considering alpha carries and normals
alphanorm['rbc'] = alphanorm['rbc'].fillna(alphanorm.groupby('phenotype')['rbc'].transform('mean'))
alphanorm['mch'] = alphanorm['mch'].fillna(alphanorm.groupby('phenotype')['mch'].transform('mean'))
alphanorm.phenotype.value_counts()
alpha carrier    148
normal            55
Name: phenotype, dtype: int64
alphanorm.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203 entries, 0 to 202
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   sex        203 non-null    object 
 1   hb         203 non-null    float64
 2   pcv        203 non-null    float64
 3   rbc        203 non-null    float64
 4   mcv        203 non-null    float64
 5   mch        203 non-null    float64
 6   mchc       203 non-null    float64
 7   rdw        203 non-null    float64
 8   wbc        203 non-null    float64
 9   neut       203 non-null    float64
 10  lymph      203 non-null    float64
 11  plt        203 non-null    float64
 12  hba        203 non-null    float64
 13  hba2       203 non-null    float64
 14  hbf        203 non-null    float64
 15  phenotype  203 non-null    object 
dtypes: float64(14), object(2)
memory usage: 25.5+ KB
#will include these variables as well
alphanorm = alphanorm.drop(['hbf', 'wbc', 'neut', 'lymph'], axis=1)
alphanorm.dtypes
sex           object
hb           float64
pcv          float64
rbc          float64
mcv          float64
mch          float64
mchc         float64
rdw          float64
plt          float64
hba          float64
hba2         float64
phenotype     object
dtype: object
alphanorm.describe()
hb pcv rbc mcv mch mchc rdw plt hba hba2
count 203.000000 203.000000 203.000000 203.000000 203.000000 203.000000 203.000000 203.000000 203.000000 203.000000
mean 12.111823 36.676757 5.057293 74.167128 24.202443 32.497322 14.848380 328.265663 86.515666 2.579554
std 1.757800 4.821295 0.584480 9.280344 3.787007 1.979277 2.381027 114.284337 2.436432 0.312889
min 7.600000 22.100000 2.410000 47.700000 11.100000 21.100000 10.800000 100.000000 68.000000 0.300000
25% 10.900000 33.300000 4.700000 66.950000 21.250000 31.550000 13.300000 256.000000 85.200000 2.500000
50% 12.000000 36.000000 5.029125 73.800000 24.200000 32.500000 14.700000 310.000000 86.523291 2.600000
75% 13.350000 39.150000 5.435000 81.900000 26.800000 33.446296 15.950000 379.500000 87.365714 2.700000
max 16.700000 51.100000 6.770000 91.700000 35.600000 40.800000 28.800000 1107.000000 98.400000 3.300000
for col in alphanorm.columns:
    if alphanorm[col].dtype != object:
        Q1 = alphanorm[col].quantile(0.25)
        Q3 = alphanorm[col].quantile(0.75)
        IQR = Q3 - Q1
        S = 1.5*IQR
        LB = Q1 - S
        UB = Q3 + S
        print(UB)
        alphanorm.loc[alphanorm[col] > UB,col] = UB
        alphanorm.loc[alphanorm[col] < LB,col] = LB
17.025000000000002
47.92500000000002
6.537500000000001
104.32500000000003
35.125
36.29074075
19.924999999999997
564.75
90.61428571249999
3.0000000000000004
alphanorm.describe()
hb pcv rbc mcv mch mchc rdw plt hba hba2
count 203.000000 203.000000 203.000000 203.000000 203.000000 203.000000 203.000000 203.000000 203.000000 203.000000
mean 12.111823 36.668013 5.061197 74.167128 24.209093 32.541985 14.739390 323.875269 86.457172 2.600737
std 1.757800 4.730102 0.557916 9.280344 3.750817 1.616079 1.971135 96.533639 1.523440 0.197139
min 7.600000 24.525000 3.597500 47.700000 12.925000 28.705556 10.800000 100.000000 81.951429 2.200000
25% 10.900000 33.300000 4.700000 66.950000 21.250000 31.550000 13.300000 256.000000 85.200000 2.500000
50% 12.000000 36.000000 5.029125 73.800000 24.200000 32.500000 14.700000 310.000000 86.523291 2.600000
75% 13.350000 39.150000 5.435000 81.900000 26.800000 33.446296 15.950000 379.500000 87.365714 2.700000
max 16.700000 47.925000 6.537500 91.700000 35.125000 36.290741 19.925000 564.750000 90.614286 3.000000
alphanorm.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203 entries, 0 to 202
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   sex        203 non-null    object 
 1   hb         203 non-null    float64
 2   pcv        203 non-null    float64
 3   rbc        203 non-null    float64
 4   mcv        203 non-null    float64
 5   mch        203 non-null    float64
 6   mchc       203 non-null    float64
 7   rdw        203 non-null    float64
 8   plt        203 non-null    float64
 9   hba        203 non-null    float64
 10  hba2       203 non-null    float64
 11  phenotype  203 non-null    object 
dtypes: float64(10), object(2)
memory usage: 19.2+ KB
alphanorm = alphanorm.astype({'sex' : 'category', 'phenotype' : 'category'})
alphanorm.phenotype.value_counts()
alpha carrier    148
normal            55
Name: phenotype, dtype: int64
 
alphanorm['phenotype'] = alphanorm['phenotype'] == 'alpha carrier'
alphanorm['phenotype'] = alphanorm['phenotype'].replace({True:1, False:0})
alphanorm.head(200)
sex hb pcv rbc mcv mch mchc rdw plt hba hba2 phenotype
0 female 10.8 35.2 5.12 68.7 21.2 30.800000 13.4 309.00 88.5 2.6 1
1 male 10.8 26.6 4.28 62.1 25.3 36.290741 19.8 564.75 87.8 2.4 1
2 female 10.8 35.2 5.12 68.7 21.2 30.800000 13.4 309.00 88.5 2.6 1
3 male 14.5 43.5 5.17 84.0 28.0 33.400000 12.1 334.00 86.8 2.8 1
4 male 11.5 34.4 5.02 68.7 22.9 33.400000 15.7 564.75 86.3 2.4 1
... ... ... ... ... ... ... ... ... ... ... ... ...
195 female 11.4 36.5 5.23 69.8 21.8 31.300000 15.9 528.00 85.7 2.7 0
196 female 13.1 39.9 4.88 81.8 26.9 32.800000 15.6 268.00 86.7 2.7 0
197 male 15.3 46.9 5.20 90.2 29.5 32.700000 12.7 215.00 86.7 2.6 0
198 male 15.5 45.9 5.19 88.4 29.9 33.800000 12.6 177.00 88.6 3.0 0
199 female 10.4 33.3 4.93 67.6 21.1 31.200000 14.8 295.00 88.0 2.4 0

200 rows × 12 columns

X = alphanorm.drop('phenotype', axis=1)
y = alphanorm['phenotype']
y.value_counts()
1    148
0     55
Name: phenotype, dtype: int64
categorical_vars = list((X.select_dtypes(include=['category'])).columns)
categorical_vars
['sex']
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot', one_hot, categorical_vars)], 
                                remainder= 'passthrough')
#transformed_alpha = transformer.fit_transform(alphanorm)
#transformed_alpha
df_transformed = pd.DataFrame(transformed_alpha)
df_transformed.head()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-24-82979c2eac64> in <module>
----> 1 df_transformed = pd.DataFrame(transformed_alpha)
      2 df_transformed.head()

NameError: name 'transformed_alpha' is not defined
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=42)
for train_index, test_index in split.split(alphanorm, alphanorm["phenotype"]):
    strat_train = alphanorm.loc[train_index]
    strat_test = alphanorm.loc[test_index]
strat_train.head(100)
sex hb pcv rbc mcv mch mchc rdw plt hba hba2 phenotype
181 female 11.3 35.900000 4.71 76.0 24.0 31.5 14.0 268.00000 86.523291 2.588608 0
171 female 11.6 36.000000 4.69 77.0 25.0 32.2 13.3 249.00000 86.600000 2.500000 0
95 female 13.8 35.876404 5.81 74.0 23.7 31.6 11.8 312.00000 87.100000 2.600000 1
41 male 11.8 36.000000 5.27 68.3 22.4 32.5 14.8 311.00000 86.600000 2.700000 1
54 male 13.4 42.600000 5.13 83.0 26.2 31.6 12.0 235.00000 84.864586 2.679310 1
... ... ... ... ... ... ... ... ... ... ... ... ...
33 female 9.1 29.200000 4.38 61.9 19.3 31.1 16.1 410.00000 81.951429 2.200000 1
152 female 11.0 33.000000 5.29 62.4 20.8 33.3 16.0 335.00000 86.400000 3.000000 0
0 female 10.8 35.200000 5.12 68.7 21.2 30.8 13.4 309.00000 88.500000 2.600000 1
157 male 15.2 35.876404 6.09 72.6 25.0 34.4 12.2 332.27957 87.300000 2.600000 0
98 female 8.8 26.500000 3.81 69.6 23.0 33.0 14.7 180.00000 86.523291 2.588608 1

100 rows × 12 columns

train_X = strat_train.drop('phenotype', axis=1)
train_y = strat_train['phenotype']
test_x = strat_test.drop('phenotype', axis=1)
test_y = strat_test['phenotype']
one_hot1 = OneHotEncoder()
transformer = ColumnTransformer([('one_hot', one_hot1, categorical_vars)], 
                                remainder= 'passthrough')
trans_trainX = transformer.fit_transform(train_X)
trans_trainX_df = pd.DataFrame(trans_trainX)
one_hot2 = OneHotEncoder()
transformer = ColumnTransformer([('one_hot', one_hot2, categorical_vars)], 
                                remainder= 'passthrough')
trans_testX = transformer.fit_transform(test_x)
trans_testX_df = pd.DataFrame(trans_testX)
trans_testX_df.head()
0 1 2 3 4 5 6 7 8 9 10 11
0 0.0 1.0 14.7 45.0 5.50 81.0 26.7 32.6 11.7 228.0 84.864586 2.67931
1 0.0 1.0 12.1 41.5 5.50 75.5 23.8 31.6 14.1 266.0 84.864586 2.67931
2 0.0 1.0 13.5 40.0 4.60 86.9 29.4 33.8 10.8 180.0 84.864586 2.67931
3 0.0 1.0 14.3 43.6 4.92 88.7 29.0 32.7 13.5 265.0 84.864586 2.67931
4 1.0 0.0 10.2 34.4 5.25 65.5 19.5 29.7 15.2 562.0 86.000000 2.90000
trans_trainX_df.head()
0 1 2 3 4 5 6 7 8 9 10 11
0 1.0 0.0 11.3 35.900000 4.71 76.0 24.0 31.5 14.0 268.0 86.523291 2.588608
1 1.0 0.0 11.6 36.000000 4.69 77.0 25.0 32.2 13.3 249.0 86.600000 2.500000
2 1.0 0.0 13.8 35.876404 5.81 74.0 23.7 31.6 11.8 312.0 87.100000 2.600000
3 0.0 1.0 11.8 36.000000 5.27 68.3 22.4 32.5 14.8 311.0 86.600000 2.700000
4 0.0 1.0 13.4 42.600000 5.13 83.0 26.2 31.6 12.0 235.0 84.864586 2.679310
train_y.unique
<bound method Series.unique of 181    0
171    0
95     1
41     1
54     1
      ..
93     1
20     1
85     1
77     1
143    1
Name: phenotype, Length: 162, dtype: int64>
results
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-213-100f62972f2f> in <module>
----> 1 results

NameError: name 'results' is not defined
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(verbose=True, oob_score=True, n_estimators=500, max_features='log2', n_jobs=-1, min_samples_leaf=4)
model.fit(trans_trainX_df, train_y)
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    2.1s finished
RandomForestClassifier(max_features='log2', min_samples_leaf=4,
                       n_estimators=500, n_jobs=-1, oob_score=True,
                       verbose=True)
y_scores = model.predict(trans_trainX_df)
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 500 out of 500 | elapsed:    0.1s finished
from sklearn.metrics import classification_report
print(classification_report(y_scores, train_y))
              precision    recall  f1-score   support

           0       0.57      1.00      0.72        25
           1       1.00      0.86      0.93       137

    accuracy                           0.88       162
   macro avg       0.78      0.93      0.83       162
weighted avg       0.93      0.88      0.89       162

y_scores_valid = model.predict(trans_testX_df)
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 500 out of 500 | elapsed:    0.1s finished
print(classification_report(y_scores_valid,test_y))
              precision    recall  f1-score   support

           0       0.18      0.67      0.29         3
           1       0.97      0.76      0.85        38

    accuracy                           0.76        41
   macro avg       0.57      0.71      0.57        41
weighted avg       0.91      0.76      0.81        41

from sklearn.model_selection import GridSearchCV
 
param_grid = [
{'n_estimators': [100, 150, 200, 250, 300, 350, 400, 450, 500, 600], 'max_features': [0.5, 'sqrt', 'log2'], 'min_samples_leaf':[4], 
 'n_jobs':[-1]},
{'n_estimators': [100, 150, 200, 250, 300, 350, 400, 450, 500], 'max_features': [0.5, 'sqrt', 'log2'],  
 'n_jobs':[-1]}, 
   ]

rf = RandomForestClassifier()
grid_search = GridSearchCV(rf, param_grid, cv=5,
scoring='accuracy',
return_train_score=True)
grid_search.fit(trans_trainX_df, train_y)
GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid=[{'max_features': [0.5, 'sqrt', 'log2'],
                          'min_samples_leaf': [4],
                          'n_estimators': [100, 150, 200, 250, 300, 350, 400,
                                           450, 500, 600],
                          'n_jobs': [-1]},
                         {'max_features': [0.5, 'sqrt', 'log2'],
                          'n_estimators': [100, 150, 200, 250, 300, 350, 400,
                                           450, 500],
                          'n_jobs': [-1]}],
             return_train_score=True, scoring='accuracy')
grid_search.best_params_
{'max_features': 'log2',
 'min_samples_leaf': 4,
 'n_estimators': 200,
 'n_jobs': -1}
model = RandomForestClassifier(verbose=True, oob_score=True, n_estimators=200, max_features='log2', n_jobs=-1, min_samples_leaf=4)
model.fit(trans_trainX_df, train_y)
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    0.7s finished
RandomForestClassifier(max_features='log2', min_samples_leaf=4,
                       n_estimators=200, n_jobs=-1, oob_score=True,
                       verbose=True)
from sklearn.metrics import f1_score
f1_score(model.predict(trans_testX_df), test_y)
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 200 out of 200 | elapsed:    0.0s finished
0.8695652173913044
from sklearn.metrics import precision_score
precision_score(model.predict(trans_testX_df), test_y)
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 200 out of 200 | elapsed:    0.0s finished
1.0
from sklearn.metrics import recall_score
recall_score(model.predict(trans_testX_df), test_y)
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 200 out of 200 | elapsed:    0.0s finished
0.7692307692307693
y_scores = model.predict(trans_trainX_df)
print(classification_report(y_scores, train_y))
              precision    recall  f1-score   support

           0       0.61      1.00      0.76        27
           1       1.00      0.87      0.93       135

    accuracy                           0.90       162
   macro avg       0.81      0.94      0.85       162
weighted avg       0.94      0.90      0.90       162

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 200 out of 200 | elapsed:    0.0s finished
y_scores_valid = model.predict(trans_testX_df)
print(classification_report(y_scores_valid,test_y))
              precision    recall  f1-score   support

           0       0.18      1.00      0.31         2
           1       1.00      0.77      0.87        39

    accuracy                           0.78        41
   macro avg       0.59      0.88      0.59        41
weighted avg       0.96      0.78      0.84        41

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 200 out of 200 | elapsed:    0.0s finished
from sklearn.metrics import precision_recall_curve
y_scores = model.predict_proba(trans_testX_df)
#precision_recall_curve(test_y, y_scores)
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 200 out of 200 | elapsed:    0.0s finished
model1.classes_
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-140-b81240a189d5> in <module>
----> 1 model1.classes_

NameError: name 'model1' is not defined
y_scores
array([[0.39250572, 0.60749428],
       [0.25320079, 0.74679921],
       [0.1713831 , 0.8286169 ],
       [0.17947367, 0.82052633],
       [0.489447  , 0.510553  ],
       [0.56838878, 0.43161122],
       [0.20219407, 0.79780593],
       [0.35415521, 0.64584479],
       [0.31536785, 0.68463215],
       [0.2188261 , 0.7811739 ],
       [0.3774125 , 0.6225875 ],
       [0.37452597, 0.62547403],
       [0.37466037, 0.62533963],
       [0.46889146, 0.53110854],
       [0.27090818, 0.72909182],
       [0.36832506, 0.63167494],
       [0.26114827, 0.73885173],
       [0.48415451, 0.51584549],
       [0.17540571, 0.82459429],
       [0.49270883, 0.50729117],
       [0.23196986, 0.76803014],
       [0.08120887, 0.91879113],
       [0.30504464, 0.69495536],
       [0.22847752, 0.77152248],
       [0.29418737, 0.70581263],
       [0.41770902, 0.58229098],
       [0.2760021 , 0.7239979 ],
       [0.40566932, 0.59433068],
       [0.45339157, 0.54660843],
       [0.3564696 , 0.6435304 ],
       [0.4670073 , 0.5329927 ],
       [0.34411017, 0.65588983],
       [0.16982323, 0.83017677],
       [0.19704983, 0.80295017],
       [0.35555874, 0.64444126],
       [0.37553992, 0.62446008],
       [0.38274799, 0.61725201],
       [0.38877291, 0.61122709],
       [0.50700195, 0.49299805],
       [0.27680689, 0.72319311],
       [0.45167561, 0.54832439]])
y_scores = y_scores[:, 1]
precisions, recalls, thresholds = precision_recall_curve(test_y, y_scores)
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
    plt.legend(loc="upper left")
 # highlight the threshold and add the legend, axis label, and grid
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(test_y, y_scores)
def plot_roc_curve(fpr, tpr, label=None, figure=(12,10)):
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--') # Dashed diagonal # Add axis labels and grid
    #plt.figure(figsize=figure)
    plt.xlabel('True Positive Rate(Recall)')
    plt.ylabel("False Positive Rate(1-specificity)")
    plt.legend()
plot_roc_curve(fpr, tpr)
No handles with labels found to put in legend.
from sklearn.metrics import roc_auc_score
roc_auc_score(test_y, y_scores)
0.7303030303030303
import joblib
joblib.dump(model1, "thal_rf200.pkl")
# and later...
#my_model_loaded = joblib.load("my_model.pkl")
['thal_rf200.pkl']
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
confused = confusion_matrix(model.predict(trans_testX_df), test_y)
confused
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 200 out of 200 | elapsed:    0.0s finished
array([[ 2,  0],
       [ 9, 30]], dtype=int64)
fig, ax = plt.subplots(figsize=(10, 10))
plot_confusion_matrix (model, trans_testX_df, test_y, cmap=plt.cm.Blues, display_labels=['normals', 'alpha carriers'], ax=ax)
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 200 out of 200 | elapsed:    0.0s finished
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x20d3a29a308>
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 2)
X_train_res, y_train_res = sm.fit_sample(trans_trainX_df, train_y)
  
y_train_res.value_counts()
0    118
1    118
Name: phenotype, dtype: int64
model = RandomForestClassifier(verbose=True, oob_score=True, n_estimators=400, max_features=0.5, n_jobs=-1, min_samples_leaf=4)
model.fit(X_train_res, y_train_res)
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    1.3s finished
RandomForestClassifier(max_features=0.5, min_samples_leaf=4, n_estimators=400,
                       n_jobs=-1, oob_score=True, verbose=True)
https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/
print(classification_report(model.predict(X_train_res), y_train_res))
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.0s
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       117
           1       0.97      0.97      0.97       119

    accuracy                           0.97       236
   macro avg       0.97      0.97      0.97       236
weighted avg       0.97      0.97      0.97       236

[Parallel(n_jobs=12)]: Done 400 out of 400 | elapsed:    0.2s finished
print(classification_report(model.predict(trans_testX_df), test_y))
              precision    recall  f1-score   support

           0       0.73      0.47      0.57        17
           1       0.70      0.88      0.78        24

    accuracy                           0.71        41
   macro avg       0.71      0.67      0.67        41
weighted avg       0.71      0.71      0.69        41

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 400 out of 400 | elapsed:    0.0s finished
fig, ax = plt.subplots(figsize=(10, 10))
plot_confusion_matrix (model, trans_testX_df, test_y, cmap=plt.cm.Blues, display_labels=['normals', 'alpha carriers'], ax=ax)
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 400 out of 400 | elapsed:    0.0s finished
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x20d3bbe2388>
from xgboost import XGBClassifier as xgb
xgbmodel = xgb()
xgbmodel.fit(X_train_res, y_train_res)
XGBClassifier()
print(classification_report(xgbmodel.predict(X_train_res), y_train_res))
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       117
           1       1.00      0.99      1.00       119

    accuracy                           1.00       236
   macro avg       1.00      1.00      1.00       236
weighted avg       1.00      1.00      1.00       236

print(classification_report(xgbmodel.predict(trans_testX_df), test_y))
              precision    recall  f1-score   support

           0       0.64      0.54      0.58        13
           1       0.80      0.86      0.83        28

    accuracy                           0.76        41
   macro avg       0.72      0.70      0.71        41
weighted avg       0.75      0.76      0.75        41

param_grid = [
{'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2, 0.25], 'max_features': [0.5, 'sqrt', 'log2'], 'min_samples_leaf':[4], 
 'n_jobs':[-1]},
{'n_estimators': [100, 150, 200, 250, 300, 350, 400, 450, 500], 'max_features': [0.5, 'sqrt', 'log2'],  
 'n_jobs':[-1]}, 
   ]

rf = RandomForestClassifier()
grid_search = GridSearchCV(rf, param_grid, cv=5,
scoring='accuracy',
return_train_score=True)
grid_search.fit(trans_trainX_df, train_y)
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

# define model
model2 = RandomForestClassifier()
# evaluate pipeline
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model2, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
print('Mean ROC AUC: %.3f' % mean(scores))