In [18]:
# Importing the relevant modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (mean_squared_error, r2_score, accuracy_score,
                             confusion_matrix, RocCurveDisplay, roc_auc_score)
In [32]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('data.csv')
df
Out[2]:
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean ... texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst Unnamed: 32
0 842302 M 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.30010 0.14710 ... 17.33 184.60 2019.0 0.16220 0.66560 0.7119 0.2654 0.4601 0.11890 NaN
1 842517 M 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.08690 0.07017 ... 23.41 158.80 1956.0 0.12380 0.18660 0.2416 0.1860 0.2750 0.08902 NaN
2 84300903 M 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.19740 0.12790 ... 25.53 152.50 1709.0 0.14440 0.42450 0.4504 0.2430 0.3613 0.08758 NaN
3 84348301 M 11.42 20.38 77.58 386.1 0.14250 0.28390 0.24140 0.10520 ... 26.50 98.87 567.7 0.20980 0.86630 0.6869 0.2575 0.6638 0.17300 NaN
4 84358402 M 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.19800 0.10430 ... 16.67 152.20 1575.0 0.13740 0.20500 0.4000 0.1625 0.2364 0.07678 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
564 926424 M 21.56 22.39 142.00 1479.0 0.11100 0.11590 0.24390 0.13890 ... 26.40 166.10 2027.0 0.14100 0.21130 0.4107 0.2216 0.2060 0.07115 NaN
565 926682 M 20.13 28.25 131.20 1261.0 0.09780 0.10340 0.14400 0.09791 ... 38.25 155.00 1731.0 0.11660 0.19220 0.3215 0.1628 0.2572 0.06637 NaN
566 926954 M 16.60 28.08 108.30 858.1 0.08455 0.10230 0.09251 0.05302 ... 34.12 126.70 1124.0 0.11390 0.30940 0.3403 0.1418 0.2218 0.07820 NaN
567 927241 M 20.60 29.33 140.10 1265.0 0.11780 0.27700 0.35140 0.15200 ... 39.42 184.60 1821.0 0.16500 0.86810 0.9387 0.2650 0.4087 0.12400 NaN
568 92751 B 7.76 24.54 47.92 181.0 0.05263 0.04362 0.00000 0.00000 ... 30.37 59.16 268.6 0.08996 0.06444 0.0000 0.0000 0.2871 0.07039 NaN

569 rows × 33 columns

In [22]:
# Columns excluded from the feature matrix: the target itself, the empty 'Unnamed: 32' column and the id
non_feature_columns = ['diagnosis', 'Unnamed: 32', 'id']
X = df.drop(non_feature_columns, axis=1)
y = df.diagnosis

Data exploration

Plan for this section:

.info()

.drop() the unused columns

.describe()

Divide the data into malignant and benign subsets and analyze them separately; also plot the characteristics of both.

Create a dummy variable for the M and B labels (a minimal sketch of one possible encoding follows below).
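The M/B dummy is not used explicitly later, because the scikit-learn classifiers below accept the string labels directly. A minimal sketch of one possible encoding (the mapping M → 1, B → 0 is an assumption, not something fixed by the data):

In [ ]:
# Sketch: encode the diagnosis label as a 0/1 dummy (mapping M -> 1, B -> 0 is an assumption)
y_dummy = df['diagnosis'].map({'M': 1, 'B': 0})
y_dummy.value_counts()  # should mirror the B/M counts shown below (357 benign, 212 malignant)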

In [3]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             569 non-null    float64
 15  area_se                  569 non-null    float64
 16  smoothness_se            569 non-null    float64
 17  compactness_se           569 non-null    float64
 18  concavity_se             569 non-null    float64
 19  concave points_se        569 non-null    float64
 20  symmetry_se              569 non-null    float64
 21  fractal_dimension_se     569 non-null    float64
 22  radius_worst             569 non-null    float64
 23  texture_worst            569 non-null    float64
 24  perimeter_worst          569 non-null    float64
 25  area_worst               569 non-null    float64
 26  smoothness_worst         569 non-null    float64
 27  compactness_worst        569 non-null    float64
 28  concavity_worst          569 non-null    float64
 29  concave points_worst     569 non-null    float64
 30  symmetry_worst           569 non-null    float64
 31  fractal_dimension_worst  569 non-null    float64
 32  Unnamed: 32              0 non-null      float64
dtypes: float64(31), int64(1), object(1)
memory usage: 146.8+ KB
In [4]:
df.describe()
Out[4]:
id radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean ... texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst Unnamed: 32
count 5.690000e+02 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 ... 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 0.0
mean 3.037183e+07 14.127292 19.289649 91.969033 654.889104 0.096360 0.104341 0.088799 0.048919 0.181162 ... 25.677223 107.261213 880.583128 0.132369 0.254265 0.272188 0.114606 0.290076 0.083946 NaN
std 1.250206e+08 3.524049 4.301036 24.298981 351.914129 0.014064 0.052813 0.079720 0.038803 0.027414 ... 6.146258 33.602542 569.356993 0.022832 0.157336 0.208624 0.065732 0.061867 0.018061 NaN
min 8.670000e+03 6.981000 9.710000 43.790000 143.500000 0.052630 0.019380 0.000000 0.000000 0.106000 ... 12.020000 50.410000 185.200000 0.071170 0.027290 0.000000 0.000000 0.156500 0.055040 NaN
25% 8.692180e+05 11.700000 16.170000 75.170000 420.300000 0.086370 0.064920 0.029560 0.020310 0.161900 ... 21.080000 84.110000 515.300000 0.116600 0.147200 0.114500 0.064930 0.250400 0.071460 NaN
50% 9.060240e+05 13.370000 18.840000 86.240000 551.100000 0.095870 0.092630 0.061540 0.033500 0.179200 ... 25.410000 97.660000 686.500000 0.131300 0.211900 0.226700 0.099930 0.282200 0.080040 NaN
75% 8.813129e+06 15.780000 21.800000 104.100000 782.700000 0.105300 0.130400 0.130700 0.074000 0.195700 ... 29.720000 125.400000 1084.000000 0.146000 0.339100 0.382900 0.161400 0.317900 0.092080 NaN
max 9.113205e+08 28.110000 39.280000 188.500000 2501.000000 0.163400 0.345400 0.426800 0.201200 0.304000 ... 49.540000 251.200000 4254.000000 0.222600 1.058000 1.252000 0.291000 0.663800 0.207500 NaN

8 rows × 32 columns

In [5]:
df.diagnosis.value_counts()
Out[5]:
B    357
M    212
Name: diagnosis, dtype: int64
In [20]:
# Count plot of the target classes (newer seaborn versions require the keyword argument x=)
ax = sns.countplot(x=y, label="count")
B, M = y.value_counts()  # B = 357, M = 212
In [6]:
M = df[df["diagnosis"] == "M"]  # malignant tumors
B = df[df["diagnosis"] == "B"]  # benign tumors
In [7]:
M.describe()
Out[7]:
id radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean ... texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst Unnamed: 32
count 2.120000e+02 212.000000 212.000000 212.000000 212.000000 212.000000 212.000000 212.000000 212.000000 212.000000 ... 212.000000 212.000000 212.000000 212.000000 212.000000 212.000000 212.000000 212.000000 212.000000 0.0
mean 3.681805e+07 17.462830 21.604906 115.365377 978.376415 0.102898 0.145188 0.160775 0.087990 0.192909 ... 29.318208 141.370330 1422.286321 0.144845 0.374824 0.450606 0.182237 0.323468 0.091530 NaN
std 1.378965e+08 3.203971 3.779470 21.854653 367.937978 0.012608 0.053987 0.075019 0.034374 0.027638 ... 5.434804 29.457055 597.967743 0.021870 0.170372 0.181507 0.046308 0.074685 0.021553 NaN
min 8.670000e+03 10.950000 10.380000 71.900000 361.600000 0.073710 0.046050 0.023980 0.020310 0.130800 ... 16.670000 85.100000 508.100000 0.088220 0.051310 0.023980 0.028990 0.156500 0.055040 NaN
25% 8.613450e+05 15.075000 19.327500 98.745000 705.300000 0.094010 0.109600 0.109525 0.064620 0.174050 ... 25.782500 119.325000 970.300000 0.130475 0.244475 0.326425 0.152750 0.276500 0.076302 NaN
50% 8.953665e+05 17.325000 21.460000 114.200000 932.000000 0.102200 0.132350 0.151350 0.086280 0.189900 ... 28.945000 138.000000 1303.000000 0.143450 0.356350 0.404900 0.182000 0.310300 0.087600 NaN
75% 8.911290e+06 19.590000 23.765000 129.925000 1203.750000 0.110925 0.172400 0.203050 0.103175 0.209850 ... 32.690000 159.800000 1712.750000 0.155975 0.447850 0.556175 0.210675 0.359225 0.102625 NaN
max 9.112962e+08 28.110000 39.280000 188.500000 2501.000000 0.144700 0.345400 0.426800 0.201200 0.304000 ... 49.540000 251.200000 4254.000000 0.222600 1.058000 1.170000 0.291000 0.663800 0.207500 NaN

8 rows × 32 columns

In [8]:
B.describe()
Out[8]:
id radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean ... texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst Unnamed: 32
count 3.570000e+02 357.000000 357.000000 357.000000 357.000000 357.000000 357.000000 357.000000 357.000000 357.000000 ... 357.000000 357.000000 357.000000 357.000000 357.000000 357.000000 357.000000 357.000000 357.000000 0.0
mean 2.654382e+07 12.146524 17.914762 78.075406 462.790196 0.092478 0.080085 0.046058 0.025717 0.174186 ... 23.515070 87.005938 558.899440 0.124959 0.182673 0.166238 0.074444 0.270246 0.079442 NaN
std 1.167397e+08 1.780512 3.995125 11.807438 134.287118 0.013446 0.033750 0.043442 0.015909 0.024807 ... 5.493955 13.527091 163.601424 0.020013 0.092180 0.140368 0.035797 0.041745 0.013804 NaN
min 8.913000e+03 6.981000 9.710000 43.790000 143.500000 0.052630 0.019380 0.000000 0.000000 0.106000 ... 12.020000 50.410000 185.200000 0.071170 0.027290 0.000000 0.000000 0.156600 0.055210 NaN
25% 8.746620e+05 11.080000 15.150000 70.870000 378.200000 0.083060 0.055620 0.020310 0.015020 0.158000 ... 19.580000 78.270000 447.100000 0.110400 0.112000 0.077080 0.051040 0.240600 0.070090 NaN
50% 9.089160e+05 12.200000 17.390000 78.180000 458.400000 0.090760 0.075290 0.037090 0.023440 0.171400 ... 22.820000 86.920000 547.400000 0.125400 0.169800 0.141200 0.074310 0.268700 0.077120 NaN
75% 8.812816e+06 13.370000 19.760000 86.100000 551.100000 0.100700 0.097550 0.059990 0.032510 0.189000 ... 26.510000 96.590000 670.000000 0.137600 0.230200 0.221600 0.097490 0.298300 0.085410 NaN
max 9.113205e+08 17.850000 33.810000 114.600000 992.100000 0.163400 0.223900 0.410800 0.085340 0.274300 ... 41.780000 127.100000 1210.000000 0.200600 0.584900 1.252000 0.175000 0.422800 0.148600 NaN

8 rows × 32 columns
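The two describe() tables are hard to compare side by side. A minimal sketch that lines up the per-feature means of the malignant and benign subsets in one table (it reuses the M and B subsets and the feature columns of X defined above):

In [ ]:
# Sketch: per-feature means of malignant vs. benign tumors in a single table
mean_comparison = pd.concat(
    [M[X.columns].mean(), B[X.columns].mean()],
    axis=1, keys=['malignant_mean', 'benign_mean']
)
mean_comparison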

Feature exploration

Comparing two features only:

In [27]:
# concavity_worst vs concave points_worst
sns.jointplot(x=X.loc[:, 'concavity_worst'], y=X.loc[:, 'concave points_worst'], kind="reg", color="#ce1414")
Out[27]:
<seaborn.axisgrid.JointGrid at 0x7fdfd138cd60>

Comparing more features:

In [29]:
sns.set(style="white")
pair_df = X.loc[:,['radius_worst','perimeter_worst','area_worst']] # features to represent (kept in a new df so the original df is not overwritten)
g = sns.PairGrid(pair_df, diag_sharey=False) # creating a PairGrid object
g.map_lower(sns.kdeplot, cmap="Blues_d")
g.map_upper(plt.scatter)
g.map_diag(sns.kdeplot, lw=3)
Out[29]:
<seaborn.axisgrid.PairGrid at 0x7fdfd16aad00>
In [37]:
sns.set(style="whitegrid", palette="muted")
data_dia = y # setting diagnosis (target) for representation
data = X # setting features for representation
data_n_2 = (data - data.mean()) / (data.std())              # standardization
data = pd.concat([y,data_n_2.iloc[:,0:10]],axis=1) # concatenating target and standardized features in a new df called data
# melt method
data = pd.melt(data,
               id_vars="diagnosis", # column 1
                var_name="features", # column 2
                value_name='value') # column 3

plt.figure(figsize=(10,10))
sns.swarmplot(x="features", y="value", hue="diagnosis", data=data)

plt.xticks(rotation=90)
plt.show()
In [42]:
data = pd.concat([y,data_n_2.iloc[:,10:20]],axis=1) # concatenating target and standardized features in a new df called data
# melt method
data = pd.melt(data,
               id_vars="diagnosis", # column 1
                var_name="features", # column 2
                value_name='value') # column 3

plt.figure(figsize=(10,10))
sns.swarmplot(x="features", y="value", hue="diagnosis", data=data)

plt.xticks(rotation=90)
plt.show()
In [43]:
data = pd.concat([y,data_n_2.iloc[:,20:31]],axis=1) # concatenating target and standardized features in a new df called data
# melt method
data = pd.melt(data,
               id_vars="diagnosis", # column 1
                var_name="features", # column 2
                value_name='value') # column 3

plt.figure(figsize=(10,10))
sns.swarmplot(x="features", y="value", hue="diagnosis", data=data)

plt.xticks(rotation=90)
plt.show()
In [38]:
data.head() # showing the previously created df for clarification
Out[38]:
diagnosis features value
0 M radius_mean 1.096100
1 M radius_mean 1.828212
2 M radius_mean 1.578499
3 M radius_mean -0.768233
4 M radius_mean 1.748758

Observing correlation between features by using a heatmap:

In [41]:
fig, ax = plt.subplots(figsize=(18,18))
sns.heatmap(X.corr(), annot=True, fmt='.1f', ax=ax)
Out[41]:
<AxesSubplot:>

Feature selection with CORRELATION and random forest classification

For correlation-based selection, we visually inspect the heatmap above for features that are strongly correlated with each other. For example, radius_mean, perimeter_mean and area_mean are highly correlated, so we keep only area_mean (this particular choice is guided by the swarm plots, but there is no single correct answer). We apply the same reasoning to the rest of the correlated groups and drop the features we do not need:
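The same check can also be done programmatically rather than by eye. A small sketch listing the feature pairs whose absolute correlation exceeds a threshold (the 0.9 cut-off is an arbitrary assumption):

In [ ]:
# Sketch: feature pairs with |correlation| above 0.9 (threshold is an assumption)
corr = X.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))  # keep each pair only once
high_pairs = upper.stack().sort_values(ascending=False)
high_pairs[high_pairs > 0.9]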

In [45]:
drop_list = ['perimeter_mean','radius_mean','compactness_mean','concave points_mean','radius_se','perimeter_se','radius_worst','perimeter_worst','compactness_worst','concave points_worst','compactness_se','concave points_se','texture_worst','area_worst']
x_1 = X.drop(drop_list, axis = 1) # new df called x_1, without the useless features from the X df
x_1.head()
Out[45]:
texture_mean area_mean smoothness_mean concavity_mean symmetry_mean fractal_dimension_mean texture_se area_se smoothness_se concavity_se symmetry_se fractal_dimension_se smoothness_worst concavity_worst symmetry_worst fractal_dimension_worst
0 10.38 1001.0 0.11840 0.3001 0.2419 0.07871 0.9053 153.40 0.006399 0.05373 0.03003 0.006193 0.1622 0.7119 0.4601 0.11890
1 17.77 1326.0 0.08474 0.0869 0.1812 0.05667 0.7339 74.08 0.005225 0.01860 0.01389 0.003532 0.1238 0.2416 0.2750 0.08902
2 21.25 1203.0 0.10960 0.1974 0.2069 0.05999 0.7869 94.03 0.006150 0.03832 0.02250 0.004571 0.1444 0.4504 0.3613 0.08758
3 20.38 386.1 0.14250 0.2414 0.2597 0.09744 1.1560 27.23 0.009110 0.05661 0.05963 0.009208 0.2098 0.6869 0.6638 0.17300
4 14.34 1297.0 0.10030 0.1980 0.1809 0.05883 0.7813 94.44 0.011490 0.05688 0.01756 0.005115 0.1374 0.4000 0.2364 0.07678
In [46]:
#correlation map
fig,ax = plt.subplots(figsize=(14, 14))
sns.heatmap(x_1.corr(), annot=True, linewidths=.5, fmt= '.1f',ax=ax)
Out[46]:
<AxesSubplot:>

Now we use random forest classification to test accuracy of the chosen features:

In [51]:
# split data train 70 % and test 30 %
x_train, x_test, y_train, y_test = train_test_split(x_1, y, test_size=0.3, random_state=42)
In [52]:
from sklearn.ensemble import RandomForestClassifier
# random forest classifier with default hyperparameters
clf_rf = RandomForestClassifier(random_state=43)      
clr_rf = clf_rf.fit(x_train,y_train)

ac = accuracy_score(y_test,clf_rf.predict(x_test))
print('Accuracy is: ',ac)
cm = confusion_matrix(y_test,clf_rf.predict(x_test))
sns.heatmap(cm,annot=True,fmt="d")
Accuracy is:  0.9649122807017544
Out[52]:
<AxesSubplot:>

Univariate feature selection and random forest classification

In this method, we use SelectKBest, which removes all but the k highest-scoring features. We have to choose the value of k ourselves; here we use k = 5 (an intuitive choice rather than a tuned one):

In [53]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# find best scored 5 features
select_feature = SelectKBest(chi2, k=5).fit(x_train, y_train)
In [54]:
print('Score list:', select_feature.scores_)
print('Feature list:', x_train.columns)
Score list: [6.06916433e+01 3.66899557e+04 1.00015175e-01 1.30547650e+01
 1.95982847e-01 3.42575072e-04 4.07131026e-02 6.12741067e+03
 1.32470372e-03 6.92896719e-01 1.39557806e-03 2.65927071e-03
 2.63226314e-01 2.58858117e+01 1.00635138e+00 1.23087347e-01]
Feature list: Index(['texture_mean', 'area_mean', 'smoothness_mean', 'concavity_mean',
       'symmetry_mean', 'fractal_dimension_mean', 'texture_se', 'area_se',
       'smoothness_se', 'concavity_se', 'symmetry_se', 'fractal_dimension_se',
       'smoothness_worst', 'concavity_worst', 'symmetry_worst',
       'fractal_dimension_worst'],
      dtype='object')
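The raw score array above is hard to match to feature names by eye. A short sketch pairing each chi2 score with its feature and listing the five that SelectKBest kept (it reuses the fitted select_feature object):

In [ ]:
# Sketch: chi2 scores paired with feature names, plus the k=5 selected features
scores = pd.Series(select_feature.scores_, index=x_train.columns).sort_values(ascending=False)
print(scores)
print('Selected features:', x_train.columns[select_feature.get_support()].tolist())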
In [55]:
x_train_2 = select_feature.transform(x_train)
x_test_2 = select_feature.transform(x_test)
# random forest classifier with default hyperparameters
clf_rf_2 = RandomForestClassifier()      
clr_rf_2 = clf_rf_2.fit(x_train_2,y_train)
ac_2 = accuracy_score(y_test,clf_rf_2.predict(x_test_2))
print('Accuracy is: ',ac_2)
cm_2 = confusion_matrix(y_test,clf_rf_2.predict(x_test_2))
sns.heatmap(cm_2,annot=True,fmt="d")
Accuracy is:  0.9532163742690059
Out[55]:
<AxesSubplot:>

Recursive feature elimination (RFE) with random forest classification

RFE uses an estimator, in this case a random forest classifier, to assign a weight (importance) to each feature. The features with the smallest weights are pruned from the current set, and the procedure is repeated recursively on the remaining features.

As before, we keep 5 features, this time chosen by the RFE method.

In [56]:
from sklearn.feature_selection import RFE
# Create the RFE object and rank each pixel
clf_rf_3 = RandomForestClassifier()      
rfe = RFE(estimator=clf_rf_3, n_features_to_select=5, step=1)
rfe = rfe.fit(x_train, y_train)
In [57]:
print('Best 5 features chosen by RFE:', x_train.columns[rfe.support_])
Best 5 features chosen by RFE: Index(['area_mean', 'concavity_mean', 'area_se', 'concavity_worst',
       'symmetry_worst'],
      dtype='object')
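Besides the boolean support mask, the fitted RFE object also exposes a ranking_ attribute (1 means the feature was kept; higher numbers were eliminated earlier). A quick sketch to inspect the full ordering:

In [ ]:
# Sketch: full RFE ranking of the features (1 = selected)
pd.Series(rfe.ranking_, index=x_train.columns).sort_values()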

Now we know which 5 features to use, but why 5? Maybe it would be better to use more or fewer features. We can figure that out with the RFECV method shown below:

Recursive feature elimination with cross validation and random forest classification

In [58]:
from sklearn.feature_selection import RFECV

# The "accuracy" scoring is proportional to the number of correct classifications
clf_rf_4 = RandomForestClassifier() 
rfecv = RFECV(estimator=clf_rf_4, step=1, cv=5,scoring='accuracy')   #5-fold cross-validation
rfecv = rfecv.fit(x_train, y_train)

print('Optimal number of features :', rfecv.n_features_)
print('Best features :', x_train.columns[rfecv.support_])
Optimal number of features : 11
Best features : Index(['texture_mean', 'area_mean', 'smoothness_mean', 'concavity_mean',
       'area_se', 'smoothness_se', 'concavity_se', 'fractal_dimension_se',
       'smoothness_worst', 'concavity_worst', 'symmetry_worst'],
      dtype='object')
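To see how the cross-validated accuracy evolves with the number of selected features, the scores stored by RFECV can be plotted. A sketch assuming a recent scikit-learn where the scores live in cv_results_ (older releases exposed grid_scores_ instead):

In [ ]:
# Sketch: mean CV accuracy vs. number of selected features (cv_results_ assumes scikit-learn >= 1.0)
mean_scores = rfecv.cv_results_['mean_test_score']
plt.figure(figsize=(8, 5))
plt.plot(range(1, len(mean_scores) + 1), mean_scores, marker='o')
plt.xlabel('Number of features selected')
plt.ylabel('Mean cross-validation accuracy')
plt.show()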

PCA for feature selection

In [63]:
# split data train 70 % and test 30 %
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
#normalization
x_train_N = (x_train-x_train.mean())/(x_train.max()-x_train.min())
x_test_N = (x_test-x_test.mean())/(x_test.max()-x_test.min())

from sklearn.decomposition import PCA
pca = PCA()
pca.fit(x_train_N)

plt.figure(1, figsize=(7, 7))
plt.clf()
plt.axes([.2, .2, .7, .7])
plt.plot(pca.explained_variance_ratio_, linewidth=2)
plt.axis('tight')
plt.xlabel('n_components')
plt.ylabel('explained_variance_ratio_')
Out[63]:
Text(0, 0.5, 'explained_variance_ratio_')
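The scree plot shows the variance explained by each component individually; the cumulative curve makes it easier to pick a cut-off. A sketch, with the 95% threshold as an arbitrary assumption:

In [ ]:
# Sketch: cumulative explained variance and the number of components needed for ~95% (threshold is an assumption)
cum_var = np.cumsum(pca.explained_variance_ratio_)
print('Components needed for 95% of the variance:', np.argmax(cum_var >= 0.95) + 1)
plt.figure(figsize=(7, 7))
plt.plot(range(1, len(cum_var) + 1), cum_var, linewidth=2)
plt.axhline(0.95, color='red', linestyle='--')
plt.xlabel('n_components')
plt.ylabel('cumulative explained_variance_ratio_')
plt.show()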