# Import the relevant modules
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (accuracy_score, r2_score, mean_squared_error,
                             confusion_matrix, RocCurveDisplay, roc_auc_score)
warnings.filterwarnings("ignore")
Based on https://www.kaggle.com/code/kanncaa1/feature-selection-and-data-visualization
df = pd.read_csv('data.csv')
df
# Drop the target column ('diagnosis'), the empty 'Unnamed: 32' column and the patient 'id' from the feature matrix
dropped_cols = ['diagnosis', 'Unnamed: 32', 'id']
X = df.drop(dropped_cols, axis=1)
y = df.diagnosis
Useful DataFrame methods for a first look: .info(), .describe() and .drop().
Plan: split the data into malignant (M) and benign (B) cases and analyse them separately, plot the characteristics of both groups (a quick histogram sketch follows the group summaries below), and create a dummy variable for the M/B diagnosis (a minimal encoding sketch follows the value counts below).
df.info()
df.describe()
df.diagnosis.value_counts()
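The note above mentions creating a dummy variable for M and B; a minimal sketch (the variable name y_dummy is mine, and the rest of the notebook keeps the original string labels):
# Map the diagnosis strings to 0/1; shown for reference only
y_dummy = y.map({'M': 1, 'B': 0})   # 1 = malignant, 0 = benign
y_dummy.value_counts()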
ax = sns.countplot(x=y, label="count")   # bar chart of the class counts
count_B, count_M = y.value_counts()      # value_counts() is sorted, so B (the majority class) comes first
print('Benign:', count_B, ' Malignant:', count_M)
M = df[df["diagnosis"] == "M"]           # malignant cases
B = df[df["diagnosis"] == "B"]           # benign cases
M.describe()
B.describe()
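To "plot both characteristics", one quick option is to overlay the distribution of a single feature for the two groups (radius_mean is just an example; any column works):
plt.figure(figsize=(8, 5))
plt.hist(M['radius_mean'], bins=30, alpha=0.5, label='malignant', color='red')
plt.hist(B['radius_mean'], bins=30, alpha=0.5, label='benign', color='green')
plt.xlabel('radius_mean')
plt.ylabel('count')
plt.legend()
plt.show()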
Comparing two features only:
# concavity_worst vs concave points_worst
sns.jointplot(x=X.loc[:, 'concavity_worst'], y=X.loc[:, 'concave points_worst'], kind="reg", color="#ce1414")
Comparing more features:
sns.set(style="white")
df_worst = X.loc[:, ['radius_worst', 'perimeter_worst', 'area_worst']]  # subset of features to compare (avoids overwriting the original df)
g = sns.PairGrid(df_worst, diag_sharey=False)  # creating a PairGrid object
g.map_lower(sns.kdeplot, cmap="Blues_d")
g.map_upper(plt.scatter)
g.map_diag(sns.kdeplot, lw=3)
Swarm plot (copied from https://www.kaggle.com/code/kanncaa1/feature-selection-and-data-visualization):
sns.set(style="whitegrid", palette="muted")
data_dia = y # setting diagnosis (target) for representation
data = X # setting features for representation
data_n_2 = (data - data.mean()) / (data.std()) # standardization
data = pd.concat([y,data_n_2.iloc[:,0:10]],axis=1) # concatenating target and standardized features in a new df called data
# melt method
data = pd.melt(data,
id_vars="diagnosis", # column 1
var_name="features", # column 2
value_name='value') # column 3
plt.figure(figsize=(10,10))
sns.swarmplot(x="features", y="value", hue="diagnosis", data=data)
plt.xticks(rotation=90)
plt.show()
data = pd.concat([y,data_n_2.iloc[:,10:20]],axis=1) # concatenating target and standardized features in a new df called data
# melt method
data = pd.melt(data,
id_vars="diagnosis", # column 1
var_name="features", # column 2
value_name='value') # column 3
plt.figure(figsize=(10,10))
sns.swarmplot(x="features", y="value", hue="diagnosis", data=data)
plt.xticks(rotation=90)
plt.show()
data = pd.concat([y,data_n_2.iloc[:,20:31]],axis=1) # concatenating target and standardized features in a new df called data
# melt method
data = pd.melt(data,
id_vars="diagnosis", # column 1
var_name="features", # column 2
value_name='value') # column 3
plt.figure(figsize=(10,10))
sns.swarmplot(x="features", y="value", hue="diagnosis", data=data)
plt.xticks(rotation=90)
plt.show()
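The referenced Kaggle kernel also draws these comparisons as violin plots, which render faster than swarm plots on this many points; a sketch for the first ten features (separate variable names so the swarm-plot data frame above is untouched):
data_v = pd.concat([y, data_n_2.iloc[:, 0:10]], axis=1)
data_v = pd.melt(data_v, id_vars="diagnosis", var_name="features", value_name='value')
plt.figure(figsize=(10, 10))
sns.violinplot(x="features", y="value", hue="diagnosis", data=data_v, split=True, inner="quart")
plt.xticks(rotation=90)
plt.show()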
data.head() # showing the previously created df for clarification
Observing correlation between features by using a heatmap:
fig, ax = plt.subplots(figsize=(18,18))
sns.heatmap(X.corr(), annot=True, fmt='.1f', ax=ax)
For correlation, we visually check which features appear to be correlated with each other in the heatmap. For example, radius_mean, perimeter_mean and area_mean are strongly correlated, so we keep only area_mean (chosen based on the swarm plots; there is no single correct choice). We do the same with the rest of the correlated features and drop the ones we don't need:
drop_list = ['perimeter_mean','radius_mean','compactness_mean','concave points_mean','radius_se','perimeter_se','radius_worst','perimeter_worst','compactness_worst','concave points_worst','compactness_se','concave points_se','texture_worst','area_worst']
x_1 = X.drop(drop_list, axis = 1) # new df called x_1, without the useless features from the X df
x_1.head()
#correlation map
fig,ax = plt.subplots(figsize=(14, 14))
sns.heatmap(x_1.corr(), annot=True, linewidths=.5, fmt= '.1f',ax=ax)
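The drop list above was built by eye from the heatmap; as a sanity check, the strongly correlated pairs in the original feature set can also be listed programmatically (the 0.9 threshold is my arbitrary choice):
corr = X.corr().abs()
# keep only the upper triangle so each pair appears once
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
high_corr = upper.stack().sort_values(ascending=False)
print(high_corr[high_corr > 0.9])   # pairs with |correlation| above the threshold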
Now we use random forest classification to test the accuracy obtained with the chosen features:
# split data train 70 % and test 30 %
x_train, x_test, y_train, y_test = train_test_split(x_1, y, test_size=0.3, random_state=42)
from sklearn.ensemble import RandomForestClassifier
# random forest classifier with default parameters (n_estimators defaults to 100 in current scikit-learn)
clf_rf = RandomForestClassifier(random_state=43)
clf_rf = clf_rf.fit(x_train, y_train)
ac = accuracy_score(y_test, clf_rf.predict(x_test))
print('Accuracy is: ', ac)
cm = confusion_matrix(y_test, clf_rf.predict(x_test))
sns.heatmap(cm, annot=True, fmt="d")
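Accuracy alone hides the class balance; a per-class breakdown (precision and recall) can be printed as well, purely as an optional check:
from sklearn.metrics import classification_report
print(classification_report(y_test, clf_rf.predict(x_test)))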
In this method, we use SelectKBest, which removes all but the k highest-scoring features. We have to choose the value of k; here we take k=5 (an intuitive choice):
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# find best scored 5 features
select_feature = SelectKBest(chi2, k=5).fit(x_train, y_train)
print('Score list:', select_feature.scores_)
print('Feature list:', x_train.columns)
x_train_2 = select_feature.transform(x_train)
x_test_2 = select_feature.transform(x_test)
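The two print statements above list the chi-squared scores and the column names separately; pairing them makes it easier to see which 5 features were kept (a small optional reformatting of the same output):
chi2_scores = pd.Series(select_feature.scores_, index=x_train.columns)
print(chi2_scores.sort_values(ascending=False).head(5))   # the 5 best-scoring features
print('Selected columns:', x_train.columns[select_feature.get_support()].tolist())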
# random forest classifier with default parameters, trained on the 5 selected features
clf_rf_2 = RandomForestClassifier()
clf_rf_2 = clf_rf_2.fit(x_train_2, y_train)
ac_2 = accuracy_score(y_test,clf_rf_2.predict(x_test_2))
print('Accuracy is: ',ac_2)
cm_2 = confusion_matrix(y_test,clf_rf_2.predict(x_test_2))
sns.heatmap(cm_2,annot=True,fmt="d")
Recursive feature elimination (RFE) uses a classifier, a random forest in this case, to assign weights to the features; the features whose weights are smallest are pruned from the feature set, and the procedure is repeated recursively.
As before, we will keep 5 features, this time chosen by the RFE method.
from sklearn.feature_selection import RFE
# Create the RFE object and rank each feature
clf_rf_3 = RandomForestClassifier()
rfe = RFE(estimator=clf_rf_3, n_features_to_select=5, step=1)
rfe = rfe.fit(x_train, y_train)
print('Chosen best 5 feature by rfe:',x_train.columns[rfe.support_])
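Besides the boolean support mask, RFE also exposes a ranking_ attribute; printing it per column shows how close the discarded features were to being kept:
rfe_rank = pd.Series(rfe.ranking_, index=x_train.columns).sort_values()
print(rfe_rank)   # rank 1 = kept by RFE, larger values were eliminated earlier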
Now we know which 5 features to use, but why 5? Maybe it is better to use more or fewer features. We can figure that out thanks to the RFECV method shown below:
from sklearn.feature_selection import RFECV
# The "accuracy" scoring is proportional to the number of correct classifications
clf_rf_4 = RandomForestClassifier()
rfecv = RFECV(estimator=clf_rf_4, step=1, cv=5,scoring='accuracy') #5-fold cross-validation
rfecv = rfecv.fit(x_train, y_train)
print('Optimal number of features :', rfecv.n_features_)
print('Best features :', x_train.columns[rfecv.support_])
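It is also informative to plot the cross-validated accuracy against the number of selected features; in recent scikit-learn versions these scores live in rfecv.cv_results_ (older versions exposed grid_scores_ instead):
mean_scores = rfecv.cv_results_['mean_test_score']   # one entry per candidate feature count
plt.figure(figsize=(7, 5))
plt.plot(range(1, len(mean_scores) + 1), mean_scores, linewidth=2)
plt.xlabel('Number of features selected')
plt.ylabel('Cross-validated accuracy')
plt.show()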
# split data: train 70 % and test 30 % (this time using all original features)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# normalization (mean-centred, scaled by the range); for a stricter setup, the test set
# should be scaled with the training-set statistics to avoid information leakage
x_train_N = (x_train - x_train.mean()) / (x_train.max() - x_train.min())
x_test_N = (x_test - x_test.mean()) / (x_test.max() - x_test.min())
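An alternative to the manual scaling above is the StandardScaler imported at the top, fitted on the training split only (it standardises by the standard deviation rather than the range, so the numbers differ slightly):
scaler = StandardScaler().fit(x_train)   # learn mean/std on the training set only
x_train_S = pd.DataFrame(scaler.transform(x_train), columns=x_train.columns)
x_test_S = pd.DataFrame(scaler.transform(x_test), columns=x_test.columns)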
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(x_train_N)
plt.figure(1, figsize=(7, 7))
plt.clf()
plt.axes([.2, .2, .7, .7])
plt.plot(pca.explained_variance_ratio_, linewidth=2)
plt.axis('tight')
plt.xlabel('n_components')
plt.ylabel('explained_variance_ratio_')
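The scree plot above shows the per-component ratios; the cumulative sum makes it easier to decide how many components to keep (the 95 % line is an arbitrary reference of mine):
cum_var = np.cumsum(pca.explained_variance_ratio_)
plt.figure(figsize=(7, 5))
plt.plot(range(1, len(cum_var) + 1), cum_var, linewidth=2)
plt.axhline(0.95, color='grey', linestyle='--', label='95% of variance')
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance ratio')
plt.legend()
plt.show()
print('Components needed for 95% of the variance:', np.argmax(cum_var >= 0.95) + 1)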