# Import the relevant modules
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (accuracy_score, r2_score, mean_squared_error,
                             confusion_matrix, RocCurveDisplay, roc_auc_score)
warnings.filterwarnings("ignore")
Based on https://www.kaggle.com/code/kanncaa1/feature-selection-and-data-visualization
df = pd.read_csv('data.csv')
df
# Drop the target column ('diagnosis'), the empty 'Unnamed: 32' column and the patient 'id' from the feature matrix
dropped_cols = ['diagnosis', 'Unnamed: 32', 'id']
X = df.drop(dropped_cols, axis=1)
y = df.diagnosis
Useful DataFrame methods for a first look: .info(), .describe() and .drop().
Plan: split the data into malignant (M) and benign (B) cases and analyse them separately, plot the characteristics of both groups (a quick histogram sketch follows the group summaries below), and create a dummy variable for the M/B diagnosis (a minimal encoding sketch follows the value counts below).
df.info()
df.describe()
df.diagnosis.value_counts()
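The note above mentions creating a dummy variable for M and B; a minimal sketch (the variable name y_dummy is mine, and the rest of the notebook keeps the original string labels):
# Map the diagnosis strings to 0/1; shown for reference only
y_dummy = y.map({'M': 1, 'B': 0})   # 1 = malignant, 0 = benign
y_dummy.value_counts()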
ax = sns.countplot(x=y, label="count")   # bar chart of the class counts
count_B, count_M = y.value_counts()      # value_counts() is sorted, so B (the majority class) comes first
print('Benign:', count_B, ' Malignant:', count_M)
M = df[df["diagnosis"] == "M"]           # malignant cases
B = df[df["diagnosis"] == "B"]           # benign cases
M.describe()
B.describe()
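To "plot both characteristics", one quick option is to overlay the distribution of a single feature for the two groups (radius_mean is just an example; any column works):
plt.figure(figsize=(8, 5))
plt.hist(M['radius_mean'], bins=30, alpha=0.5, label='malignant', color='red')
plt.hist(B['radius_mean'], bins=30, alpha=0.5, label='benign', color='green')
plt.xlabel('radius_mean')
plt.ylabel('count')
plt.legend()
plt.show()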
Comparing two features only:
# concavity_worst vs concave points_worst
sns.jointplot(x=X.loc[:, 'concavity_worst'], y=X.loc[:, 'concave points_worst'], kind="reg", color="#ce1414")
Comparing more features:
sns.set(style="white")
df_worst = X.loc[:, ['radius_worst', 'perimeter_worst', 'area_worst']]  # subset of features to compare (avoids overwriting the original df)
g = sns.PairGrid(df_worst, diag_sharey=False)  # creating a PairGrid object
g.map_lower(sns.kdeplot, cmap="Blues_d")
g.map_upper(plt.scatter)
g.map_diag(sns.kdeplot, lw=3)
Swarm plot (copied from https://www.kaggle.com/code/kanncaa1/feature-selection-and-data-visualization):
sns.set(style="whitegrid", palette="muted")
data_dia = y # setting diagnosis (target) for representation
data = X # setting features for representation
data_n_2 = (data - data.mean()) / (data.std()) # standardization
data = pd.concat([y,data_n_2.iloc[:,0:10]],axis=1) # concatenating target and standardized features in a new df called data
# melt method
data = pd.melt(data,
id_vars="diagnosis", # column 1
var_name="features", # column 2
value_name='value') # column 3
plt.figure(figsize=(10,10))
sns.swarmplot(x="features", y="value", hue="diagnosis", data=data)
plt.xticks(rotation=90)
plt.show()
data = pd.concat([y,data_n_2.iloc[:,10:20]],axis=1) # concatenating target and standardized features in a new df called data
# melt method
data = pd.melt(data,
id_vars="diagnosis", # column 1
var_name="features", # column 2
value_name='value') # column 3
plt.figure(figsize=(10,10))
sns.swarmplot(x="features", y="value", hue="diagnosis", data=data)
plt.xticks(rotation=90)
plt.show()
data = pd.concat([y,data_n_2.iloc[:,20:31]],axis=1) # concatenating target and standardized features in a new df called data
# melt method
data = pd.melt(data,
id_vars="diagnosis", # column 1
var_name="features", # column 2
value_name='value') # column 3
plt.figure(figsize=(10,10))
sns.swarmplot(x="features", y="value", hue="diagnosis", data=data)
plt.xticks(rotation=90)
plt.show()
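The referenced Kaggle kernel also draws these comparisons as violin plots, which render faster than swarm plots on this many points; a sketch for the first ten features (separate variable names so the swarm-plot data frame above is untouched):
data_v = pd.concat([y, data_n_2.iloc[:, 0:10]], axis=1)
data_v = pd.melt(data_v, id_vars="diagnosis", var_name="features", value_name='value')
plt.figure(figsize=(10, 10))
sns.violinplot(x="features", y="value", hue="diagnosis", data=data_v, split=True, inner="quart")
plt.xticks(rotation=90)
plt.show()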
data.head() # showing the previously created df for clarification
Observing correlation between features by using a heatmap:
fig, ax = plt.subplots(figsize=(18,18))
sns.heatmap(X.corr(), annot=True, fmt='.1f', ax=ax)
For correlation, we visually check which features appear to be correlated with each other in the heatmap. For example, radius_mean, perimeter_mean and area_mean are strongly correlated, so we keep only area_mean (chosen based on the swarm plots; there is no single correct choice). We do the same with the rest of the correlated features and drop the ones we don't need:
drop_list = ['perimeter_mean','radius_mean','compactness_mean','concave points_mean','radius_se','perimeter_se','radius_worst','perimeter_worst','compactness_worst','concave points_worst','compactness_se','concave points_se','texture_worst','area_worst']
x_1 = X.drop(drop_list, axis = 1) # new df called x_1, without the useless features from the X df
x_1.head()
#correlation map
fig,ax = plt.subplots(figsize=(14, 14))
sns.heatmap(x_1.corr(), annot=True, linewidths=.5, fmt= '.1f',ax=ax)
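The drop list above was built by eye from the heatmap; as a sanity check, the strongly correlated pairs in the original feature set can also be listed programmatically (the 0.9 threshold is my arbitrary choice):
corr = X.corr().abs()
# keep only the upper triangle so each pair appears once
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
high_corr = upper.stack().sort_values(ascending=False)
print(high_corr[high_corr > 0.9])   # pairs with |correlation| above the threshold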
Now we use random forest classification to test the accuracy obtained with the chosen features:
# split data train 70 % and test 30 %
x_train, x_test, y_train, y_test = train_test_split(x_1, y, test_size=0.3, random_state=42)
from sklearn.ensemble import RandomForestClassifier
# random forest classifier with default parameters (n_estimators defaults to 100 in current scikit-learn)
clf_rf = RandomForestClassifier(random_state=43)
clf_rf = clf_rf.fit(x_train, y_train)
ac = accuracy_score(y_test, clf_rf.predict(x_test))
print('Accuracy is: ', ac)
cm = confusion_matrix(y_test, clf_rf.predict(x_test))
sns.heatmap(cm, annot=True, fmt="d")
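Accuracy alone hides the class balance; a per-class breakdown (precision and recall) can be printed as well, purely as an optional check:
from sklearn.metrics import classification_report
print(classification_report(y_test, clf_rf.predict(x_test)))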
In this method, we use SelectKBest, which removes all but the k highest-scoring features. We have to choose the value of k; here we take k=5 (an intuitive choice):
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# find best scored 5 features
select_feature = SelectKBest(chi2, k=5).fit(x_train, y_train)
print('Score list:', select_feature.scores_)
print('Feature list:', x_train.columns)
x_train_2 = select_feature.transform(x_train)
x_test_2 = select_feature.transform(x_test)
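The two print statements above list the chi-squared scores and the column names separately; pairing them makes it easier to see which 5 features were kept (a small optional reformatting of the same output):
chi2_scores = pd.Series(select_feature.scores_, index=x_train.columns)
print(chi2_scores.sort_values(ascending=False).head(5))   # the 5 best-scoring features
print('Selected columns:', x_train.columns[select_feature.get_support()].tolist())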
# random forest classifier with default parameters, trained on the 5 selected features
clf_rf_2 = RandomForestClassifier()
clf_rf_2 = clf_rf_2.fit(x_train_2, y_train)
ac_2 = accuracy_score(y_test,clf_rf_2.predict(x_test_2))
print('Accuracy is: ',ac_2)
cm_2 = confusion_matrix(y_test,clf_rf_2.predict(x_test_2))
sns.heatmap(cm_2,annot=True,fmt="d")
Recursive feature elimination (RFE) uses a classifier, a random forest in this case, to assign weights to the features; the features whose weights are smallest are pruned from the feature set, and the procedure is repeated recursively.
As before, we will keep 5 features, this time chosen by the RFE method.
from sklearn.feature_selection import RFE
# Create the RFE object and rank each feature
clf_rf_3 = RandomForestClassifier()
rfe = RFE(estimator=clf_rf_3, n_features_to_select=5, step=1)
rfe = rfe.fit(x_train, y_train)
print('Chosen best 5 feature by rfe:',x_train.columns[rfe.support_])
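Besides the boolean support mask, RFE also exposes a ranking_ attribute; printing it per column shows how close the discarded features were to being kept:
rfe_rank = pd.Series(rfe.ranking_, index=x_train.columns).sort_values()
print(rfe_rank)   # rank 1 = kept by RFE, larger values were eliminated earlier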
Now we know which 5 features to use, but why 5? Maybe it is better to use more or fewer features. We can figure that out thanks to the RFECV method shown below:
from sklearn.feature_selection import RFECV
# The "accuracy" scoring is proportional to the number of correct classifications
clf_rf_4 = RandomForestClassifier()
rfecv = RFECV(estimator=clf_rf_4, step=1, cv=5,scoring='accuracy') #5-fold cross-validation
rfecv = rfecv.fit(x_train, y_train)
print('Optimal number of features :', rfecv.n_features_)
print('Best features :', x_train.columns[rfecv.support_])
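It is also informative to plot the cross-validated accuracy against the number of selected features; in recent scikit-learn versions these scores live in rfecv.cv_results_ (older versions exposed grid_scores_ instead):
mean_scores = rfecv.cv_results_['mean_test_score']   # one entry per candidate feature count
plt.figure(figsize=(7, 5))
plt.plot(range(1, len(mean_scores) + 1), mean_scores, linewidth=2)
plt.xlabel('Number of features selected')
plt.ylabel('Cross-validated accuracy')
plt.show()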
# split data: train 70 % and test 30 % (this time using all original features)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# normalization (mean-centred, scaled by the range); for a stricter setup, the test set
# should be scaled with the training-set statistics to avoid information leakage
x_train_N = (x_train - x_train.mean()) / (x_train.max() - x_train.min())
x_test_N = (x_test - x_test.mean()) / (x_test.max() - x_test.min())
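An alternative to the manual scaling above is the StandardScaler imported at the top, fitted on the training split only (it standardises by the standard deviation rather than the range, so the numbers differ slightly):
scaler = StandardScaler().fit(x_train)   # learn mean/std on the training set only
x_train_S = pd.DataFrame(scaler.transform(x_train), columns=x_train.columns)
x_test_S = pd.DataFrame(scaler.transform(x_test), columns=x_test.columns)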
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(x_train_N)
plt.figure(1, figsize=(7, 7))
plt.clf()
plt.axes([.2, .2, .7, .7])
plt.plot(pca.explained_variance_ratio_, linewidth=2)
plt.axis('tight')
plt.xlabel('n_components')
plt.ylabel('explained_variance_ratio_')
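The scree plot above shows the per-component ratios; the cumulative sum makes it easier to decide how many components to keep (the 95 % line is an arbitrary reference of mine):
cum_var = np.cumsum(pca.explained_variance_ratio_)
plt.figure(figsize=(7, 5))
plt.plot(range(1, len(cum_var) + 1), cum_var, linewidth=2)
plt.axhline(0.95, color='grey', linestyle='--', label='95% of variance')
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance ratio')
plt.legend()
plt.show()
print('Components needed for 95% of the variance:', np.argmax(cum_var >= 0.95) + 1)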