Introduction¶
Description: This dataset is composed of 1100 samples with 30 features each. The first column is the sample id. The second column in the dataset represents the label. There are 2 possible values for the labels. The remaining columns are numeric features.
Your task is the following: assuming that you want to classify this data with a Random Forest (implemented by sklearn.ensemble.RandomForestClassifier), you should determine whether applying PCA to this dataset is useful, and if so what number of components you would choose to use. At the end of the analysis, you should have chosen an optimal strategy, including the optimal set of parameters for the classifier: write this choice explicitly in the conclusions of your notebook.
Your notebook should detail the procedure you have used to choose the optimal parameters (graphs are a good idea when possible/sensible).
The notebook will be evaluated not only based on the final results, but also on the procedure employed, which should balance practical considerations (one may not be able to exhaustively explore all possible combinations of the parameters) with the desire for achieving the best possible performance in the least amount of time.
Bonus points may be assigned for particularly clean/nifty code and/or well-presented results.
You are also free to attempt other strategies beyond the one in the assignment (which however is mandatory!).
Libraries¶
Import useful libraries
import pandas as pd
import sklearn
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.tree import plot_tree
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from scipy.stats import randint
from scipy.optimize import linear_sum_assignment
import matplotlib.pyplot as plt
import xgboost
from xgboost import XGBClassifier
from xgboost import plot_importance
import seaborn as sns
import numpy as np
%matplotlib inline
sns.set(color_codes=True)
Loading the data¶
We start off by loading and taking a first look at our dataset.
# Load the dataset
df = pd.read_csv('mldata_0003167526.csv', engine='python', index_col=0)
# Display the first few rows of the dataset
df.head()
label | feature_1 | feature_2 | feature_3 | feature_4 | feature_5 | feature_6 | feature_7 | feature_8 | feature_9 | ... | feature_21 | feature_22 | feature_23 | feature_24 | feature_25 | feature_26 | feature_27 | feature_28 | feature_29 | feature_30 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 4.057270 | 1.951432 | -2.014668 | -0.642949 | 0.650687 | -3.166441 | 0.675066 | -0.024935 | -0.244799 | ... | 0.879803 | -0.144238 | 1.975087 | -3.180973 | 0.502609 | 1.551031 | -3.166441 | -0.747862 | 1.554594 | 0.736315 |
1 | 0 | 3.867975 | 1.428813 | 1.978360 | -0.111222 | 2.052394 | 0.395007 | 0.824279 | -0.274835 | 0.356668 | ... | 2.253963 | -0.441242 | 0.884054 | -2.120698 | 0.767681 | 1.990281 | 0.395007 | 0.708836 | 0.957221 | -1.270081 |
2 | 1 | 2.980818 | 0.261838 | -3.313828 | -6.191561 | 1.140616 | 0.943470 | -0.797617 | -0.565640 | 0.380764 | ... | 1.010830 | 1.501111 | 0.457695 | -0.845371 | -0.074576 | 1.825136 | 0.943470 | -0.373409 | 1.732898 | -1.616465 |
3 | 1 | -15.612333 | -0.178304 | -4.713225 | 1.012077 | -0.728142 | -0.350978 | 1.997204 | -0.221699 | 0.899836 | ... | -0.191173 | -3.649844 | -0.232722 | -1.653924 | -0.053498 | 2.999686 | -0.350978 | -0.350323 | -4.144130 | 1.291735 |
4 | 1 | 0.987185 | -0.475980 | -4.189851 | -3.362096 | 1.594640 | -5.039295 | -0.645427 | 0.101655 | 0.464569 | ... | 0.116223 | -1.668353 | -0.080317 | 1.841277 | 1.362457 | 0.438424 | -5.039295 | -0.225977 | 0.985768 | -2.583778 |
5 rows × 31 columns
# Describe the dataframe
df.describe()
label | feature_1 | feature_2 | feature_3 | feature_4 | feature_5 | feature_6 | feature_7 | feature_8 | feature_9 | ... | feature_21 | feature_22 | feature_23 | feature_24 | feature_25 | feature_26 | feature_27 | feature_28 | feature_29 | feature_30 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 1100.000000 | 1100.000000 | 1100.000000 | 1100.000000 | 1100.000000 | 1100.000000 | 1100.000000 | 1100.000000 | 1100.000000 | 1100.000000 | ... | 1100.000000 | 1100.000000 | 1100.000000 | 1100.000000 | 1100.000000 | 1100.000000 | 1100.000000 | 1100.000000 | 1100.000000 | 1100.000000 |
mean | 0.499091 | -0.672008 | 0.091041 | -0.537824 | -0.278470 | 0.109553 | 0.541802 | 0.090083 | 0.099735 | 0.087648 | ... | 0.099789 | 0.570143 | 0.136376 | -0.414618 | 0.064678 | 0.610409 | 0.541802 | 0.125500 | 0.533639 | 0.158889 |
std | 0.500227 | 4.481494 | 1.011377 | 2.209936 | 2.109117 | 1.005550 | 2.441415 | 0.990888 | 1.023479 | 0.987395 | ... | 1.024063 | 2.345155 | 0.999228 | 2.267628 | 0.994216 | 2.194797 | 2.441415 | 0.997895 | 2.203774 | 2.094577 |
min | 0.000000 | -15.655886 | -2.733398 | -6.897339 | -9.068785 | -2.717827 | -7.064519 | -2.939766 | -2.834590 | -3.008175 | ... | -3.262510 | -6.296130 | -3.066167 | -9.140800 | -3.135697 | -5.847707 | -7.064519 | -3.205089 | -6.372749 | -7.240100 |
25% | 0.000000 | -3.472009 | -0.570993 | -1.941254 | -1.666970 | -0.576431 | -1.022132 | -0.587099 | -0.597444 | -0.620232 | ... | -0.590890 | -1.086519 | -0.550595 | -2.025955 | -0.546712 | -0.807645 | -1.022132 | -0.504176 | -0.899168 | -1.195587 |
50% | 0.000000 | -0.708800 | 0.069533 | -0.573898 | -0.447134 | 0.116504 | 0.649191 | 0.111437 | 0.092746 | 0.080980 | ... | 0.066514 | 0.566573 | 0.120195 | -0.457113 | 0.030411 | 0.586553 | 0.649191 | 0.103476 | 0.592584 | 0.151943 |
75% | 1.000000 | 2.326327 | 0.719795 | 0.959192 | 1.015581 | 0.802281 | 2.173787 | 0.742773 | 0.774012 | 0.759486 | ... | 0.813773 | 2.240195 | 0.840255 | 1.039940 | 0.780491 | 1.901821 | 2.173787 | 0.820564 | 1.980756 | 1.615480 |
max | 1.000000 | 13.270171 | 3.389469 | 6.370923 | 7.501342 | 3.125541 | 7.275244 | 3.289488 | 3.169168 | 2.994109 | ... | 3.340135 | 8.357498 | 3.646904 | 6.713087 | 3.131615 | 8.269118 | 7.275244 | 3.621707 | 8.333575 | 6.386495 |
8 rows × 31 columns
# Check content and data types
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1100 entries, 0 to 1099
Data columns (total 31 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   label       1100 non-null   int64  
 1   feature_1   1100 non-null   float64
 2   feature_2   1100 non-null   float64
 3   feature_3   1100 non-null   float64
 4   feature_4   1100 non-null   float64
 5   feature_5   1100 non-null   float64
 6   feature_6   1100 non-null   float64
 7   feature_7   1100 non-null   float64
 8   feature_8   1100 non-null   float64
 9   feature_9   1100 non-null   float64
 10  feature_10  1100 non-null   float64
 11  feature_11  1100 non-null   float64
 12  feature_12  1100 non-null   float64
 13  feature_13  1100 non-null   float64
 14  feature_14  1100 non-null   float64
 15  feature_15  1100 non-null   float64
 16  feature_16  1100 non-null   float64
 17  feature_17  1100 non-null   float64
 18  feature_18  1100 non-null   float64
 19  feature_19  1100 non-null   float64
 20  feature_20  1100 non-null   float64
 21  feature_21  1100 non-null   float64
 22  feature_22  1100 non-null   float64
 23  feature_23  1100 non-null   float64
 24  feature_24  1100 non-null   float64
 25  feature_25  1100 non-null   float64
 26  feature_26  1100 non-null   float64
 27  feature_27  1100 non-null   float64
 28  feature_28  1100 non-null   float64
 29  feature_29  1100 non-null   float64
 30  feature_30  1100 non-null   float64
dtypes: float64(30), int64(1)
memory usage: 275.0 KB
We now check whether there are any missing values. Luckily, we see there are none.
# Check if there are any missing values
df.isnull().any().any()
False
In summary, we have a dataset of 1100 rows and 31 columns, the first of which is the label column. All entries are floats, except for the labels, which only take the values 0 and 1. There are no null entries, and most features appear to have a mean close to 0.
Separating the data¶
Before proceeding on our inquiry, we must split the data into features (X) and labels (y).
# Separate features and labels
X = df.iloc[:, 1:]
y = df.iloc[:, 0]
X
feature_1 | feature_2 | feature_3 | feature_4 | feature_5 | feature_6 | feature_7 | feature_8 | feature_9 | feature_10 | ... | feature_21 | feature_22 | feature_23 | feature_24 | feature_25 | feature_26 | feature_27 | feature_28 | feature_29 | feature_30 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4.057270 | 1.951432 | -2.014668 | -0.642949 | 0.650687 | -3.166441 | 0.675066 | -0.024935 | -0.244799 | 1.099651 | ... | 0.879803 | -0.144238 | 1.975087 | -3.180973 | 0.502609 | 1.551031 | -3.166441 | -0.747862 | 1.554594 | 0.736315 |
1 | 3.867975 | 1.428813 | 1.978360 | -0.111222 | 2.052394 | 0.395007 | 0.824279 | -0.274835 | 0.356668 | -0.214423 | ... | 2.253963 | -0.441242 | 0.884054 | -2.120698 | 0.767681 | 1.990281 | 0.395007 | 0.708836 | 0.957221 | -1.270081 |
2 | 2.980818 | 0.261838 | -3.313828 | -6.191561 | 1.140616 | 0.943470 | -0.797617 | -0.565640 | 0.380764 | 1.595929 | ... | 1.010830 | 1.501111 | 0.457695 | -0.845371 | -0.074576 | 1.825136 | 0.943470 | -0.373409 | 1.732898 | -1.616465 |
3 | -15.612333 | -0.178304 | -4.713225 | 1.012077 | -0.728142 | -0.350978 | 1.997204 | -0.221699 | 0.899836 | 1.032810 | ... | -0.191173 | -3.649844 | -0.232722 | -1.653924 | -0.053498 | 2.999686 | -0.350978 | -0.350323 | -4.144130 | 1.291735 |
4 | 0.987185 | -0.475980 | -4.189851 | -3.362096 | 1.594640 | -5.039295 | -0.645427 | 0.101655 | 0.464569 | 1.007361 | ... | 0.116223 | -1.668353 | -0.080317 | 1.841277 | 1.362457 | 0.438424 | -5.039295 | -0.225977 | 0.985768 | -2.583778 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1095 | -0.486212 | -0.755237 | 1.496743 | -0.966680 | -0.604068 | 1.385061 | -1.097147 | -0.480347 | 0.247602 | 0.164477 | ... | -0.244126 | -0.926585 | 0.883256 | -1.476773 | -0.319275 | 0.976549 | 1.385061 | 1.045007 | 0.492219 | 1.704167 |
1096 | -0.162522 | 0.766001 | -2.277737 | 3.072013 | 0.743206 | -0.273250 | -0.132270 | -0.186075 | -0.030593 | 0.631806 | ... | -0.457899 | 1.125114 | 1.055859 | 0.493116 | 0.098920 | -1.877514 | -0.273250 | -0.923706 | -0.437961 | -0.530500 |
1097 | -12.316811 | -0.913487 | -2.676490 | -1.717323 | 0.428345 | 0.905332 | 1.127144 | -0.765455 | -0.061206 | -2.943096 | ... | 0.516651 | -0.499196 | -0.882315 | 2.148700 | -0.001280 | -1.127778 | 0.905332 | -1.627599 | -4.502604 | 0.401528 |
1098 | -7.206946 | -0.479993 | -2.920069 | -2.568712 | 0.039729 | 0.307855 | 2.759019 | 2.148201 | 2.335403 | -0.197616 | ... | 0.184319 | 4.321605 | -0.879819 | -1.527462 | 1.695145 | 3.285086 | 0.307855 | -0.299364 | -0.283931 | -2.531881 |
1099 | -15.655886 | 0.321223 | -1.724252 | 1.227165 | -0.179356 | 1.303814 | -0.533283 | 1.273955 | 1.290380 | 0.562309 | ... | 1.316120 | -1.555775 | 0.230611 | 1.661996 | -0.364953 | 0.999813 | 1.303814 | 0.060594 | -4.547707 | 1.162129 |
1100 rows × 30 columns
y
0       1
1       0
2       1
3       1
4       1
       ..
1095    0
1096    1
1097    1
1098    1
1099    1
Name: label, Length: 1100, dtype: int64
Data exploration¶
We start our EDA by checking the proportions of the labels in our dataframe. Indeed, an unbalanced label distribution (i.e., one class represented significantly more than the other) may negatively affect the model's performance, biasing it towards the majority class.
df['label'].value_counts(normalize=True)
0    0.500909
1    0.499091
Name: label, dtype: float64
We see that labels "1" and "0" occur with approximately equal frequency. This simplifies the analysis, since it allows us to skip any data-balancing step.
Now, we can take a look at the distributions of our features.
X.hist(bins=30, figsize=(20,15))
plt.show()
Our features seem to follow normal distributions with means approximately 0. We check this last statement more rigorously.
# Get the mean of each feature
feature_means = X.mean()
print(feature_means)
feature_1    -0.672008
feature_2     0.091041
feature_3    -0.537824
feature_4    -0.278470
feature_5     0.109553
feature_6     0.541802
feature_7     0.090083
feature_8     0.099735
feature_9     0.087648
feature_10    0.076066
feature_11    0.115036
feature_12    0.109672
feature_13   -0.601236
feature_14    0.070106
feature_15    0.130011
feature_16    0.112100
feature_17    0.135944
feature_18    0.114952
feature_19   -0.339674
feature_20    0.082764
feature_21    0.099789
feature_22    0.570143
feature_23    0.136376
feature_24   -0.414618
feature_25    0.064678
feature_26    0.610409
feature_27    0.541802
feature_28    0.125500
feature_29    0.533639
feature_30    0.158889
dtype: float64
# Flatten the DataFrame to a single series and compute the mean
overall_mean = X.values.flatten().mean()
print(overall_mean)
0.06546349266092971
Looking at the features' distributions, we see that some of them vary over different ranges. Tree-based models such as random forests are largely insensitive to feature scales, but PCA is not, so we perform feature scaling to put all features on the same scale.
# Standardize the features (zero mean, unit variance)
scaler = StandardScaler().fit(X)
X_scal = pd.DataFrame(scaler.transform(X), columns=X.columns)
We split our data into training data and validation data. This will be useful to train and test the models we will build in a succeeding section of the notebook.
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_scal, X_test_scal, y_train_scal, y_test_scal = train_test_split(X_scal, y, test_size=0.25, random_state=42)
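As a side note, since we saw above that the classes are nearly balanced, this is not critical here, but one could also stratify the split on the labels so that both sets preserve the class proportions exactly. A minimal sketch (the `_s` variable names are purely illustrative):
# Optional: a stratified split keeps the 0/1 proportions identical in the
# train and test sets (not critical here, the classes being almost balanced)
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)
print(y_train_s.value_counts(normalize=True))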
Let's now see how our previous graphs changed.
# Plot histograms
X_scal.hist(bins=30, figsize=(20,15))
plt.show()
It is clear that the ranges of the features' distributions are now comparable.
Let us compare on the same graph the distributions of the features after scaling, to see whether we must also carry out a step of normalization.
sns.displot(X_scal.iloc[:,:30], kde=True)
<seaborn.axisgrid.FacetGrid at 0x7fa2ca80c430>
Since the rescaled features appear to follow similar normal distributions centred at 0, we are satisfied with the scaling and skip any further normalization.
Correlation¶
One way to check whether PCA can be useful to our analysis is to inspect the correlation matrix of the data. Indeed, if we observe high correlations between features, we might want to intervene by applying principal component analysis.
correlation = X_train.corr()
plt.figure(figsize=(16, 12))
sns.heatmap(correlation, annot=True, fmt='.2f', annot_kws={'size': 6})
<Axes: >
A very small portion of features are highly correlated. In particular, feature_6 and feature_27 appear to have correlation 1, meaning they are perfectly linearly dependent. It is therefore sensible to filter out one of the two features, since it adds no information to the analysis. We will do this in the following section.
Most features have extremely low pairwise correlations, so we have no strong evidence supporting the application of PCA.
To better see how many feature pairs have a significant correlation, we plot the previous heatmap again, keeping only the cells with an absolute correlation above 0.3.
mask = np.abs(correlation) < 0.30
filtered_corr = correlation.mask(mask)
plt.figure(figsize=(16, 12))
sns.heatmap(filtered_corr, annot=True, fmt='.2f', annot_kws={'size': 6})
<Axes: >
Hence, only 8 pairs of features have an absolute correlation above 0.3. We will later check whether applying PCA to remove such correlations improves the performance of our model.
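To make this explicit, a small helper (a sketch using the `correlation` matrix computed above) can list the pairs whose absolute correlation exceeds 0.3:
# List the feature pairs whose absolute correlation exceeds 0.3
# (upper triangle only, to skip the diagonal and duplicate pairs)
upper = np.triu(np.ones(correlation.shape, dtype=bool), k=1)
corr_pairs = correlation.where(upper).stack()
high_corr = corr_pairs[corr_pairs.abs() > 0.30].sort_values(key=np.abs, ascending=False)
print(high_corr)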
Filter out correlated feature¶
As mentioned, feature_27 and feature_6 are essentially equivalent, since they have correlation 1. We therefore filter out one of the two columns (we choose feature_27).
X = X.drop("feature_27", axis=1)
X_scal = X_scal.drop("feature_27", axis=1)
We must also redefine the split data sets
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_scal, X_test_scal, y_train_scal, y_test_scal = train_test_split(X_scal, y, test_size=0.25, random_state=42)
Preliminary checks on PCA¶
We want to check beforehand whether PCA might prove useful to our analysis.
One of the most popular ways to visually choose the number of components is to look at how the cumulative explained variance evolves as more principal components are added.
pca = PCA(n_components=29)
fitted = pca.fit_transform(X_train)
components = pca.components_
cumsum_pca = np.cumsum(pca.explained_variance_ratio_)
plt.figure(figsize=(16, 4))
plt.plot(cumsum_pca, linewidth=3)
plt.xlabel("Components")
plt.ylabel("Explained Variance")
plt.title("Explained Variance vs Number of Principal Components")
plt.grid(True)
plt.show()
var = pca.explained_variance_ratio_[0:29]
labels = ["PC"+str(i+1) for i in range(29)]
plt.figure(figsize=(16,4))
plt.bar(labels, var)
plt.xlabel('Principal Components')
plt.ylabel('Proportion of Variance Explained')
plt.show()
Once again, there is no strong evidence in favour of applying PCA: the 'Explained Variance vs Number of Principal Components' plot does not show any clear elbow, and most components explain a non-negligible proportion of the variance. Only a handful of components explain a minimal proportion of variance, which is something we could address with PCA.
Suppose, for example, that we wanted to reduce correlations without dropping any columns or the information they contain. We can do this by setting a threshold of 95% explained variance and letting PCA determine the corresponding number of components.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
pca = PCA()
X_pca = pca.fit_transform(X_train_scaled)
total_explained_variance = pca.explained_variance_ratio_.cumsum()
# Count how many cumulative values already exceed 95%, then work back to the
# number of components needed to first reach that threshold
n_over_95 = len(total_explained_variance[total_explained_variance >= .95])
n_to_reach_95 = X.shape[1] - n_over_95 + 1
print('Number features: {}\tTotal Variance Explained: {}'.format(n_to_reach_95, total_explained_variance[n_to_reach_95-1]))
Number features: 25 Total Variance Explained: 0.9590994466683971
25 components out of 29 are needed to explain 95% of the variance. This is a large proportion, which once again makes us doubt the usefulness of PCA here.
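As a side note, scikit-learn can select this number directly: passing a fraction to `n_components` keeps the smallest number of components whose cumulative explained variance reaches that fraction. A minimal sketch using the scaled training data from above:
# PCA with a float n_components keeps the smallest number of components
# whose cumulative explained variance reaches that fraction
pca_95 = PCA(n_components=0.95)
pca_95.fit(X_train_scaled)
print("Components needed to explain 95% of the variance:", pca_95.n_components_)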
We now keep the first two principal components of our data and visually observe whether we can infer any separation between the two label classes.
# Create lists of indices of the data with labels 0 and 1 respectively
l0 = df[df['label'] == 0].index
l1 = df[df['label'] == 1].index
# Apply PCA
pca2D = PCA(n_components=2)
data_2D = pca2D.fit_transform(X_scal)
# Plotting
fig, ax = plt.subplots(figsize=(8,8))
ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)
ax.set_title('2D plot after PCA', fontsize = 20)
# Specify the labels for the classes
classes = [0, 1]
colors = ['b', 'r']
# Plot each class separately
for label, color in zip(classes, colors):
indices_to_keep = df['label'] == label
ax.scatter(data_2D[indices_to_keep, 0],
data_2D[indices_to_keep, 1],
c = color,
alpha = 0.3)
ax.legend(classes)
ax.grid()
plt.show()
This 2D plot isn't able to depict a clear distinction between the two classes of data. Let us try with a 3D representation instead.
# Apply PCA
pca3D = PCA(n_components = 3)
data_3D = pca3D.fit_transform(X_scal)
# Plotting
fig = plt.figure(figsize=(8,8))
ax = fig.add_subplot(111, projection='3d')
ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)
ax.set_zlabel('Principal Component 3', fontsize = 15)
ax.set_title('3D plot after PCA', fontsize = 20)
# Specify the labels for the classes
classes = [0, 1]
colors = ['b', 'r']
# Plot each class separately
for label, color in zip(classes, colors):
indices_to_keep = df['label'] == label
ax.scatter(data_3D[indices_to_keep, 0],
data_3D[indices_to_keep, 1],
data_3D[indices_to_keep, 2],
c = color,
alpha = 0.3)
ax.legend(classes)
plt.show()
Once again, we gain little insight from this graphical representation of the data.
Having failed to draw conclusions from the PCA projections, we try t-SNE instead.
t-SNE¶
t-SNE, short for t-Distributed Stochastic Neighbor Embedding, is a popular dimensionality reduction technique widely used in data visualization and exploratory data analysis. It is particularly effective for visualizing high-dimensional data in a lower-dimensional space, typically 2D or 3D, while preserving the local structure and relationships between data points.
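Note that t-SNE embeddings are sensitive to the `perplexity` parameter (default 30 in scikit-learn), so when in doubt it is worth comparing a few values. A quick sketch, using the scaled features and labels defined above:
# Compare 2D t-SNE embeddings for a few perplexity values side by side
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
for ax, perp in zip(axes, [10, 30, 50]):
    emb = TSNE(n_components=2, perplexity=perp, random_state=42).fit_transform(X_scal)
    ax.scatter(emb[:, 0], emb[:, 1], c=y, cmap='coolwarm', alpha=0.3, s=10)
    ax.set_title(f'perplexity = {perp}')
plt.show()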
from sklearn.manifold import TSNE
# Fit and transform with t-SNE
model = TSNE(n_components=2, random_state=42)
embedded = model.fit_transform(X_scal)
# Prepare the data for visualization
df_tsne = pd.DataFrame(data = embedded, columns = ['Component 1', 'Component 2'])
df_tsne = pd.concat([df_tsne, df[['label']]], axis = 1)
# Visualize the data
fig = plt.figure(figsize = (8,8))
ax = fig.add_subplot(1,1,1)
ax.set_xlabel('Component 1', fontsize = 15)
ax.set_ylabel('Component 2', fontsize = 15)
ax.set_title('2D Visualization of data after t-SNE', fontsize = 20)
classes = [0, 1]
colors = ['b', 'r']
for label, color in zip(classes,colors):
indicesToKeep = df_tsne['label'] == label
ax.scatter(df_tsne.loc[indicesToKeep, 'Component 1'],
df_tsne.loc[indicesToKeep, 'Component 2'],
c = color,
alpha = 0.3)
ax.legend(classes)
ax.grid()
from mpl_toolkits.mplot3d import Axes3D
# Fit and transform with t-SNE
model = TSNE(n_components=3, random_state=42)
embedded = model.fit_transform(X)
# Prepare the data for visualization
df_tsne = pd.DataFrame(data = embedded, columns = ['Component 1', 'Component 2','Component 3'])
df_tsne = pd.concat([df_tsne, df[['label']]], axis = 1)
# Visualize the data
fig = plt.figure(figsize = (8,8))
ax = fig.add_subplot(111, projection='3d')
ax.set_xlabel('Component 1', fontsize = 15)
ax.set_ylabel('Component 2', fontsize = 15)
ax.set_zlabel('Component 3', fontsize = 15)
ax.set_title('3D Visualization of data after t-SNE', fontsize = 20)
classes = [0, 1]
colors = ['b', 'r']
for label, color in zip(classes,colors):
indicesToKeep = df_tsne['label'] == label
ax.scatter(df_tsne.loc[indicesToKeep, 'Component 1'],
df_tsne.loc[indicesToKeep, 'Component 2'],
df_tsne.loc[indicesToKeep, 'Component 3'],
c = color,
alpha = 0.3)
ax.legend(classes)
plt.show()
This dimensionality reduction technique seems to give a slightly better visualization compared to the one generated by PCA. However, it is still not enough for us to draw significant conclusions on the trends of the data.
Data clustering¶
We conclude our data exploration by applying clustering techniques that allow us to check if we can find any natural separation between classes of the dataset.
K-means¶
K-means is a popular unsupervised machine learning algorithm that groups data into K distinct clusters based on the data attributes. We try applying a 2-means algorithm on the dataset and see if the clustering generated is somehow linked to the labelling of the data.
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(X_scal)
plt.scatter(X_scal.iloc[:,0], X_scal.iloc[:,1], c=kmeans.labels_, cmap='viridis')
plt.show()
/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning warnings.warn(
We want to compare the K-means clustering with the provided labels and compute some metrics to evaluate their agreement: ideally, the 2-means clustering would separate the data with label 0 from the data with label 1 as accurately as possible.
df['cluster'] = kmeans.labels_
# We generate the confusion matrix for the predicted and true labels
cm = confusion_matrix(df['cluster'],y)
# The linear_sum_assignment function finds the best matching between cluster
# indices and true labels (maximizing the diagonal of the confusion matrix)
rows, cols = linear_sum_assignment(cm, maximize=True)
# Map each point's cluster index to the true label its cluster matches best
predicted_labels = cols[df['cluster']]
# Now, we can compute the metrics
print('Confusion matrix:', cm)
print('Accuracy:', accuracy_score(df['cluster'],y))
print('Precision:', precision_score(df['cluster'],y))
Confusion matrix: [[256 339]
 [295 210]]
Accuracy: 0.42363636363636364
Precision: 0.3825136612021858
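Note that cluster indices are arbitrary, so the metrics above compare the raw cluster labels to the true labels without matching them first. Remapping each cluster to its best-matching label with the assignment computed above gives an agreement of 634/1100 ≈ 0.58, only slightly better than chance; a short sketch:
# Recompute the agreement after remapping cluster indices to their
# best-matching true labels (see predicted_labels above)
print('Accuracy after cluster/label matching:', accuracy_score(y, predicted_labels))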
Unfortunately, the clustering does not seem to find any grouping of the data that is usefully related to the class labels.
Spectral clustering¶
Spectral Clustering takes a similarity matrix between the data points and creates a low-dimensional embedding from it, then extracts clusters similarly to k-means. Some of the advantages of Spectral Clustering are that it doesn't make any assumptions on the form or size of the clusters, unlike K-means which assumes spherical clusters. Also, it can identify clusters in any arbitrary shape which is a beneficial feature in certain applications.
# Apply Spectral Clustering
spectral_model = SpectralClustering(n_clusters=2, affinity='nearest_neighbors')
labels_spectral = spectral_model.fit_predict(X_scal)
# Add labels to the dataframe
df['cluster'] = labels_spectral
# We generate the confusion matrix for the predicted and true labels
cm = confusion_matrix(df['cluster'],y)
# The linear_sum_assignment function again finds the best matching between
# cluster indices and true labels
rows, cols = linear_sum_assignment(cm, maximize=True)
predicted_labels = cols[df['cluster']]
# Now, we can compute the metrics
print('Confusion matrix:', cm)
print('Accuracy:', accuracy_score(df['cluster'],y))
print('Precision:', precision_score(df['cluster'],y))
Confusion matrix: [[241 242]
 [310 307]]
Accuracy: 0.49818181818181817
Precision: 0.5591985428051002
This clustering method also achieves essentially chance-level agreement: even after optimally matching cluster indices to labels, the agreement is only 552/1100 ≈ 0.50.
plt.scatter(X_scal.iloc[:,0], X_scal.iloc[:,1], c=labels_spectral, cmap='viridis')
plt.show()
In short, we failed to find clear visual distinctions between the two classes. The aim was probably too ambitious: it is difficult to find 2D or 3D representations of high-dimensional data that capture the underlying class separation.
Random forest¶
In this section I evaluate and compare the performance of a random forest with and without the application of PCA. By testing the classifier in both settings, I can observe whether PCA proves useful to our analysis.
Without PCA¶
Baseline accuracy¶
We first create a Random Forest classifier without PCA or any hyperparameter tuning and measure its performance. This gives us a baseline accuracy.
We first try it on the raw data:
# Initialize classifier
base_rf = RandomForestClassifier(random_state=42, n_jobs=-1)
# Fit on training data
base_rf.fit(X_train, y_train)
# Predict on test data
y_pred = base_rf.predict(X_test)
# Print accuracy
print(f'Baseline accuracy: {accuracy_score(y_pred, y_test)}')
Baseline accuracy: 0.9136363636363637
Now, we try it on our rescaled data:
# Fit on rescaled training data
base_rf.fit(X_train_scal,y_train_scal)
# Predict on rescaled test data
y_pred = base_rf.predict(X_test_scal)
# Print accuracy
print(f'Baseline accuracy: {accuracy_score(y_pred, y_test_scal)}')
Baseline accuracy: 0.8872727272727273
We see that the performance is largely independent of the choice of dataset (if anything, it gets slightly worse on the rescaled data; note also that the scaled split uses a 25% test set, so the two numbers are not strictly comparable). We therefore keep using the original dataset for convenience.
Random forest parameters¶
We are now interested in finding the optimal parameters for our random forest classifier. To do so, we use Random Search, which is often preferred to Grid Search because, for a fixed budget, it covers the parameter space with far fewer fits.
Throughout the notebook, during cross validation I select a number of iterations that is just sufficient to find acceptable models without letting the code run for too long: a tradeoff between run time and accuracy. Note also that the parameter ranges evaluated here have already been heavily narrowed down through several previous searches omitted from the notebook.
# Define the parameter distributions for random search
param_dist = {
'max_depth': [None],
'criterion': ['gini'],
'n_estimators': [525, 550, 575],
'max_features': ['sqrt'],
'min_samples_split': [2, 3, 4],
'min_samples_leaf': [1,2],
'bootstrap': [True]
}
# Initialize a RandomForestClassifier
rf = RandomForestClassifier(random_state=42)
# Initialize the RandomizedSearchCV object
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, cv=4, scoring='accuracy', n_iter=15, random_state=42, verbose=2, n_jobs=-1)
# Fit the RandomizedSearchCV object to the data
random_search.fit(X_train, y_train)
# Get the parameters of the best model
best_params = random_search.best_params_
# Print the best parameters
print("Best parameters: ", best_params)
# Train and evaluate the model with the best parameters
nopca_rf = RandomForestClassifier(**best_params, random_state = 42)
nopca_rf.fit(X_train, y_train)
Fitting 4 folds for each of 15 candidates, totalling 60 fits
Best parameters:  {'n_estimators': 575, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None, 'criterion': 'gini', 'bootstrap': True}
RandomForestClassifier(n_estimators=575, random_state=42)
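Before evaluating on the test set, it can be informative to inspect how the top candidates of the search ranked, to see how flat the accuracy landscape is around the chosen configuration; a short sketch using the search results:
# Inspect the top-ranked candidates of the randomized search
cv_results = pd.DataFrame(random_search.cv_results_)
cols_to_show = ['rank_test_score', 'mean_test_score', 'std_test_score', 'params']
print(cv_results.sort_values('rank_test_score')[cols_to_show].head())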
# Make predictions on the test data
y_pred = nopca_rf.predict(X_test)
# Print the classification report
print("Classification report:\n", classification_report(y_test, y_pred))
# Print the confusion matrix
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
# Print the accuracy score
print("Accuracy score: ", accuracy_score(y_test, y_pred))
Classification report:
               precision    recall  f1-score   support

           0       0.94      0.89      0.92       114
           1       0.89      0.94      0.92       106

    accuracy                           0.92       220
   macro avg       0.92      0.92      0.92       220
weighted avg       0.92      0.92      0.92       220

Confusion matrix:
 [[102  12]
 [  6 100]]
Accuracy score:  0.9181818181818182
We achieved a similar accuracy to the baseline case.
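Since a single train/test split can be noisy, a quick sanity check is to also look at the cross-validated accuracy of the chosen configuration on the training data; a minimal sketch:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy of the selected configuration on the training
# set, as a rough check that the held-out estimate above is not a fluke
cv_scores = cross_val_score(nopca_rf, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
print("CV accuracy: %.3f +/- %.3f" % (cv_scores.mean(), cv_scores.std()))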
We now plot one of the trees in our random forest.
fig, ax = plt.subplots(figsize=(80,40))
plot_tree(nopca_rf.estimators_[0], feature_names = X_train.columns, class_names=['0', '1'], filled=True, ax=ax)
Let us rank the features of our dataset by relative importance.
# Get feature importance from the Random Forest model
feature_importances = nopca_rf.feature_importances_
# Get corresponding feature names from the training data
feature_names = X_train.columns
# Create a dictionary that maps feature names to their importance scores
importance_dict = {"Varname": feature_names, "Importance": feature_importances}
# Convert the dictionary to a pandas DataFrame
imp_df = pd.DataFrame(importance_dict)
# Display the DataFrame
imp_df.sort_values(by="Importance", ascending=False)
Varname | Importance | |
---|---|---|
28 | feature_30 | 0.147045 |
5 | feature_6 | 0.091088 |
3 | feature_4 | 0.071733 |
18 | feature_19 | 0.057151 |
25 | feature_26 | 0.056825 |
27 | feature_29 | 0.055307 |
12 | feature_13 | 0.046430 |
23 | feature_24 | 0.041076 |
21 | feature_22 | 0.040604 |
2 | feature_3 | 0.039601 |
15 | feature_16 | 0.038369 |
0 | feature_1 | 0.026396 |
19 | feature_20 | 0.025786 |
10 | feature_11 | 0.025092 |
11 | feature_12 | 0.018902 |
4 | feature_5 | 0.017947 |
8 | feature_9 | 0.017877 |
7 | feature_8 | 0.017382 |
13 | feature_14 | 0.016264 |
6 | feature_7 | 0.016086 |
17 | feature_18 | 0.015738 |
16 | feature_17 | 0.015681 |
24 | feature_25 | 0.015584 |
22 | feature_23 | 0.015388 |
20 | feature_21 | 0.014617 |
14 | feature_15 | 0.014489 |
9 | feature_10 | 0.014387 |
1 | feature_2 | 0.014194 |
26 | feature_28 | 0.012960 |
We see that feature_30 has a noticeably higher relative importance than the other features.
We plot the relative importance of all features to look at its distribution.
# Plotting the results of Feature Importance
fig = plt.figure(figsize=(8, 6))
plt.plot(feature_importances)
plt.ylabel('Relative Importance')
plt.xlabel('Features')
plt.title('Feature Importance using Random Forests')
Text(0.5, 1.0, 'Feature Importance using Random Forests')
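Impurity-based importances can be biased towards features offering many split points; as a complementary check, one could compute permutation importances on the held-out set. A possible sketch using the fitted model and split from above:
from sklearn.inspection import permutation_importance

# Permutation importance on the held-out set: how much accuracy drops when each
# feature is randomly shuffled (a complementary, model-agnostic view of importance)
perm = permutation_importance(nopca_rf, X_test, y_test, n_repeats=10,
                              random_state=42, n_jobs=-1)
perm_imp = pd.Series(perm.importances_mean, index=X_test.columns).sort_values(ascending=False)
print(perm_imp.head(10))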
With PCA¶
We have mentioned that there is little evidence supporting the application of PCA in our analysis. However, for the sake of completeness, let's apply it to compare the accuracy of the classifier with and without PCA.
We proceed by building a pipeline to find the optimal number of PCA components before applying the random forest classifier to our data. Once again, the hyperparameters selected for the Random Search have been previously narrowed down outside of this notebook.
pipe = Pipeline([
('scaler', StandardScaler()),
('pca', PCA()),
('rf', RandomForestClassifier())])
param_distributions = {
'pca__n_components': [5, 10, 15, 20, 25, 28],
'rf__n_estimators': [575],
'rf__max_features': ['sqrt'],
'rf__max_depth' : [None, 1500],
'rf__criterion' :['gini'],
'rf__min_samples_split': [2, 3, 4],
'rf__min_samples_leaf': [1, 2],
'rf__bootstrap': [True]
}
random_search = RandomizedSearchCV(pipe, param_distributions=param_distributions, n_iter=50, cv=3, verbose=2, random_state=42, n_jobs=-1)
random_search.fit(X_train, y_train)
Fitting 3 folds for each of 50 candidates, totalling 150 fits
RandomizedSearchCV(cv=3, estimator=Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA()), ('rf', RandomForestClassifier())]), n_iter=50, n_jobs=-1, param_distributions={'pca__n_components': [5, 10, 15, 20, 25, 28], 'rf__bootstrap': [True], 'rf__criterion': ['gini'], 'rf__max_depth': [None, 1500], 'rf__max_features': ['sqrt'], 'rf__min_samples_leaf': [1, 2], 'rf__min_samples_split': [2, 3, 4], 'rf__n_estimators': [575]}, random_state=42, verbose=2)
We print the optimal parameters:
best_params = random_search.best_params_
print(best_params)
{'rf__n_estimators': 575, 'rf__min_samples_split': 3, 'rf__min_samples_leaf': 2, 'rf__max_features': 'sqrt', 'rf__max_depth': 1500, 'rf__criterion': 'gini', 'rf__bootstrap': True, 'pca__n_components': 28}
# Keep only the random-forest hyperparameters found by the search
best_params_adjusted = {key.replace('rf__', ''): value for key, value in best_params.items() if 'rf__' in key}
# Train and evaluate the model with the best parameters.
# Note: this refits a plain random forest on the raw features, i.e. without the
# scaler/PCA steps of the pipeline selected by the search.
pca_rf = RandomForestClassifier(**best_params_adjusted, random_state = 42)
pca_rf.fit(X_train, y_train)
RandomForestClassifier(max_depth=1500, min_samples_leaf=2, min_samples_split=3, n_estimators=575, random_state=42)
Evaluate the model with the best parameters found:
y_pred = pca_rf.predict(X_test)
# Print the classification report
print("Classification report:\n", classification_report(y_test, y_pred))
# Print the confusion matrix
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
# Print the accuracy score
print("Accuracy score: ", accuracy_score(y_test, y_pred))
Classification report:
               precision    recall  f1-score   support

           0       0.94      0.88      0.91       114
           1       0.88      0.94      0.91       106

    accuracy                           0.91       220
   macro avg       0.91      0.91      0.91       220
weighted avg       0.91      0.91      0.91       220

Confusion matrix:
 [[100  14]
 [  6 100]]
Accuracy score:  0.9090909090909091
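Note that pca_rf above was refit on the raw features using only the random-forest hyperparameters, so the scaler and PCA steps selected by the search are not actually applied at prediction time. To score the full pipeline that the search chose, one could evaluate the fitted best estimator directly; a minimal sketch:
# Score the complete fitted pipeline (StandardScaler -> PCA -> RandomForest)
# selected by the randomized search, so PCA is actually applied at prediction time
best_pipe = random_search.best_estimator_
print("Pipeline (scaler + PCA + RF) accuracy:", accuracy_score(y_test, best_pipe.predict(X_test)))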
Check, once again, feature importance in the dataset:
importances = pca_rf.feature_importances_
features = X_train.columns
indices = np.argsort(importances)
plt.figure(figsize=(10,10))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), features[indices])
plt.xlabel('Relative Importance')
plt.show()
The random forest following the PCA search is slightly less accurate than the one built previously. As anticipated, PCA seems unable to further improve the accuracy of our analysis. There are various possible reasons for this, which we discuss in the conclusions.
XGBoost (extra)¶
A popular alternative classifier to Random Forest is Gradient Boosting: a powerful technique that creates a prediction model in the form of an ensemble of weaker prediction models, typically decision trees.
I am interested in applying this model to our data in search of a more accurate classifier. Once again, I compare its performance with and without PCA.
Without PCA¶
We tune the XGBoost's parameters using a random search. Once again, the hyperparameters have been previously narrowed down outside of the notebook.
import random
# Tuning hyperparameters and storing model
# (note: `verbose` is not an XGBClassifier parameter, hence the warning in the
# output below; the correct name would be `verbosity`)
estimator = XGBClassifier(random_state=42, verbose=3)
# Define the parameter grid
params = {
'max_depth': randint(10,100),
'min_child_weight': [1,2],
'learning_rate': [0.001,0.01, 0.1, 0.5],
'n_estimators': randint(100,1000),
'colsample_bynode': [0.1, 0.25, 0.5, 0.75, 1],
'subsample': [0.5, 0.75, 1]
}
# Initialize the RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=estimator, param_distributions=params, cv=5, scoring='accuracy', verbose=3, n_jobs=-1, n_iter=100)
# Fit RandomizedSearchCV
random_search.fit(X_train, y_train)
# Get the best parameters
best_params = random_search.best_params_
print(f"Best parameters: {best_params}")
# Get the best score
best_score = random_search.best_score_
print(f"Best score: {best_score}")
Fitting 5 folds for each of 100 candidates, totalling 500 fits
[10:34:26] WARNING: ../src/learner.cc:767: Parameters: { "verbose" } are not used.
Best parameters: {'colsample_bynode': 0.75, 'learning_rate': 0.01, 'max_depth': 18, 'min_child_weight': 1, 'n_estimators': 737, 'subsample': 0.75}
Best score: 0.8965909090909092
# Train and evaluate the model with the best parameters
nopca_xgb = XGBClassifier(**best_params)
nopca_xgb.fit(X_train, y_train)
# Make predictions on the test data
y_pred = nopca_xgb.predict(X_test)
# Print the classification report
print("Classification report:\n", classification_report(y_test, y_pred))
# Print the confusion matrix
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
# Print the accuracy score
print("Accuracy score: ", accuracy_score(y_test, y_pred))
Classification report:
               precision    recall  f1-score   support

           0       0.94      0.87      0.90       114
           1       0.87      0.94      0.90       106

    accuracy                           0.90       220
   macro avg       0.91      0.91      0.90       220
weighted avg       0.91      0.90      0.90       220

Confusion matrix:
 [[ 99  15]
 [  6 100]]
Accuracy score:  0.9045454545454545
The accuracy obtained with XGBoost is very close to that achieved by the random forest.
We now plot the feature importances according to XGBoost.
# Get feature importances
importances = nopca_xgb.feature_importances_
# Convert the importances into one-dimensional 1darray with corresponding df column names as axis labels
f_importances = pd.Series(importances, X.columns)
# Sort the array in descending order of the importances
f_importances.sort_values(ascending=False, inplace=True)
# Make the bar Plot from f_importances
f_importances[:].plot(x='Features', y='Importance', kind='bar', figsize=(16,9), rot=45)
plt.tight_layout()
plt.title("Most Important Features")
plt.show()
With PCA¶
We repeat the previous steps, but apply PCA first.
pipe = Pipeline([
('scaler', StandardScaler()),
('pca', PCA()),
('clf', XGBClassifier())
])
# Parameter grid for both PCA and XGBClassifier
param_grid = {
'pca__n_components': [15, 20, 28],
'clf__n_estimators': [100, 200, 400],
'clf__learning_rate': [0.05, 0.002],
'clf__max_depth': [10,20,50],
'clf__min_child_weight': [1, 2],
'clf__colsample_bynode': [0.2, 0.4],
'clf__subsample': [0.9, 1]
}
# Perform randomized search
random_search = RandomizedSearchCV(pipe, param_grid, cv=5, scoring='accuracy', n_iter = 100, n_jobs=-1, verbose=3, random_state=42)
# Fit the model
random_search.fit(X_train, y_train)
# Get the best parameters
best_params = random_search.best_params_
print(f"Best parameters: {best_params}")
# Get the best score
best_score = random_search.best_score_
print(f"Best score: {best_score}")
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters: {'pca__n_components': 28, 'clf__subsample': 1, 'clf__n_estimators': 400, 'clf__min_child_weight': 1, 'clf__max_depth': 10, 'clf__learning_rate': 0.05, 'clf__colsample_bynode': 0.2}
Best score: 0.8965909090909092
# Keep only the classifier hyperparameters found by the search
best_params_adjusted = {key.replace('clf__', ''): value for key, value in best_params.items() if 'clf__' in key}
# Train and evaluate the model with the best parameters.
# Note: as before, this refits the classifier on the raw features, without the
# scaler/PCA steps of the selected pipeline.
pca_xgb = XGBClassifier(**best_params_adjusted)
pca_xgb.fit(X_train, y_train)
# Make predictions on the test data
y_pred = pca_xgb.predict(X_test)
# Print the classification report
print("Classification report:\n", classification_report(y_test, y_pred))
# Print the confusion matrix
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
# Print the accuracy score
print("Accuracy score: ", accuracy_score(y_test, y_pred))
Classification report:
               precision    recall  f1-score   support

           0       0.96      0.85      0.90       114
           1       0.86      0.96      0.91       106

    accuracy                           0.90       220
   macro avg       0.91      0.91      0.90       220
weighted avg       0.91      0.90      0.90       220

Confusion matrix:
 [[ 97  17]
 [  4 102]]
Accuracy score:  0.9045454545454545
Check, once again, feature importance in the dataset:
# Generate component names
component_names = ["Component " + str(i+1) for i in range(best_params['pca__n_components'])]
# Select the importances corresponding to the selected components
# (note: `importances` was computed above from the model fitted on the original
# features, so these values are only indicative)
importances_selected = importances[:best_params['pca__n_components']]
# Convert the importances into a series with corresponding component names as labels
f_importances = pd.Series(importances_selected, component_names)
# Sort the array in descending order of the importances
f_importances.sort_values(ascending=False, inplace=True)
# Make the bar plot from f_importances
f_importances.plot(kind='bar', figsize=(16, 9), rot=45)
plt.tight_layout()
plt.title("Component Importances")
plt.show()
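If one wanted importances that genuinely refer to principal components, they could instead be read from the classifier step inside the fitted best pipeline, which was trained on the PCA-transformed data; a possible sketch:
# Importances of the classifier step inside the fitted best pipeline: that
# classifier was trained on the PCA-transformed data, so these values
# correspond to principal components
best_pipe = random_search.best_estimator_
pca_step, clf_step = best_pipe.named_steps['pca'], best_pipe.named_steps['clf']
component_importances = pd.Series(
    clf_step.feature_importances_,
    index=[f"Component {i+1}" for i in range(pca_step.n_components_)]
).sort_values(ascending=False)
print(component_importances.head(10))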
Like before, the search kept 28 PCA components. The accuracy of XGBoost remained essentially unchanged.
Making predictions¶
After trying different prediction models, we found that the best performing one was the random forest without PCA, with an accuracy of about 92%. We therefore apply this classifier to the hold-out dataset.
df_valid = pd.read_csv("mldata_0003167526.TEST_FEATURES.csv", engine='python', index_col=0)
df_valid
feature_1 | feature_2 | feature_3 | feature_4 | feature_5 | feature_6 | feature_7 | feature_8 | feature_9 | feature_10 | ... | feature_21 | feature_22 | feature_23 | feature_24 | feature_25 | feature_26 | feature_27 | feature_28 | feature_29 | feature_30 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
id | |||||||||||||||||||||
0 | 5.169152 | -0.422545 | -0.166745 | -2.160405 | -0.541668 | 1.535590 | -0.883458 | -0.898003 | 1.029121 | -1.164223 | ... | 0.081774 | 3.640189 | 0.126593 | -2.753048 | 0.181393 | -2.345920 | 1.535590 | -1.830572 | 2.396734 | 1.641938 |
1 | 4.372384 | 2.106584 | 1.661014 | -0.591197 | -1.318891 | 0.440874 | -1.890186 | -0.539446 | 1.067210 | -0.019939 | ... | 1.605968 | 1.500657 | -0.852812 | -2.094574 | 0.111139 | 0.483712 | 0.440874 | -0.903571 | -0.721683 | 1.683998 |
2 | -1.414379 | -0.496314 | 0.921714 | -1.385249 | -0.993750 | 2.204314 | 0.827298 | -0.483309 | -1.317939 | -0.584120 | ... | 0.335746 | 2.644945 | -1.005737 | -4.504952 | -0.374361 | 0.244277 | 2.204314 | 0.080471 | 0.324959 | 2.060959 |
3 | -3.978457 | -1.320719 | -0.725139 | -1.897857 | 1.409653 | -1.259921 | -0.445250 | -2.360164 | 0.515266 | 1.181887 | ... | 0.645354 | 0.205342 | -0.312366 | -2.892793 | -0.650078 | 4.034582 | -1.259921 | 1.668748 | 1.222397 | 1.759645 |
4 | -4.583941 | 0.093019 | 0.691001 | 0.370484 | 1.765657 | 3.863709 | 0.445114 | -0.489995 | 0.325797 | 1.215284 | ... | 2.014121 | -2.584151 | 0.224861 | -0.229516 | -0.205261 | 0.248366 | 3.863709 | -0.664600 | -0.351065 | 2.872313 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1095 | -1.644332 | 0.370689 | -4.082042 | -2.806278 | -0.739130 | -0.303571 | 1.121895 | -0.107321 | -0.201497 | 0.054933 | ... | 0.033633 | -0.544219 | -1.665967 | -0.606563 | -0.099780 | 2.368125 | -0.303571 | -0.605207 | 1.177338 | 1.779320 |
1096 | -2.753484 | -0.718869 | 0.166363 | 0.114012 | -1.720009 | 2.818790 | 0.858724 | -0.262194 | -0.627256 | -0.456870 | ... | 0.115490 | 1.839830 | -0.399719 | -2.354364 | 0.554073 | 2.675105 | 2.818790 | 0.279674 | 3.836591 | 5.181900 |
1097 | -0.657574 | -1.834577 | -1.073452 | -2.571303 | 0.381434 | 3.320756 | 1.413899 | 0.740743 | 0.300032 | -1.042624 | ... | 0.446898 | -1.517231 | -0.081561 | -1.106731 | -0.867004 | 0.111508 | 3.320756 | -0.365547 | 1.008189 | -1.055714 |
1098 | -6.986736 | 0.991637 | -2.033800 | -2.381164 | 0.392070 | -0.939122 | 0.840815 | 0.805626 | 0.484091 | 0.604074 | ... | 1.558575 | 0.966182 | -1.426791 | 0.562497 | -0.346274 | 0.084159 | -0.939122 | -2.753483 | -2.483206 | 0.284520 |
1099 | -1.765428 | 1.092728 | 2.187913 | -1.109001 | 0.683123 | 0.640776 | -1.047383 | 0.194471 | -0.358661 | 0.861884 | ... | 1.395711 | 1.767878 | 0.493274 | 4.737294 | 0.138322 | -2.017459 | 0.640776 | 1.422159 | -1.262667 | 1.681476 |
1100 rows × 30 columns
Once again, we must drop feature_27, this time from the hold-out dataset.
df_valid = df_valid.drop("feature_27", axis=1)
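Before predicting, a quick sanity check that the hold-out features line up with the columns the model was trained on:
# Sanity check: the hold-out features must expose exactly the same columns,
# in the same order, as the features the model was trained on
assert list(df_valid.columns) == list(X.columns), "column mismatch between training and hold-out data"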
We now predict the labels.
y_pred = nopca_rf.predict(df_valid)
y_pred
array([0, 0, 0, ..., 1, 1, 0])
Out of curiosity, we check the label counts in our predictions.
unique_values, frequencies = np.unique(y_pred, return_counts=True)
for value, count in zip(unique_values, frequencies):
print(f"Number: {value}, Frequency: {count}")
Number: 0, Frequency: 548
Number: 1, Frequency: 552
print(np.array2string(y_pred, threshold=np.inf))
[0 0 0 1 0 1 1 0 1 1 0 1 0 1 1 0 1 1 1 0 1 1 0 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 0 1 0 0 0 1 1 0 1 0 1 1 0 1 0 0 1 1 1 1 1 1 0 1 1 0 1 1 1 1 0 1 1 0 0 0 1 0 0 0 1 1 0 1 0 0 1 1 1 1 1 1 1 0 1 0 1 0 1 0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 1 0 1 0 0 0 0 0 0 1 1 0 1 0 1 0 1 1 1 1 0 1 0 1 1 0 1 1 0 1 0 0 0 0 1 1 0 0 0 0 1 1 0 1 0 0 0 0 1 1 0 1 1 1 0 0 1 0 0 0 0 1 1 1 0 0 1 1 0 0 1 0 1 0 0 1 0 1 1 0 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 0 1 1 1 0 0 1 1 1 1 1 1 1 0 1 0 0 0 1 1 0 1 0 1 1 0 0 1 1 0 1 0 1 1 1 0 1 0 0 1 1 0 1 0 0 0 1 1 1 1 0 0 1 0 0 0 1 1 0 0 1 1 1 1 0 1 1 0 1 1 0 1 0 1 0 0 0 0 1 1 1 0 0 1 0 0 1 0 0 0 0 0 1 0 1 0 0 1 0 1 1 1 1 1 1 0 1 0 0 0 1 1 0 1 0 1 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 1 1 1 0 0 0 1 0 0 0 1 1 0 0 1 1 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 1 0 1 0 1 0 1 1 1 1 1 1 1 1 0 0 0 1 1 1 0 1 1 0 1 0 0 0 0 0 1 0 0 1 1 0 0 0 0 1 1 1 1 1 1 0 0 1 0 1 0 1 1 1 0 1 0 1 0 1 1 0 1 0 0 1 1 1 1 1 0 0 0 1 0 0 0 1 0 1 1 0 0 1 0 0 0 1 1 0 1 1 1 1 0 1 1 1 0 1 0 1 0 0 0 1 1 0 0 1 1 0 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 1 0 1 1 1 1 0 1 1 1 0 0 0 0 1 1 0 0 1 1 0 0 1 1 1 0 0 1 1 0 1 0 1 0 0 1 0 1 1 0 1 1 0 1 0 1 0 0 1 1 0 0 1 1 1 1 1 0 0 0 1 0 1 0 1 0 0 0 0 0 1 0 0 1 0 1 1 0 1 1 0 1 0 1 1 1 0 1 0 1 0 1 1 0 1 1 1 0 0 0 0 1 1 0 1 0 1 0 1 0 0 1 1 0 0 0 1 0 0 0 1 1 1 0 1 1 1 0 1 0 1 0 1 0 0 1 1 1 1 1 1 1 0 0 1 0 0 1 1 1 0 1 0 1 1 0 0 1 1 0 0 1 1 0 1 1 1 0 1 0 0 0 0 0 1 1 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 1 1 1 1 0 1 1 1 1 0 0 0 0 1 0 1 0 0 1 0 1 1 0 0 1 0 0 1 0 0 0 1 1 1 0 0 0 0 1 1 0 1 1 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 1 1 1 1 0 1 1 0 0 0 1 0 1 1 0 0 0 1 0 1 0 1 0 0 0 1 1 1 0 0 1 0 1 1 1 0 0 1 0 0 1 1 0 0 1 1 1 0 1 1 1 1 1 1 1 1 0 1 0 0 1 0 1 0 1 0 0 0 0 0 0 1 0 1 0 0 1 0 1 1 1 1 0 1 0 1 0 0 0 1 1 1 0 0 1 1 0 1 1 1 1 1 0 0 0 0 0 0 1 1 1 1 1 1 1 0 1 0 1 0 1 1 1 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 1 0 1 1 1 1 0 0 1 0 0 1 0 0 1 1 1 0 0 0 0 0 0 0 1 1 1 1 0 0 1 0 1 1 0 1 0 1 1 0 0 1 1 0 0 1 0 0 1 0 0 1 1 1 1 0 1 1 1 0 1 0 1 0 1 1 1 0 0 1 0 0 1 0 1 1 1 0 0 1 1 0 0 0 1 1 1 0 1 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 1 0 1 1 0 0 1 0 1 0 0 1 1 0]
np.savetxt('test_predictions.txt', y_pred, fmt="%.0f")
Conclusion¶
To summarize: I first loaded the dataset, separated it into labels and features, carried out an initial EDA, and then looked for evidence supporting the application of PCA. The correlation heatmap and the "Explained Variance vs Number of Principal Components" graph both hinted that no PCA was needed: correlations between features were low almost everywhere, and there was no clear elbow in the explained-variance curve. In addition, 95% of the variance was explained by 25 out of 29 components (29, and not 30, since one feature was filtered out in the correlation section). This means that most of the features are essential to describe the structure of the dataset.
However, out of curiosity, I still applied PCA to check whether it could slightly improve the performance of a random forest. Without PCA, the random forest achieved an accuracy of approximately 92%, so the model alone was already quite accurate. When pipelining, the optimal number of PCA components returned by the search was 28, i.e. the dimensionality was reduced only marginally, again suggesting that PCA is of little use on this dataset. The random forest built after that search achieved an accuracy very close to, yet slightly lower than, the previous one.
I repeated the same procedure with XGBoost instead of Random Forest, to try to gain better predictive accuracy, but I still obtained an accuracy of around 90%, both with and without PCA.
As anticipated, PCA was not useful in this analysis: at best it reduced the dimensionality of the data by one component without improving the accuracy of the predictions. A likely reason is that PCA only removes linear redundancy between features, and here the features show very low pairwise correlations, so there is little such redundancy to exploit.
Since the random forest without PCA proved more accurate than all other models (accuracy ≈ 91.8% on the held-out test set), it is the strategy I selected for the final predictions. Explicitly, the chosen model is a RandomForestClassifier trained on the original (unscaled) features, without PCA, with the parameters found by the random search: n_estimators=575, criterion='gini', max_depth=None, max_features='sqrt', min_samples_split=2, min_samples_leaf=1, bootstrap=True (random_state=42).