Overfitting in imbalanced dataset

Question

I am working on a dataset related to an insurance company and the objective is to predict if the insurance buyer will claim their travel insurance or not.
Training data: https://raw.githubusercontent.com/dphi-official/Datasets/master/travel_insurance/Training_set_label.csv
Testing data: https://raw.githubusercontent.com/dphi-official/Datasets/master/travel_insurance/Testing_set_label.csv
I have written the code below so far. When I submit the predictions on the website (it's similar to Kaggle), I get an F1 score of 2.8850777368167977, while the 5-fold cross-validation F1 score is around 74%. I read that in severely imbalanced datasets there is not much SMOTE can do if the variation within the minority class is large. (Source) However, there are some entries on the website with F1 scores of 12-15. Any suggestions on how I can improve the F1 score?
  # Fill the missing values in the "Gender" column with that column's mode.
  # The result is assigned back instead of calling fillna(inplace=True) on a
  # column selection, because that chained form may operate on a temporary
  # object and leave the DataFrame unchanged.
  # NOTE(review): test_df is imputed with its own mode, as in the original;
  # consider reusing the training mode so both sets are treated identically.
  train_df['Gender'] = train_df['Gender'].fillna(train_df['Gender'].mode()[0])
  test_df['Gender'] = test_df['Gender'].fillna(test_df['Gender'].mode()[0])

  # "Duration" has negative values, which cannot be valid for a length of time.
  # Overwrite them with the column median using a single .loc assignment.
  # The previous version wrote `~train_df['Duration']<0`, where `~` binds
  # tighter than `<` and is therefore applied to the numeric column itself,
  # and it assigned into a filtered copy before concatenating, which also
  # moved the corrected rows to the end of the frame.
  duration_median = train_df['Duration'].median()
  train_df.loc[train_df['Duration'] < 0, 'Duration'] = duration_median

  # Same idea for "Net Sales" and "Age" column. Repeat this for test_df
  ...

# Split off the target: pop() removes the "Claim" column from train_df in
# place and returns it, so what remains in train_df is the feature matrix.
y = train_df.pop('Claim')
X = train_df
    
 # applying the transformations in cross validation

from sklearn.model_selection import StratifiedKFold

# Stratified 5-fold splitter used for the outer cross-validation loop.
cv = StratifiedKFold(n_splits=5)

# Containers for per-fold scores and the best validation F1 seen so far.
valid_f1, train_f1 = [], []
f1_global = 0

# Convert features and target to NumPy arrays; the fold loop selects rows
# with the integer index arrays produced by cv.split.
X, y = np.array(X), np.array(y)

# Hyper-parameter search space for the SVC. It is identical for every fold,
# so it is built once before the loop.
# The strength of the regularization is inversely proportional to C, so small
# C values are used to increase regularization and avoid overfitting.
tuned_parameters = [
    {'kernel': ['rbf'], 'gamma': [1e-2, 1e-3, 1e-4],
     'C': [0.001, 0.010, 0.0001]},
    {'kernel': ['sigmoid'], 'gamma': [1e-2, 1e-3, 1e-4],
     'C': [0.001, 0.010, 0.0001]},
    {'kernel': ['linear'], 'C': [0.001, 0.010, 0.0001]},
]

# Outer cross-validation: every transformer is fitted on the training part of
# the fold only and then applied to the validation part.
for train_index, valid_index in cv.split(X, y):
    X_train, y_train = X[train_index], y[train_index]
    X_valid, y_valid = X[valid_index], y[valid_index]

    # apply encoders
    encoder = ce.JamesSteinEncoder()
    encoded_train = encoder.fit_transform(X_train, y_train)
    encoded_valid = encoder.transform(X_valid)

    # apply PCA to get the required number of features
    pca = PCA(0.99)
    reduced_encoded_train = pca.fit_transform(encoded_train)
    reduced_encoded_valid = pca.transform(encoded_valid)

    n_features = pca.n_components_
    print("Number of features selected:", n_features)

    # applying quantile transformation
    qt = QuantileTransformer(random_state=0)
    train_num_transformed = qt.fit_transform(reduced_encoded_train)
    valid_num_transformed = qt.transform(reduced_encoded_valid)

    # applying robust scaler
    rs = RobustScaler()
    train_num_scaled = rs.fit_transform(train_num_transformed)
    valid_num_scaled = rs.transform(valid_num_transformed)

    # apply SMOTE to the training part of the fold only.
    # sampling_strategy is passed by keyword, and fit_resample replaces the
    # removed fit_sample alias.
    bsmen = BorderlineSMOTE(sampling_strategy='minority', random_state=12)
    X_train_smen, y_train_smen = bsmen.fit_resample(train_num_scaled, y_train)

    # model using SVC; this instance is only the template for the search.
    # The earlier stand-alone svc.fit(...) was dropped because the search
    # fits its own estimators and the tuned one is what gets saved below.
    svc = SVC()

    # fine tuning Support Vector
    # NOTE(review): the inner cv=5 splits X_train_smen, which already contains
    # SMOTE-generated samples, so the inner F1 used for model selection is
    # computed partly on synthetic points. Putting the sampler and the SVC in
    # an imblearn Pipeline would resample inside each inner fold instead.
    tuned_SVC = BayesSearchCV(svc, tuned_parameters, cv=5, scoring='f1',
                              random_state=12, n_jobs=-1)
    tuned_SVC.fit(X_train_smen, y_train_smen)

    # validation dataset predicting
    valid_predictions = tuned_SVC.predict(valid_num_scaled)

    # training dataset predicting (on the resampled training data)
    train_predictions = tuned_SVC.predict(X_train_smen)

    # calculating f1_score on validation predictions
    valid_f1_scor = f1_score(y_valid, valid_predictions)
    print("******")
    print("Support Vector:")
    print("Validation F1 score:", valid_f1_scor)

    # calculating f1_score on training predictions
    train_f1_scor = f1_score(y_train_smen, train_predictions)
    print("Training F1 score:", train_f1_scor)

    # keep the per-fold scores so they can be summarised after the loop
    valid_f1.append(valid_f1_scor)
    train_f1.append(train_f1_scor)

    # saving results of SVC as it is performing the best
    if valid_f1_scor > f1_global:
        # Save the tuned model that produced valid_f1_scor (not the untuned
        # template) together with the fitted encoder, reducer, transformer
        # and scaler from the same fold, so they stay consistent.
        artifacts = {
            'finalized_model.sav': tuned_SVC.best_estimator_,
            'finalized_dim_reducer.sav': pca,
            'finalized_encoder.sav': encoder,
            'finalized_transformer.sav': qt,
            'finalized_scaler.sav': rs,
        }
        for filename, fitted_obj in artifacts.items():
            # "with" closes each file even if pickling raises
            with open(filename, 'wb') as fh:
                pickle.dump(fitted_obj, fh)

        # only raise the bar when this fold actually beat the previous best
        f1_global = valid_f1_scor

Overfitting in imbalanced dataset

Add your own answers!

Ask a Question