Skip to main content

Full code for building the ml models

Code for building the machine learning models:

Full Code of All Data

1. Write a problem statement

2. Problem Description

3. Import the required libraries
## Import the necessary modules
## To read and manipulate the data/dataframe
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## For Data Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, LabelEncoder

## For Modelling
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC, SVR

## Evaluation metrics
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score, f1_score,recall_score


4. Read the dataset 
## Load the CSV into a DataFrame; cells containing "?" are parsed as missing (NaN)
data = pd.read_csv(filepath_or_buffer="Filename", na_values=["?"])

5. Check the dimensions of the data like rows and columns
## Report the dimensions of the data: row count first, then column count
print(f"Dataset has {data.shape[0]} rows, and {data.shape[1]} columns")

6. Check the datatype of each variable
## Check the datatype of each variable
## (`dtypes` holds one pandas dtype per column; as a bare expression the
## result is only echoed by an interactive session or notebook cell)
data.dtypes

7. Exploratory Data Analysis
## Check the top 5 observations
data.head()
## Check the last 5 observations
data.tail()

## Check the basic summary statistics of the data
data.describe()

## Check the number of unique levels in each attribute.
## `DataFrame.nunique` returns the count of distinct values per column.
data.nunique()

8. Target attribute distribution


## Absolute frequency of each class in the target variable
data["target"].value_counts()

## Relative frequency of each class in the target variable, in percent
100 * data["target"].value_counts(normalize=True)

## NOTE(review): the column names below ('Disposition', 'Claim Type',
## 'Claim Amount', 'City') are specific to one dataset -- adjust them to the
## columns of the file actually loaded into `data`.

## Countplot for target variable.
## Columns are referenced by name through `x=`/`y=` and the frame through
## `data=`; the frame is `data`, the only DataFrame this script creates.
sns.countplot(x='Disposition', data=data)
plt.show()  # finish this figure so the next plot gets its own axes

## Box plot for Numerical Columns.
sns.boxplot(x='Claim Type', y='Claim Amount', data=data)
plt.show()

## Countplot for Meaningful variable.
sns.countplot(x='City', data=data)
plt.show()

9. Data Pre-Processing

## Drop the columns which are not significant (here: the "ID" column), in place
data.drop(columns=["ID"], inplace=True)

## Check the shape of the data after the drop
data.shape

## Identify the Categorical Columns and store them in a variable cat_cols and numerical into num_cols
## The lists are derived from the dtypes pandas inferred, so the imputers,
## scaler and encoder further down never receive an empty column selection.
## 'target' is left out because it is split off as y before preprocessing.
## NOTE(review): numerically coded categoricals land in num_cols -- move them
## to cat_cols by hand if the dataset has any.
_feature_cols = data.columns.drop('target', errors='ignore')
num_cols = data[_feature_cols].select_dtypes(include='number').columns.tolist()
cat_cols = data[_feature_cols].select_dtypes(exclude='number').columns.tolist()

## Convert all the categorical columns to appropriate data type
## (pandas' 'category' dtype)
data[cat_cols] = data[cat_cols].astype('category')

## Check the data types
## (the DataFrame attribute is `dtypes`)
data.dtypes

# Separate the response (y) from the predictors (X)
y = data['target']
X = data.drop(columns=['target'])

## Print the shape of X and y
print(X.shape, y.shape)

## Hold out 20% of the rows as the test set; the split is stratified on y
## and seeded so it is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
    stratify=y,
)

## Shapes of X_train, X_test, y_train, y_test, one per line in that order
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')
10. Target attribute distribution

## Class distribution of y_train: raw counts, then percentages
print(y_train.value_counts(normalize=False))
print(100 * y_train.value_counts(normalize=True))
## Class distribution of y_test, as percentages
print(100 * y_test.value_counts(normalize=True))

11. Handling Missing Data

## Number of missing values per column in the training features
X_train.isna().sum(axis=0)
## Number of missing values per column in the test features
X_test.isna().sum(axis=0)
12. Missing Value Imputation
## Impute the Categorical columns with "mode"
## The imputer is fitted on the training split only and then applied to both
## splits, so no statistic is computed from the test rows.
si_cat = SimpleImputer(strategy='most_frequent')
si_cat.fit(X_train[cat_cols])

## `transform` returns a plain array; rebuild DataFrames that keep the column
## names and the original row index of each split.
X_train_cat = pd.DataFrame(si_cat.transform(X_train[cat_cols]), columns=cat_cols, index=X_train.index)
X_test_cat = pd.DataFrame(si_cat.transform(X_test[cat_cols]), columns=cat_cols, index=X_test.index)
## Impute the Numerical columns with "median"
si_num = SimpleImputer(strategy='median')
si_num.fit(X_train[num_cols])

X_train_num = pd.DataFrame(si_num.transform(X_train[num_cols]), columns=num_cols, index=X_train.index)
X_test_num = pd.DataFrame(si_num.transform(X_test[num_cols]), columns=num_cols, index=X_test.index)


13. Standardization for Numerical Columns

## Standardize the numerical columns.
## The scaler works on the imputed frames (X_train_num / X_test_num) so the
## missing-value imputation is actually used; it is fitted on the training
## split only and applied to both splits.
ss = StandardScaler()
ss.fit(X_train_num)
X_train_std = ss.transform(X_train_num)
X_test_std = ss.transform(X_test_num)

14. OneHotEncoding for Categorical Columns

## One-hot encode the categorical columns.
## The encoder works on the imputed frames (X_train_cat / X_test_cat), which
## hold exactly the cat_cols columns. handle_unknown='ignore' encodes a
## category that appears only in the test split as all zeros instead of
## raising an error.
ohe = OneHotEncoder(handle_unknown='ignore')
ohe.fit(X_train_cat)

## `transform` returns a sparse matrix; densify it for the later concatenation
X_train_ohe = ohe.transform(X_train_cat).toarray()
X_test_ohe = ohe.transform(X_test_cat).toarray()

print(X_train_ohe.shape)
print(X_test_ohe.shape)

15. Label Encoding
  
## Label encoding for the target column: the class-to-integer mapping is
## learned from the training labels and applied to both splits
le = LabelEncoder().fit(y_train)
y_train_le = le.transform(y_train)
y_test_le = le.transform(y_test)
y_train_le
y_test_le

16. Concatenation
## Join the standardized numeric block and the one-hot block column-wise,
## separately for the train and the test split
X_train_con = np.concatenate((X_train_std, X_train_ohe), axis=1)
X_test_con = np.concatenate((X_test_std, X_test_ohe), axis=1)

print(X_train_con.shape)
print(X_test_con.shape)

17. Model Building
## Logistic Regression with class weights balanced against class frequency
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(class_weight='balanced')
log_reg.fit(X=X_train_con, y=y_train_le)

## Predictions on both splits
y_pred_train_log = log_reg.predict(X_train_con)
y_pred_test_log = log_reg.predict(X_test_con)

## Confusion matrix on the training split (true labels first, then predictions)
cm = confusion_matrix(y_true=y_train_le, y_pred=y_pred_train_log)
cm
## Per-class precision / recall / F1: train first, then test
print(classification_report(y_train_le, y_pred_train_log))
print(classification_report(y_test_le, y_pred_test_log))

  
  
## Decision tree limited to depth 3, seeded for repeatable results
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(random_state=123, max_depth=3)
clf.fit(X=X_train_con, y=y_train_le)

## Predictions on both splits
y_pred_train_DT = clf.predict(X_train_con)
y_pred_test_DT = clf.predict(X_test_con)

## Per-class precision / recall / F1: train first, then test
print(classification_report(y_train_le, y_pred_train_DT))
print(classification_report(y_test_le, y_pred_test_DT))

  
  
## For KNeighbors Classifier
from sklearn.neighbors import KNeighborsClassifier

## k-nearest-neighbours classifier using the 5 closest training points
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X=X_train_con, y=y_train_le)

## Predictions on both splits
y_pred_train_knn = knn.predict(X_train_con)
y_pred_test_knn = knn.predict(X_test_con)

## Per-class precision / recall / F1: train first, then test
print(classification_report(y_train_le, y_pred_train_knn))
print(classification_report(y_test_le, y_pred_test_knn))
 
 

## For Naive Bayes Classifier
from sklearn.naive_bayes import GaussianNB

## Gaussian Naive Bayes fitted on the concatenated training features.
## The fit call sits at top level like every other statement in the script.
nbc = GaussianNB()
nbc.fit(X_train_con, y_train_le)

## Predictions on both splits
y_pred_train_nbc = nbc.predict(X_train_con)
y_pred_test_nbc = nbc.predict(X_test_con)

## Per-class precision / recall / F1: train first, then test
print(classification_report(y_train_le, y_pred_train_nbc))
print(classification_report(y_test_le, y_pred_test_nbc))



## For Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

## The forest is seeded with the same value used for the train/test split and
## the decision tree, so repeated runs give the same model and the same report.
rfc = RandomForestClassifier(random_state=123)
rfc.fit(X_train_con, y_train_le)

## Predictions on both splits
y_pred_train_rfc = rfc.predict(X_train_con)
y_pred_test_rfc = rfc.predict(X_test_con)

## Per-class precision / recall / F1: train first, then test
print(classification_report(y_train_le, y_pred_train_rfc))
print(classification_report(y_test_le, y_pred_test_rfc))

Comments