
Full Code for Building the Machine Learning Models

1. Write a problem statement

2. Describe the problem

3. Import the required libraries
## Import the necessary modules
## To read and manipulate the data/dataframe
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## For Data Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, LabelEncoder

## For Modelling
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC, SVR

## Evaluation metrics
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, recall_score


4. Read the dataset 
## Read the dataset
data = pd.read_csv("Filename", na_values="?")  ## replace "Filename" with the path to your data file

5. Check the dimensions of the data like rows and columns
## Check the dimensions
print('Dataset has ' + str(data.shape[0]) + ' rows and ' + str(data.shape[1]) + ' columns')

6. Check the datatype of each variable
## Check the datatype of each variable
data.dtypes

7. Exploratory Data Analysis
## Check the top 5 observations
data.head()
## Check the last 5 observations
data.tail()

## Check the basic summary statistics of the data
data.describe()

## Check the number of unique levels in each attribute.
data.nunique()

8. Target attribute distribution


## Check for value counts in target variable
data.target.value_counts()

## Check for distribution of values in target variable.
data.target.value_counts(normalize=True)*100

## Countplot for the target variable.
sns.countplot(x='Disposition', data=data)

## Box plot of a numerical column against a categorical column.
sns.boxplot(x='Claim Type', y='Claim Amount', data=data)

## Countplot for a meaningful categorical variable.
sns.countplot(x='City', data=data)

9. Data Pre-Processing

## Drop the columns which are not significant
data.drop(["ID"], axis = 1, inplace=True)

## Check the shape of the data
data.shape

## Identify the categorical columns and store them in cat_cols, and the numerical columns in num_cols
## (fill these in manually, or infer them from the dtypes as sketched below)
num_cols = []
cat_cols = []
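A minimal sketch for inferring the two lists from the column dtypes, assuming the object-typed columns are the categorical ones (verify against your data dictionary, and remove the target column from whichever list it lands in):

## Infer numerical and categorical columns from the dtypes
num_cols = data.select_dtypes(include=np.number).columns.tolist()
cat_cols = data.select_dtypes(include=['object', 'category']).columns.tolist()

## Drop the target from the feature lists if it was picked up
num_cols = [c for c in num_cols if c != 'target']
cat_cols = [c for c in cat_cols if c != 'target']
print(num_cols, cat_cols)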

## Convert all the categorical columns to the appropriate data type
data[cat_cols] = data[cat_cols].astype('category')

## Check the data types
data.dtypes

## Split the data into X and y
X=data.drop(['target'],axis=1)
y=data['target']

## Print the shape of X and y
print(X.shape,y.shape)
 
## Split the data into X_train, X_test, y_train, y_test with test_size = 0.20
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 123,stratify=y)

## Shape of all X_train, y_train, X_test, y_test
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

10. Target attribute distribution in train and test

## Check for distribution of target in y_train
print(y_train.value_counts())
print(y_train.value_counts(normalize=True)*100)
## Check for distribution of target values in y_test
print(y_test.value_counts(normalize=True)*100)

11. Handling Missing Data

## Check the null values in train
X_train.isna().sum()
## Check the null values in test
X_test.isna().sum()
12. Missing Value Imputation
## Impute the Categorical columns with "mode"
si_cat = SimpleImputer(strategy='most_frequent')
si_cat.fit(X_train[cat_cols])

X_train_cat = pd.DataFrame(si_cat.transform(X_train[cat_cols]), columns=cat_cols)
X_test_cat = pd.DataFrame(si_cat.transform(X_test[cat_cols]), columns=cat_cols)
## Impute the Numerical columns with "median"
si_num = SimpleImputer(strategy='median')
si_num.fit(X_train[num_cols])

X_train_num = pd.DataFrame(si_num.transform(X_train[num_cols]), columns=num_cols)
X_test_num = pd.DataFrame(si_num.transform(X_test[num_cols]), columns=num_cols)


13. Standardization for Numerical Columns

## Fit the scaler on the imputed training data only
ss = StandardScaler()
ss.fit(X_train_num)
X_train_std = ss.transform(X_train_num)
X_test_std = ss.transform(X_test_num)

14. OneHotEncoding for Categorical Columns

## handle_unknown='ignore' avoids errors on categories unseen during fit
ohe = OneHotEncoder(handle_unknown='ignore')
ohe.fit(X_train_cat)

X_train_ohe = ohe.transform(X_train_cat).toarray()
X_test_ohe = ohe.transform(X_test_cat).toarray()

print(X_train_ohe.shape)
print(X_test_ohe.shape)

15. Label Encoding
  
## LABEL ENCODING FOR TARGET COLUMN
le=LabelEncoder()
le.fit(y_train) 
y_train_le=le.transform(y_train)
y_test_le=le.transform(y_test)
y_train_le
y_test_le
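If the original class labels are needed later (for example, when reporting predictions), the fitted LabelEncoder can invert the mapping; a small sketch:

## The learned mapping: encoded value i corresponds to le.classes_[i]
print(le.classes_)

## Recover the original labels from the encoded ones
print(le.inverse_transform(y_test_le))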

16. Concatenation
## Concatenate the standardized numerical features with the one-hot encoded categorical features
X_train_con = np.concatenate([X_train_std, X_train_ohe], axis=1)
X_test_con = np.concatenate([X_test_std, X_test_ohe], axis=1)

print(X_train_con.shape)
print(X_test_con.shape)

17. Model Building
## For Logistic Regression
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(class_weight='balanced')
log_reg.fit(X_train_con,y_train_le)
 
y_pred_test_log = log_reg.predict(X_test_con)
y_pred_train_log= log_reg.predict(X_train_con)

cm = confusion_matrix(y_train_le, y_pred_train_log)
print(cm)
print(classification_report(y_train_le,y_pred_train_log))
print(classification_report(y_test_le,y_pred_test_log))
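The individual metrics imported in step 3 (accuracy_score, recall_score, f1_score) can also be reported; a quick sketch on the logistic regression test predictions (for multi-class targets, an explicit average such as 'weighted' is needed):

## Report individual evaluation metrics on the test set
print('Test accuracy:', accuracy_score(y_test_le, y_pred_test_log))
print('Test recall  :', recall_score(y_test_le, y_pred_test_log, average='weighted'))
print('Test F1-score:', f1_score(y_test_le, y_pred_test_log, average='weighted'))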

  
  
## For DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(max_depth =3, random_state = 123)
clf.fit(X_train_con, y_train_le)

y_pred_train_DT=clf.predict(X_train_con)
y_pred_test_DT=clf.predict(X_test_con)

print(classification_report(y_train_le,y_pred_train_DT))
print(classification_report(y_test_le,y_pred_test_DT))

  
  
## For KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_con, y_train_le)

y_pred_train_knn=knn.predict(X_train_con)
y_pred_test_knn=knn.predict(X_test_con)

print(classification_report(y_train_le,y_pred_train_knn))
print(classification_report(y_test_le,y_pred_test_knn))
 
 

## For Naive Bayes Classifier
from sklearn.naive_bayes import GaussianNB
  
nbc = GaussianNB()
nbc.fit(X_train_con, y_train_le)

y_pred_train_nbc = nbc.predict(X_train_con)
y_pred_test_nbc  = nbc.predict(X_test_con)

print(classification_report(y_train_le,y_pred_train_nbc))
print(classification_report(y_test_le,y_pred_test_nbc))



## For Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier() 
rfc.fit(X_train_con, y_train_le)
y_pred_train_rfc = rfc.predict(X_train_con)
y_pred_test_rfc  = rfc.predict(X_test_con)

print(classification_report(y_train_le,y_pred_train_rfc))
print(classification_report(y_test_le,y_pred_test_rfc))
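GridSearchCV and SVC were imported in step 3 but not used above. Here is a sketch of tuning a Support Vector Classifier with cross-validated grid search; the parameter grid and scoring choice are illustrative assumptions, not fixed recommendations:

## For Support Vector Classifier with hyperparameter tuning
## (param_grid values below are illustrative; adjust to your data)
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
svc_grid = GridSearchCV(SVC(class_weight='balanced'), param_grid, cv=5, scoring='f1_weighted')
svc_grid.fit(X_train_con, y_train_le)

## Best hyperparameters found, and evaluation on the test set
print(svc_grid.best_params_)
y_pred_test_svc = svc_grid.best_estimator_.predict(X_test_con)
print(classification_report(y_test_le, y_pred_test_svc))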
