Code for building the machine learning models:
Full Code of All Data
1. Write a problem statement
2. Problem Description
3. Import the required libraries
## Import the necessary modules
## To read and manipulate the data/dataframe
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
## For Data Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, LabelEncoder
## For Modelling
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC, SVR
## Evaluation metrics
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score, f1_score,recall_score
4. Read the dataset
## Read the dataset into a DataFrame.
## "?" entries in the file are parsed as missing values (NaN).
## NOTE(review): "Filename" looks like a placeholder — replace with the real CSV path.
data=pd.read_csv("Filename",na_values="?")
5. Check the dimensions of the data like rows and columns
## Check the dimensions: shape[0] is the row count, shape[1] the column count
print(f'Dataset has {data.shape[0]} rows, and {data.shape[1]} columns')
6. Check the datatype of each variable
## Check the datatype pandas assigned to each column
data.dtypes
7. Exploratory Data Analysis
## Check the top 5 observations
data.head()
## Check the last 5 observations
data.tail()
## Check the basic summary statistics of the data
data.describe()
## Check the number of unique levels in each attribute
## (the DataFrame method is spelled `nunique`)
data.nunique()
8. Target attribute distribution
## Check for value counts in target variable
data.target.value_counts()
## Check for distribution of values in target variable (percentage per class)
data.target.value_counts(normalize=True)*100
## NOTE(review): 'Disposition', 'Claim Type', 'Claim Amount' and 'City' look like
## example column names from one specific dataset — replace with your own columns.
## The plots read from `data`, the only DataFrame this script defines, and pass
## the column by keyword (`x=`) as current seaborn releases require.
## Countplot for target variable.
sns.countplot(x='Disposition', data=data)
## Box plot for Numerical Columns.
sns.boxplot(x='Claim Type', y='Claim Amount', data=data)
## Countplot for a meaningful categorical variable.
sns.countplot(x='City', data=data)
9. Data Pre-Processing
## Drop the columns which are not significant
data.drop(["ID"], axis = 1, inplace=True)
## Check the shape of the data
data.shape
## Identify the Categorical Columns and store them in a variable cat_cols and numerical into num_cols
## NOTE(review): both lists are left empty here and must be filled in by hand;
## presumably the target column should stay out of both, since they are later
## used to index the feature frames — confirm.
num_cols=[]
cat_cols=[]
## Convert all the categorical columns to appropriate data type
data[cat_cols]=data[cat_cols].astype('category')
## Check the data types
data.dtypes
## Separate the features (X) from the target column (y)
y = data['target']
X = data.drop(columns=['target'])
## Print the shape of X and y
print(X.shape, y.shape)
## Hold out 20% of the rows as a test set; stratify on y so both splits keep
## the class proportions, and fix the seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=123,
    test_size=0.2,
)
## Shapes of X_train, X_test, y_train, y_test (one per line, in that order)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')
10. Target attribute distribution in the train and test splits
## Class counts in y_train, followed by the same distribution in percent
print(y_train.value_counts(), y_train.value_counts(normalize=True).mul(100), sep='\n')
## Class distribution (percent) in y_test
print(y_test.value_counts(normalize=True).mul(100))
11. Handling Missing Data
## Count the null (NaN) values per column in the training features
X_train.isna().sum()
## Count the null (NaN) values per column in the test features
X_test.isna().sum()
12. Missing Value Imputation
## Impute the Categorical columns with "mode"
## (fit on the training split only, then apply the learned values to both splits)
si_cat = SimpleImputer(strategy='most_frequent')
si_cat.fit(X_train[cat_cols])
## Keep the original row index so the imputed frames stay aligned with y_train / y_test
X_train_cat = pd.DataFrame(
    si_cat.transform(X_train[cat_cols]), columns=cat_cols, index=X_train.index
)
X_test_cat = pd.DataFrame(
    si_cat.transform(X_test[cat_cols]), columns=cat_cols, index=X_test.index
)
## Impute the Numerical columns with "median"
si_num = SimpleImputer(strategy='median')
si_num.fit(X_train[num_cols])
X_train_num = pd.DataFrame(
    si_num.transform(X_train[num_cols]), columns=num_cols, index=X_train.index
)
X_test_num = pd.DataFrame(
    si_num.transform(X_test[num_cols]), columns=num_cols, index=X_test.index
)
13. Standardization for Numerical Columns
## Standardize the numerical columns: fit on the training split only,
## then apply the same scaling to both splits.
ss = StandardScaler()
## Scale the imputed frames (X_train_num / X_test_num): StandardScaler passes
## NaNs through transform, so scaling the raw columns would hand missing values
## to the models.
ss.fit(X_train_num)
X_train_std=ss.transform(X_train_num)
X_test_std=ss.transform(X_test_num)
14. OneHotEncoding for Categorical Columns
## One-hot encode the categorical columns: fit on the training split only,
## then apply the same encoding to both splits.
## handle_unknown='ignore' encodes a category that appears only in the test
## split as all zeros instead of raising at transform time.
ohe = OneHotEncoder(handle_unknown='ignore')
## Encode the imputed categorical frames (X_train_cat / X_test_cat) so no
## missing values reach the encoder.
ohe.fit(X_train_cat)
## transform returns a sparse matrix; densify it for the later np.concatenate
X_train_ohe = ohe.transform(X_train_cat).toarray()
X_test_ohe = ohe.transform(X_test_cat).toarray()
print(X_train_ohe.shape)
print(X_test_ohe.shape)
15. Label Encoding
## Label encoding for the target column: the encoder learns its classes from
## y_train and the same mapping is then applied to y_test
le = LabelEncoder()
y_train_le = le.fit_transform(y_train)
y_test_le = le.transform(y_test)
## Display the encoded targets
y_train_le
y_test_le
16. Concatenation
## Stack the standardized numerical block and the one-hot block side by side
## (column-wise) to form the final feature matrix for each split
X_train_con = np.hstack((X_train_std, X_train_ohe))
X_test_con = np.hstack((X_test_std, X_test_ohe))
## Shapes of the combined train and test matrices, one per line
print(X_train_con.shape, X_test_con.shape, sep='\n')
17. Model Building
## Logistic Regression, with class weights set to 'balanced'
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(class_weight='balanced').fit(X_train_con, y_train_le)
## Predictions on the test and training splits
y_pred_test_log = log_reg.predict(X_test_con)
y_pred_train_log = log_reg.predict(X_train_con)
## Confusion matrix on the training split
cm = confusion_matrix(y_train_le, y_pred_train_log)
cm
## Classification reports: training split first, then test split
print(
    classification_report(y_train_le, y_pred_train_log),
    classification_report(y_test_le, y_pred_test_log),
    sep='\n',
)
## Decision tree limited to depth 3, seeded for repeatable results
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=123, max_depth=3).fit(X_train_con, y_train_le)
## Predictions on the training and test splits
y_pred_train_DT = clf.predict(X_train_con)
y_pred_test_DT = clf.predict(X_test_con)
## Classification reports: training split first, then test split
print(
    classification_report(y_train_le, y_pred_train_DT),
    classification_report(y_test_le, y_pred_test_DT),
    sep='\n',
)
## K-Nearest Neighbors classifier using 5 neighbours
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_con, y_train_le)
## Predictions on the training and test splits
y_pred_train_knn = knn.predict(X_train_con)
y_pred_test_knn = knn.predict(X_test_con)
## Classification reports: training split first, then test split
print(
    classification_report(y_train_le, y_pred_train_knn),
    classification_report(y_test_le, y_pred_test_knn),
    sep='\n',
)
## Gaussian Naive Bayes classifier with default settings
from sklearn.naive_bayes import GaussianNB
nbc = GaussianNB().fit(X_train_con, y_train_le)
## Predictions on the training and test splits
y_pred_train_nbc = nbc.predict(X_train_con)
y_pred_test_nbc = nbc.predict(X_test_con)
## Classification reports: training split first, then test split
print(
    classification_report(y_train_le, y_pred_train_nbc),
    classification_report(y_test_le, y_pred_test_nbc),
    sep='\n',
)
## For Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
## Seed the forest (same seed as the train/test split and the decision tree)
## so the reported metrics are reproducible from run to run.
rfc = RandomForestClassifier(random_state=123)
rfc.fit(X_train_con, y_train_le)
## Predictions on the training and test splits
y_pred_train_rfc = rfc.predict(X_train_con)
y_pred_test_rfc = rfc.predict(X_test_con)
## Classification reports: training split first, then test split
print(classification_report(y_train_le,y_pred_train_rfc))
print(classification_report(y_test_le,y_pred_test_rfc))
Comments
Post a Comment