
Full Code for Building the Machine Learning Models

1. Write a problem statement

2. Describe the problem

3. Import the required libraries
## Import the necessary modules
## To read and manipulate the data/dataframe
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## For Data Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, LabelEncoder

## For Modelling
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC, SVR

## Evaluation metrics
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, recall_score


4. Read the dataset 
## Read the dataset
data = pd.read_csv("Filename", na_values="?")  ## replace "Filename" with the path to your data file

5. Check the dimensions of the data like rows and columns
## Check the dimensions
print('Dataset has ' + str(data.shape[0]) + ' rows and ' + str(data.shape[1]) + ' columns')

6. Check the datatype of each variable
## Check the datatype of each variable
data.dtypes

7. Exploratory Data Analysis
## Check the top 5 observations
data.head()
## Check the last 5 observations
data.tail()

## Check the basic summary statistics of the data
data.describe()

## Check the number of unique levels in each attribute.
data.nunique()

8. Target attribute distribution


## Check for value counts in target variable
data.target.value_counts()

## Check for distribution of values in target variable.
data.target.value_counts(normalize=True)*100

## Countplot for the target variable.
sns.countplot(x='Disposition', data=data)

## Box plot of a numerical column against a categorical column.
sns.boxplot(x='Claim Type', y='Claim Amount', data=data)

## Countplot for a meaningful categorical variable.
sns.countplot(x='City', data=data)

9. Data Pre-Processing

## Drop the columns which are not significant
data.drop(["ID"], axis = 1, inplace=True)

## Check the shape of the data
data.shape

## Identify the categorical columns and store them in cat_cols, and the numerical columns in num_cols
## (fill these in manually, or infer them from the dtypes as sketched below)
num_cols = []
cat_cols = []
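A minimal sketch for inferring the two lists from the column dtypes, assuming the object-typed columns are the categorical ones (verify against your data dictionary, and remove the target column from whichever list it lands in):

## Infer numerical and categorical columns from the dtypes
num_cols = data.select_dtypes(include=np.number).columns.tolist()
cat_cols = data.select_dtypes(include=['object', 'category']).columns.tolist()

## Drop the target from the feature lists if it was picked up
num_cols = [c for c in num_cols if c != 'target']
cat_cols = [c for c in cat_cols if c != 'target']
print(num_cols, cat_cols)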

## Convert all the categorical columns to the appropriate data type
data[cat_cols] = data[cat_cols].astype('category')

## Check the data types
data.dtypes

## Split the data into X and y
X=data.drop(['target'],axis=1)
y=data['target']

## Print the shape of X and y
print(X.shape,y.shape)
 
## Split the data into X_train, X_test, y_train, y_test with test_size = 0.20
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 123,stratify=y)

## Shape of all X_train, y_train, X_test, y_test
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

10. Target attribute distribution in train and test

## Check for distribution of target in y_train
print(y_train.value_counts())
print(y_train.value_counts(normalize=True)*100)
## Check for distribution of target values in y_test
print(y_test.value_counts(normalize=True)*100)

11. Handling Missing Data

## Check the null values in train
X_train.isna().sum()
## Check the null values in test
X_test.isna().sum()
12. Missing Value Imputation
## Impute the Categorical columns with "mode"
si_cat = SimpleImputer(strategy='most_frequent')
si_cat.fit(X_train[cat_cols])

X_train_cat = pd.DataFrame(si_cat.transform(X_train[cat_cols]), columns=cat_cols)
X_test_cat = pd.DataFrame(si_cat.transform(X_test[cat_cols]), columns=cat_cols)
## Impute the Numerical columns with "median"
si_num = SimpleImputer(strategy='median')
si_num.fit(X_train[num_cols])

X_train_num = pd.DataFrame(si_num.transform(X_train[num_cols]), columns=num_cols)
X_test_num = pd.DataFrame(si_num.transform(X_test[num_cols]), columns=num_cols)


13. Standardization for Numerical Columns

## Fit the scaler on the imputed training data only
ss = StandardScaler()
ss.fit(X_train_num)
X_train_std = ss.transform(X_train_num)
X_test_std = ss.transform(X_test_num)

14. OneHotEncoding for Categorical Columns

## handle_unknown='ignore' avoids errors on categories unseen during fit
ohe = OneHotEncoder(handle_unknown='ignore')
ohe.fit(X_train_cat)

X_train_ohe = ohe.transform(X_train_cat).toarray()
X_test_ohe = ohe.transform(X_test_cat).toarray()

print(X_train_ohe.shape)
print(X_test_ohe.shape)

15. Label Encoding
  
## LABEL ENCODING FOR TARGET COLUMN
le=LabelEncoder()
le.fit(y_train) 
y_train_le=le.transform(y_train)
y_test_le=le.transform(y_test)
y_train_le
y_test_le
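If the original class labels are needed later (for example, when reporting predictions), the fitted LabelEncoder can invert the mapping; a small sketch:

## The learned mapping: encoded value i corresponds to le.classes_[i]
print(le.classes_)

## Recover the original labels from the encoded ones
print(le.inverse_transform(y_test_le))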

16. Concatenation
## Concatenate the standardized numerical features with the one-hot encoded categorical features
X_train_con = np.concatenate([X_train_std, X_train_ohe], axis=1)
X_test_con = np.concatenate([X_test_std, X_test_ohe], axis=1)

print(X_train_con.shape)
print(X_test_con.shape)

17. Model Building
## For Logistic Regression
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(class_weight='balanced')
log_reg.fit(X_train_con,y_train_le)
 
y_pred_test_log = log_reg.predict(X_test_con)
y_pred_train_log= log_reg.predict(X_train_con)

cm = confusion_matrix(y_train_le, y_pred_train_log)
print(cm)
print(classification_report(y_train_le,y_pred_train_log))
print(classification_report(y_test_le,y_pred_test_log))
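The individual metrics imported in step 3 (accuracy_score, recall_score, f1_score) can also be reported; a quick sketch on the logistic regression test predictions (for multi-class targets, an explicit average such as 'weighted' is needed):

## Report individual evaluation metrics on the test set
print('Test accuracy:', accuracy_score(y_test_le, y_pred_test_log))
print('Test recall  :', recall_score(y_test_le, y_pred_test_log, average='weighted'))
print('Test F1-score:', f1_score(y_test_le, y_pred_test_log, average='weighted'))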

  
  
## For DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(max_depth =3, random_state = 123)
clf.fit(X_train_con, y_train_le)

y_pred_train_DT=clf.predict(X_train_con)
y_pred_test_DT=clf.predict(X_test_con)

print(classification_report(y_train_le,y_pred_train_DT))
print(classification_report(y_test_le,y_pred_test_DT))

  
  
## For KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_con, y_train_le)

y_pred_train_knn=knn.predict(X_train_con)
y_pred_test_knn=knn.predict(X_test_con)

print(classification_report(y_train_le,y_pred_train_knn))
print(classification_report(y_test_le,y_pred_test_knn))
 
 

## For Naive Bayes Classifier
from sklearn.naive_bayes import GaussianNB
  
nbc = GaussianNB()
nbc.fit(X_train_con, y_train_le)

y_pred_train_nbc = nbc.predict(X_train_con)
y_pred_test_nbc  = nbc.predict(X_test_con)

print(classification_report(y_train_le,y_pred_train_nbc))
print(classification_report(y_test_le,y_pred_test_nbc))



## For Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier() 
rfc.fit(X_train_con, y_train_le)
y_pred_train_rfc = rfc.predict(X_train_con)
y_pred_test_rfc  = rfc.predict(X_test_con)

print(classification_report(y_train_le,y_pred_train_rfc))
print(classification_report(y_test_le,y_pred_test_rfc))
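GridSearchCV and SVC were imported in step 3 but not used above. Here is a sketch of tuning a Support Vector Classifier with cross-validated grid search; the parameter grid and scoring choice are illustrative assumptions, not fixed recommendations:

## For Support Vector Classifier with hyperparameter tuning
## (param_grid values below are illustrative; adjust to your data)
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
svc_grid = GridSearchCV(SVC(class_weight='balanced'), param_grid, cv=5, scoring='f1_weighted')
svc_grid.fit(X_train_con, y_train_le)

## Best hyperparameters found, and evaluation on the test set
print(svc_grid.best_params_)
y_pred_test_svc = svc_grid.best_estimator_.predict(X_test_con)
print(classification_report(y_test_le, y_pred_test_svc))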
