In this project, I predict the class of sampled flowers from their attributes. The flowers under consideration are irises. The dataset covers three varieties of iris (versicolor, setosa, and virginica) and four attributes: sepal length, sepal width, petal length, and petal width. I employ two classification algorithms for the task: (i) multinomial logistic regression and (ii) linear support vector machines (SVC). Information about both estimators/classifiers can be found in the documentation of scikit-learn (sklearn), a standard Python machine-learning library.
import pandas as pd
import sklearn as skl
# Loading the raw data. header=None keeps the first record as data instead of
# letting pandas consume it as column labels; the rename step below supplies all
# five column names, which indicates the file has no header row of its own.
# NOTE(review): this matches iris.data as distributed by the UCI repository --
# confirm if a different copy of the file (one with a header line) is used.
d = pd.read_csv('iris.data', header=None)
# Showing the last three plants in the dataset
d.tail(3)
# Renaming the columns of the dataset
d.columns = ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width', 'Class']
d.tail(3)
# Generating an overall description/summary of the dataset
d.describe()
# Identifying the levels of the response variable since it's not shown in the summary above
d.groupby(['Class']).describe()
# Creating a new column in the dataset
d['New Class'] = d['Class']
d.tail(3)
# Replacing the values in the new column with 0 if Iris-setosa, 1 if Iris-versicolor, and
# 2 if Iris-virginica. By converting the response variable to numeric form, just like the
# explanatory variables, this step makes it possible to apply the multinomial logistic
# regression.
# map(...).astype(int) builds an integer column explicitly instead of relying on
# DataFrame.replace to downcast an object column (deprecated in recent pandas).
# A label missing from the mapping becomes NaN and makes astype(int) raise, so
# an unexpected class name is reported here rather than during model fitting.
# `newd` is a new DataFrame; `d` keeps the original string labels in 'New Class'.
class_codes = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
newd = d.assign(**{'New Class': d['New Class'].map(class_codes).astype(int)})
newd.tail(3)
# Creating X, an explanatory variable matrix, and Y, a response variable vector
X = newd.drop(columns=['Class', 'New Class'])
Y = newd['New Class']
# Splitting the dataset into training data and testing data.
# random_state fixes the shuffle so the scores and reports below are the same on
# every run; stratify=Y keeps the class proportions of the full dataset in both
# the training and the test portion.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)
# Creating a model object. This allows for various manipulations of the model, such as
# parameter manipulation.
from sklearn.linear_model import LogisticRegression
mod1 = LogisticRegression()
# Checking the parameters of the model, which I leave as is.
mod1.get_params()
# Fitting the model to the training dataset
mod1.fit(X_train, Y_train)
# Predicting the (numeric) class of every flower in the test dataset
Predicted_Class = mod1.predict(X_test)
Predicted_Class
# Checking how well the model has been fitted to the train dataset (mean accuracy)
mod1.score(X_train, Y_train)
# Checking how well the model predicts classes in the test dataset (mean accuracy)
mod1.score(X_test, Y_test)
# Checking model performance using a classification report
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(classification_report(Y_test, Predicted_Class))
# Checking model performance using a confusion matrix
print(confusion_matrix(Y_test, Predicted_Class))
# Checking model performance using accuracy score
print(accuracy_score(Y_test, Predicted_Class))
# Saving the model for future use (the with-block closes the file handle)
# import pickle
# with open('Logistic_Regression.pkl', 'wb') as model_file:
#     pickle.dump(mod1, model_file)
# Loading the model for future use. Only unpickle files you created yourself:
# pickle.load can execute arbitrary code from an untrusted file.
# with open('Logistic_Regression.pkl', 'rb') as model_file:
#     loaded_model = pickle.load(model_file)
# loaded_model.score(X_test, Y_test)
# Duplicating the dataset so that I don't overwrite the old one.
# .copy() is required for that: a plain assignment would only create a second
# name for the same DataFrame, so changes through newd1 would also alter newd.
newd1 = newd.copy()
# Creating X, an explanatory variable matrix, and Y, a response variable vector.
# Here the original string labels in 'Class' are used as the response.
X = newd1.drop(columns=['Class', 'New Class'])
Y = newd1['Class']
# Splitting the dataset into training data and testing data.
# random_state fixes the shuffle so the results below are reproducible;
# stratify=Y keeps the class proportions equal in the training and test portions.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)
# Creating a model object.
# svm.SVC() on its own uses an RBF kernel; kernel='linear' is set explicitly so
# the model is the linear support vector classifier this project describes.
from sklearn import svm
mod2 = svm.SVC(kernel='linear')
# Checking the parameters of the model; apart from the kernel they are left at
# their defaults.
mod2.get_params()
# Fitting the model to the training dataset
mod2.fit(X_train, Y_train)
# Predicting the class label of every flower in the test dataset
Predicted_Class = mod2.predict(X_test)
Predicted_Class
# Checking how well the model learned patterns in the training dataset
mod2.score(X_train, Y_train)
# Checking how well the model generalizes to the unseen test data
mod2.score(X_test, Y_test)
# Checking model performance using a classification report
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(classification_report(Y_test, Predicted_Class))
# Checking model performance using a confusion matrix
print(confusion_matrix(Y_test, Predicted_Class))
# Checking model performance using accuracy score
print(accuracy_score(Y_test, Predicted_Class))
# Saving the model for future use (the with-block closes the file handle)
# import pickle
# with open('linearsvc.pkl', 'wb') as model_file:
#     pickle.dump(mod2, model_file)
# Loading the model for future use. Only unpickle files you created yourself:
# pickle.load can execute arbitrary code from an untrusted file.
# with open('linearsvc.pkl', 'rb') as model_file:
#     loaded_model = pickle.load(model_file)
# loaded_model.score(X_test, Y_test)