ASSIGNMENT
Question 1. Tips dataset
• Read the dataset “Tips.csv” as a dataframe “Data”.
• Extract the columns in the following sequence – Time, TotalBill, Tips.
• Plot a histogram for the variable ‘TotalBill’ to check which range has the highest
frequency.
• Draw a bar chart for the variable “Day”. Identify the category with the maximum
count.
• Demonstrate the data distributions using a box plot, scatter plot, histogram, and bar chart
on the iris dataset.
• Demonstrate the correlation plot on the iris dataset and perform exploratory visualization,
giving an overview of the relationships among the data with covariance analysis.
CODE:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Task 1: Read the "Tips.csv" dataset as a dataframe "Data"
Data = pd.read_csv("tips.csv")

# Task 2: Extract the columns in the sequence Time, TotalBill, Tips.
# The selection lives in its own frame so that "Data" keeps the 'day' column
# that the bar chart in Task 4 still needs.
# NOTE(review): assumes the CSV names the meal-time column 'time', matching the
# lowercase 'total_bill' / 'tip' / 'day' headers used below - confirm.
Extracted = Data[['time', 'total_bill', 'tip']]

# Task 3: Plot a histogram for the variable 'total_bill'
plt.figure(figsize=(8, 6))
# plt.hist returns the per-bin counts and the bin edges; keep them so the
# busiest range can be reported instead of being read off the picture.
counts, edges, _ = plt.hist(Data['total_bill'], bins=20, color='skyblue', edgecolor='black')
plt.title('Histogram of TotalBill')
plt.xlabel('TotalBill')
plt.ylabel('Frequency')
busiest = counts.argmax()
print(f"TotalBill range with the highest frequency: {edges[busiest]:.2f} - {edges[busiest + 1]:.2f}")
plt.show()

# Task 4: Draw a bar chart for the variable "day" and identify the category
# with the maximum count.
plt.figure(figsize=(8, 6))
sns.countplot(x='day', data=Data, hue='day', palette='viridis', legend=False)
plt.title('Bar Chart for Day')
plt.xlabel('Day')
plt.ylabel('Count')
day_counts = Data['day'].value_counts()
print(f"Day with the maximum count: {day_counts.idxmax()} ({day_counts.max()})")
plt.show()
# Task 5: Load the iris dataset from the local file
iris = pd.read_csv("iris.csv")
# The four measurement columns that several plots below share.
measurements = ['sepallength', 'sepalwidth', 'petallength', 'petalwidth']

# Task 6: Demonstrate data distributions using box, scatter plot, histogram, and bar chart
# Box plot
plt.figure(figsize=(12, 6))
sns.boxplot(data=iris[measurements])
plt.title('Box Plot of Iris Dataset')
plt.show()

# Scatter plot
# pairplot draws a grid of subplots on its own figure, so the heading must be
# a figure-level suptitle; plt.title would label only the single subplot that
# happens to be current.
sns.pairplot(iris)
plt.suptitle('Scatter Plot of Iris Dataset', y=1.02)
plt.show()

# Histogram
plt.figure(figsize=(8, 6))
sns.histplot(data=iris[measurements], kde=True)
plt.title('Histogram of Iris Dataset')
plt.show()

# Bar chart
plt.figure(figsize=(8, 6))
sns.countplot(x='Flowers', data=iris, hue='Flowers', palette='Set2', legend=False)
plt.title('Bar Chart of Flowers in Iris Dataset')
plt.xlabel('Flowers')
plt.ylabel('Count')
plt.show()

# Task 7: Demonstrate the correlation plot on the iris dataset
# Encode the 'Flowers' labels as integer category codes so the column can take
# part in corr()/cov(). The encoding is done on a copy: "iris" keeps the
# readable species labels for the pair plot legend below.
iris_encoded = iris.assign(Flowers=iris['Flowers'].astype('category').cat.codes)
plt.figure(figsize=(10, 8))
sns.heatmap(iris_encoded.corr(), annot=True, cmap='coolwarm', linewidths=.5)
plt.title('Correlation Plot of Iris Dataset')
plt.show()

# Task 8: Perform exploratory visualization with covariance analysis
# Pair plot with hue based on species (one marker per species; the three
# markers assume the file holds three distinct 'Flowers' values)
sns.pairplot(iris, hue='Flowers', palette='viridis', markers=["o", "s", "D"])
plt.suptitle('Pair Plot of Iris Dataset with Species Hue', y=1.02)
plt.show()

# Covariance matrix (measurements plus the encoded species column)
covariance_matrix = iris_encoded.cov()
# Print covariance matrix
print("Covariance Matrix:")
print(covariance_matrix)
OUTPUT:
Covariance Matrix:
sepallength sepalwidth petallength petalwidth Flowers
sepallength 0.685694 -0.039268 1.273682 0.516904 0.530872
sepalwidth -0.039268 0.188004 -0.321713 -0.117981 -0.148993
petallength 1.273682 -0.321713 3.113179 1.296387 1.371812
petalwidth 0.516904 -0.117981 1.296387 0.582414 0.597987
Flowers 0.530872 -0.148993 1.371812 0.597987 0.671141
Question 2. Split the Iris dataset into two datasets – IrisTest_TrainData.csv and
IrisTest_TestData.csv.
• Read them as two separate data frames named Train_Data and Test_Data
respectively.
• Answer the following questions:
➢ How many missing values are there in Train_Data?
➢ What is the proportion of Setosa types in the Test_Data?
➢ What is the accuracy score of the K-Nearest Neighbor model (model_1) with 2/3
neighbors using Train_Data and Test_Data?
➢ Identify the list of indices of misclassified samples from the ‘model_1’.
➢ Build a logistic regression model (model_2) keeping the modelling steps constant.
Find the accuracy of model_2.
CODE:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Load the Iris dataset
iris_data = pd.read_csv('iris.csv')  # Replace with the correct file path

# Split the dataset into training and testing sets (80/20, fixed seed so the
# split is reproducible) and save both parts to CSV files
train_part, test_part = train_test_split(iris_data, test_size=0.2, random_state=42)
train_part.to_csv('IrisTest_TrainData.csv', index=False)
test_part.to_csv('IrisTest_TestData.csv', index=False)

# Read the two files back as Train_Data and Test_Data, as the assignment asks.
# Because the files were written without the index, both frames get a fresh
# 0-based index, so any index reported below is a row position in the
# corresponding CSV file.
Train_Data = pd.read_csv('IrisTest_TrainData.csv')
Test_Data = pd.read_csv('IrisTest_TestData.csv')

# 1. How many missing values are there in Train_Data?
missing_values_train = Train_Data.isnull().sum().sum()
print(f"Number of missing values in Train_Data: {missing_values_train}")

# 2. What is the proportion of Setosa types in Test_Data?
setosa_proportion = Test_Data[Test_Data['Flowers'] == 'Iris-setosa'].shape[0] / Test_Data.shape[0]
print(f"Proportion of Setosa types in Test_Data: {setosa_proportion}")

# 3. Train the K-Nearest Neighbor model (model_1) with 2/3 neighbors and calculate accuracy
features = ['sepallength', 'sepalwidth', 'petallength', 'petalwidth']
model_1 = KNeighborsClassifier(n_neighbors=2)  # You can adjust the number of neighbors
model_1.fit(Train_Data[features], Train_Data['Flowers'])
predictions_model_1 = model_1.predict(Test_Data[features])
accuracy_model_1 = accuracy_score(Test_Data['Flowers'], predictions_model_1)
print(f"Accuracy score of model_1: {accuracy_model_1}")

# 4. Identify the list of indices of misclassified samples from 'model_1'.
# The element-wise comparison of true labels and predictions gives a boolean
# mask that selects the index labels of the wrongly predicted rows.
misclassified_indices = Test_Data.index[Test_Data['Flowers'] != predictions_model_1].tolist()
print(f"Misclassified sample indices from model_1: {misclassified_indices}")

# 5. Train the Logistic Regression model (model_2) on the same features and
# split, and find its accuracy
model_2 = LogisticRegression()
model_2.fit(Train_Data[features], Train_Data['Flowers'])
predictions_model_2 = model_2.predict(Test_Data[features])
accuracy_model_2 = accuracy_score(Test_Data['Flowers'], predictions_model_2)
print(f"Accuracy score of model_2: {accuracy_model_2}")
OUTPUT:
Number of missing values in Train_Data: 0
Proportion of Setosa types in Test_Data: 0.3333333333333333
Accuracy score of model_1: 1.0
Misclassified sample indices from model_1: []
Accuracy score of model_2: 1.0
Question 3: Import the dataset from http://www.ats.ucla.edu/stat/data/binary.csv. Use
logistic regression to find the relationship between the variables that affect the admission of a
student to an institute: his or her GRE score, GPA obtained, and the rank of the student.
Also check whether the model fits well, and apply regression model techniques to predict on the
above dataset.
CODE:
import pandas as pd
import statsmodels.api as sm
# Read the admissions data from its location on the local machine
file_path = 'c:/users/zoya/PycharmProjects/pythonProject/assignques3/binary.csv'
data = pd.read_csv(file_path)

# Preview the first rows as a quick sanity check of the columns
print(data.head())

# Column of ones that acts as the intercept term of the model
data['const'] = 1

# Predictors (GRE, GPA, rank, intercept) and the binary target (admit)
predictor_columns = ['gre', 'gpa', 'rank', 'const']
X = data.loc[:, predictor_columns]
y = data.loc[:, 'admit']

# Fit the logistic regression and report the full summary table
model = sm.Logit(y, X)
result = model.fit()
print(result.summary())

# Goodness of fit: the pseudo R-squared of the fitted model
print("Model Fit:", result.prsquared, sep="\n")

# Predicted admission probabilities for the rows the model was fitted on
predictions = result.predict(X)
print("Predicted Probabilities:", predictions, sep="\n")
OUTPUT:
admit gre gpa rank
0 0 380 3.61 3
1 1 660 3.67 3
2 1 800 4.00 1
3 1 640 3.19 4
4 0 520 2.93 4
Optimization terminated successfully.
Current function value: 0.574302
Iterations 6
Logit Regression Results
==================================================================
============
Dep. Variable: admit No. Observations: 400
Model: Logit Df Residuals: 396
Method: MLE Df Model: 3
Date: Fri, 15 Dec 2023 Pseudo R-squ.: 0.08107
Time: 11:21:30 Log-Likelihood: -229.72
converged: True LL-Null: -249.99
Covariance Type: nonrobust LLR p-value: 8.207e-09
==================================================================
============
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
gre 0.0023 0.001 2.101 0.036 0.000 0.004
gpa 0.7770 0.327 2.373 0.018 0.135 1.419
rank -0.5600 0.127 -4.405 0.000 -0.809 -0.311
const -3.4495 1.133 -3.045 0.002 -5.670 -1.229
==================================================================
============
Model Fit:
0.08107331276954477
Predicted Probabilities:
0 0.189553
1 0.317781
2 0.717814
3 0.148949
4 0.097954
...
395 0.490176
396 0.184989
397 0.186814
398 0.468108
399 0.325045
Length: 400, dtype: float64
Question 4: Demonstrate a Decision Tree classification model and evaluate the
performance of the classifier on the Iris dataset.
CODE:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
from sklearn import tree
# Load the dataset from the CSV file
iris_df = pd.read_csv('iris.csv')

# Separate features (X) and target variable (y)
X = iris_df.drop('Flowers', axis=1)
y = iris_df['Flowers']

# Split the dataset into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a decision tree classifier and train it on the training set
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the performance of the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Display classification report and confusion matrix
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Visualize the decision tree.
# class_names must follow the classifier's own (ascending) class order, which
# is what clf.classes_ holds; Series.unique() returns order of appearance in
# the file and could mislabel the leaves. Plain lists are passed because some
# scikit-learn releases reject a pandas Index / NumPy array for these arguments.
plt.figure(figsize=(12, 8))
tree.plot_tree(
    clf,
    feature_names=list(X.columns),
    class_names=[str(label) for label in clf.classes_],
    filled=True,
)
plt.show()
OUTPUT:
Accuracy: 1.00
Classification Report:
precision recall f1-score support
Iris-setosa 1.00 1.00 1.00 10
Iris-versicolor 1.00 1.00 1.00 9
Iris-virginica 1.00 1.00 1.00 11
accuracy 1.00 30
macro avg 1.00 1.00 1.00 30
weighted avg 1.00 1.00 1.00 30
Confusion Matrix:
[[10 0 0]
[ 0 9 0]
[ 0 0 11]]
Question 5: Demonstrate any one clustering model and evaluate its performance on the
Iris dataset.
CODE:
# Import necessary libraries
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.metrics import adjusted_rand_score
# Load Iris dataset (feature matrix and the true species labels)
iris = load_iris()
X = iris.data
y = iris.target

# Apply K-Means clustering with one cluster per species.
# n_init is set explicitly: its default differs between scikit-learn releases
# (10 restarts in older ones, 'auto' in newer ones), so pinning it keeps the
# clustering reproducible across versions and avoids the FutureWarning.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X)

# Visualize the clusters using PCA for dimensionality reduction: project the
# features onto two principal components so they can be drawn in 2-D
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Create a scatter plot of the clusters, coloured by assigned cluster
plt.figure(figsize=(10, 6))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=clusters, palette='viridis', s=100, alpha=0.8)
plt.title('K-Means Clustering on Iris Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()

# Compare the clusters with the actual labels; the Adjusted Rand Index does
# not depend on how the cluster ids are numbered
ari_score = adjusted_rand_score(y, clusters)
print(f"Adjusted Rand Index (ARI): {ari_score:.2f}")
OUTPUT: