# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
# Load the Iris dataset from sklearn
iris_sklearn = load_iris()
# Convert to a pandas DataFrame
iris_df = pd.DataFrame(data=iris_sklearn.data, columns=iris_sklearn.feature_names)
# Add the target variable
iris_df['species'] = pd.Categorical.from_codes(iris_sklearn.target, iris_sklearn.target_names)
# Display the first 5 rows of the dataset
print(iris_df.head())
# Identify features and labels
features = iris_df.columns[:-1]
label = iris_df.columns[-1]
print(f"Features: {features}")
print(f"Label: {label}")
# Plot the distribution of each feature using histograms
iris_df[features].hist(figsize=(10, 8))
plt.suptitle('Feature Distributions')
plt.show()
# Visualize the dataset using a scatterplot matrix
sns.pairplot(iris_df, hue='species', markers=["o", "s", "D"])
plt.suptitle('Scatterplot Matrix')
plt.show()
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm) species
0                5.1               3.5                1.4               0.2  setosa
1                4.9               3.0                1.4               0.2  setosa
2                4.7               3.2                1.3               0.2  setosa
3                4.6               3.1                1.5               0.2  setosa
4                5.0               3.6                1.4               0.2  setosa
Features: Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
'petal width (cm)'],
dtype='object')
Label: species
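The three species are equally represented in this dataset; a quick check with value_counts (a small optional sketch, assuming iris_df from the cell above is still in scope) confirms the balance before any splitting or modelling is done.
# Verify the class balance of the label column
print(iris_df['species'].value_counts())
# Expected: 50 rows each for setosa, versicolor and virginica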
# Part 1: Statistical Features and Feature Selection
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# Load the Iris dataset
iris = load_iris()
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
# Step 2: Compute statistical features
statistical_summary = {
'Mean': iris_df.mean(),
'Median': iris_df.median(),
'Variance': iris_df.var(),
'Standard Deviation': iris_df.std(),
'Minimum': iris_df.min(),
'Maximum': iris_df.max()
}
# Convert to DataFrame for better visualization
statistical_summary_df = pd.DataFrame(statistical_summary)
print("Statistical Features:\n", statistical_summary_df)
# Step 3: Normalize the features
# Min-Max Scaling
min_max_scaler = MinMaxScaler()
iris_df_minmax = pd.DataFrame(min_max_scaler.fit_transform(iris_df), columns=iris.feature_names)
# Z-score Normalization
z_score_scaler = StandardScaler()
iris_df_zscore = pd.DataFrame(z_score_scaler.fit_transform(iris_df), columns=iris.feature_names)
# Display normalized data
print("\nMin-Max Normalized Data:\n", iris_df_minmax.head())
print("\nZ-score Normalized Data:\n", iris_df_zscore.head())
# Step 4: Discussing feature relevance
# Based on the statistical properties, we can discuss the relevance of features
# For example, we can look at the variance and mean values to determine which features are more spread out and have higher variability.
feature_relevance = {
'Feature': iris.feature_names,
'Mean': statistical_summary_df['Mean'],
'Variance': statistical_summary_df['Variance']
}
feature_relevance_df = pd.DataFrame(feature_relevance)
print("\nFeature Relevance based on Mean and Variance:\n", feature_relevance_df)
# Discussion on feature relevance
print("\nDiscussion on Feature Relevance:")
for index, row in feature_relevance_df.iterrows():
    print(f"{row['Feature']}: Mean = {row['Mean']:.2f}, Variance = {row['Variance']:.2f}")
    if row['Variance'] > 0.5:  # Arbitrary threshold for variance
        print(f" - This feature has a relatively high variance, indicating it may be useful for classification.")
    else:
        print(f" - This feature has a low variance, indicating it may not be very useful for classification.")
Statistical Features:
                        Mean  Median  Variance  Standard Deviation  Minimum  Maximum
sepal length (cm)   5.843333    5.80  0.685694            0.828066      4.3      7.9
sepal width (cm)    3.057333    3.00  0.189979            0.435866      2.0      4.4
petal length (cm)   3.758000    4.35  3.116278            1.765298      1.0      6.9
petal width (cm)    1.199333    1.30  0.581006            0.762238      0.1      2.5
Min-Max Normalized Data:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
0 0.222222 0.625000 0.067797 0.041667
1 0.166667 0.416667 0.067797 0.041667
2 0.111111 0.500000 0.050847 0.041667
3 0.083333 0.458333 0.084746 0.041667
4 0.194444 0.666667 0.067797 0.041667
Z-score Normalized Data:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
0 -0.900681 1.019004 -1.340227 -1.315444
1 -1.143017 -0.131979 -1.340227 -1.315444
2 -1.385353 0.328414 -1.397064 -1.315444
3 -1.506521 0.098217 -1.283389 -1.315444
4 -1.021849 1.249201 -1.340227 -1.315444
Feature Relevance based on Mean and Variance:
                                Feature      Mean  Variance
sepal length (cm)     sepal length (cm)  5.843333  0.685694
sepal width (cm)       sepal width (cm)  3.057333  0.189979
petal length (cm)     petal length (cm)  3.758000  3.116278
petal width (cm)       petal width (cm)  1.199333  0.581006
Discussion on Feature Relevance:
sepal length (cm): Mean = 5.84, Variance = 0.69
- This feature has a relatively high variance, indicating it may be useful for classification.
sepal width (cm): Mean = 3.06, Variance = 0.19
- This feature has a low variance, indicating it may not be very useful for classification.
petal length (cm): Mean = 3.76, Variance = 3.12
- This feature has a relatively high variance, indicating it may be useful for classification.
petal width (cm): Mean = 1.20, Variance = 0.58
- This feature has a relatively high variance, indicating it may be useful for classification.
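The variance threshold used above is arbitrary and ignores the class labels entirely. A common label-aware alternative is a one-way ANOVA F-test; the sketch below uses sklearn's f_classif on the same data (iris from the earlier cell) and typically ranks the two petal measurements highest.
# Rank features by how well they separate the three species (ANOVA F-test sketch)
from sklearn.feature_selection import f_classif

f_scores, p_values = f_classif(iris.data, iris.target)
ranking = pd.DataFrame({'Feature': iris.feature_names,
                        'F-score': f_scores,
                        'p-value': p_values}).sort_values('F-score', ascending=False)
print(ranking)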
# Part 2: Pattern Recognition
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
# Load the Iris dataset
iris = load_iris()
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
iris_df['species'] = iris.target
# Step 2: Data Splitting
# Split the dataset into training and testing sets (70% training, 30% testing)
X = iris_df[iris.feature_names]
y = iris_df['species']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Normalize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
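With only 150 samples, a plain random split can leave the classes slightly unbalanced between the two subsets. A stratified split is a common safeguard; the sketch below keeps every other parameter the same (the effect is small here because the Iris classes are perfectly balanced to begin with).
# Variant: stratified split preserves the 50/50/50 class ratio in train and test
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)
print(y_test_s.value_counts())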
# Step 3: Model Implementation
# K-Nearest Neighbors (KNN)
knn_params = {'n_neighbors': range(1, 21)}
knn_grid = GridSearchCV(KNeighborsClassifier(), knn_params, cv=5)
knn_grid.fit(X_train_scaled, y_train)
# Support Vector Machine (SVM)
svm_params = {'kernel': ['linear', 'rbf', 'poly'], 'C': [0.1, 1, 10]}
svm_grid = GridSearchCV(SVC(), svm_params, cv=5)
svm_grid.fit(X_train_scaled, y_train)
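After fitting, each GridSearchCV object exposes the selected hyperparameters and the corresponding cross-validated accuracy, which is worth recording alongside the test-set metrics below; a short sketch using the fitted objects above:
# Inspect the hyperparameters chosen by the 5-fold grid search
print("Best KNN params:", knn_grid.best_params_,
      "CV accuracy:", round(knn_grid.best_score_, 3))
print("Best SVM params:", svm_grid.best_params_,
      "CV accuracy:", round(svm_grid.best_score_, 3))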
# Step 4: Evaluation
# Evaluate KNN
knn_best = knn_grid.best_estimator_
y_pred_knn = knn_best.predict(X_test_scaled)
# Evaluate SVM
svm_best = svm_grid.best_estimator_
y_pred_svm = svm_best.predict(X_test_scaled)
# Calculate metrics
metrics = {
'Model': ['KNN', 'SVM'],
'Accuracy': [accuracy_score(y_test, y_pred_knn), accuracy_score(y_test, y_pred_svm)],
'Precision': [precision_score(y_test, y_pred_knn, average='weighted'), precision_score(y_test, y_pred_svm, average='weighted')],
'Recall': [recall_score(y_test, y_pred_knn, average='weighted'), recall_score(y_test, y_pred_svm, average='weighted')],
'F1 Score': [f1_score(y_test, y_pred_knn, average='weighted'), f1_score(y_test, y_pred_svm, average='weighted')]
}
metrics_df = pd.DataFrame(metrics)
print("\nEvaluation Metrics:\n", metrics_df)
# Step 5: Plot confusion matrices
def plot_confusion_matrix(y_true, y_pred, model_name):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=iris.target_names, yticklabels=iris.target_names)
    plt.title(f'Confusion Matrix for {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()
plot_confusion_matrix(y_test, y_pred_knn, 'KNN')
plot_confusion_matrix(y_test, y_pred_svm, 'SVM')
Evaluation Metrics:
Model Accuracy Precision Recall F1 Score
0 KNN 1.000000 1.000000 1.000000 1.000000
1 SVM 0.977778 0.979365 0.977778 0.977745
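A perfect KNN score on a 45-sample test set may simply reflect a favourable split. A sanity check is to cross-validate on the full dataset with the scaler fitted inside each fold; the sketch below wraps each tuned classifier in a Pipeline for that purpose (cross_val_score clones the estimators, so the fitted objects above are not modified).
# Sanity check: 5-fold CV on the full dataset, with scaling done inside each fold
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

for name, clf in [('KNN', knn_best), ('SVM', svm_best)]:
    pipe = make_pipeline(StandardScaler(), clf)
    scores = cross_val_score(pipe, X, y, cv=5)
    print(f"{name}: mean CV accuracy = {scores.mean():.3f} (+/- {scores.std():.3f})")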
# 4. Comparison and Conclusion
# Part 3: Model Comparison and Conclusion
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
# Load the Iris dataset
iris = load_iris()
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
iris_df['species'] = iris.target
# Step 2: Data Splitting
# Split the dataset into training and testing sets (70% training, 30% testing)
X = iris_df[iris.feature_names]
y = iris_df['species']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Normalize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Step 3: Model Implementation
# K-Nearest Neighbors (KNN)
knn_params = {'n_neighbors': range(1, 21)}
knn_grid = GridSearchCV(KNeighborsClassifier(), knn_params, cv=5)
knn_grid.fit(X_train_scaled, y_train)
# Support Vector Machine (SVM)
svm_params = {'kernel': ['linear', 'rbf', 'poly'], 'C': [0.1, 1, 10]}
svm_grid = GridSearchCV(SVC(), svm_params, cv=5)
svm_grid.fit(X_train_scaled, y_train)
# Step 4: Evaluation
# Evaluate KNN
knn_best = knn_grid.best_estimator_
y_pred_knn = knn_best.predict(X_test_scaled)
# Evaluate SVM
svm_best = svm_grid.best_estimator_
y_pred_svm = svm_best.predict(X_test_scaled)
# Calculate metrics
metrics = {
'Model': ['KNN', 'SVM'],
'Accuracy': [accuracy_score(y_test, y_pred_knn), accuracy_score(y_test, y_pred_svm)],
'Precision': [precision_score(y_test, y_pred_knn, average='weighted'), precision_score(y_test, y_pred_svm, average='weighted')],
'Recall': [recall_score(y_test, y_pred_knn, average='weighted'), recall_score(y_test, y_pred_svm, average='weighted')],
'F1 Score': [f1_score(y_test, y_pred_knn, average='weighted'), f1_score(y_test, y_pred_svm, average='weighted')]
}
metrics_df = pd.DataFrame(metrics)
print("\nEvaluation Metrics:\n", metrics_df)
# Step 5: Plotting the comparison
metrics_df.set_index('Model').plot(kind='bar', figsize=(10, 6))
plt.title('Model Comparison')
plt.ylabel('Score')
plt.xticks(rotation=0)
plt.ylim(0, 1)
plt.grid(axis='y')
plt.show()
# Step 6: Discussion
best_model = metrics_df.loc[metrics_df['Accuracy'].idxmax()]
print("\nBest Model:")
print(f"Model: {best_model['Model']}")
print(f"Accuracy: {best_model['Accuracy']:.2f}")
print(f"Precision: {best_model['Precision']:.2f}")
print(f"Recall: {best_model['Recall']:.2f}")
print(f"F1 Score: {best_model['F1 Score']:.2f}")
Evaluation Metrics:
Model Accuracy Precision Recall F1 Score
0 KNN 1.000000 1.000000 1.000000 1.000000
1 SVM 0.977778 0.979365 0.977778 0.977745
Best Model:
Model: KNN
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 Score: 1.00
# Deliverables:
# Part 1: Feature Extraction and Statistical Analysis
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# Load the Iris dataset
iris = load_iris()
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
# Step 2: Compute statistical features
statistical_summary = {
'Mean': iris_df.mean(),
'Median': iris_df.median(),
'Variance': iris_df.var(),
'Standard Deviation': iris_df.std(),
'Minimum': iris_df.min(),
'Maximum': iris_df.max()
}
# Convert to DataFrame for better visualization
statistical_summary_df = pd.DataFrame(statistical_summary)
#print("Statistical Features:\n", statistical_summary_df)
# Step 3: Normalize the features
# Min-Max Scaling
min_max_scaler = MinMaxScaler()
iris_df_minmax = pd.DataFrame(min_max_scaler.fit_transform(iris_df), columns=iris.feature_names)
# Z-score Normalization
z_score_scaler = StandardScaler()
iris_df_zscore = pd.DataFrame(z_score_scaler.fit_transform(iris_df), columns=iris.feature_names)
# Display normalized data
#print("\nMin-Max Normalized Data:\n", iris_df_minmax.head())