# --- Program 1: FIND-S Algorithm ---
# Filename: find_s.py
"""
Implements the FIND-S algorithm to find the most specific hypothesis
that fits the given training data.
"""
import csv
print("Data in csv file is:")
data = []
with open("sedData.csv", "r") as fp:  # Corrected file extension
    read = csv.reader(fp)
    for r in read:
        data.append(r)
        print(r)
print("\n")
n = len(data[0]) - 1
print(" No. of attributes are:", n)
print("Initial Hypothesis: ")
hypothesis = ['0'] * n
print(hypothesis)
for i in range(0, len(data)):
    if data[i][n] == 'yes':
        for j in range(0, n):
            hypothesis[j] = data[i][j]
        break
print(hypothesis)
print("\n")
print("After every iteration: ")
for i in range(0, len(data)):
    if data[i][n] == 'yes':
        for j in range(0, n):
            if hypothesis[j] != data[i][j]:
                hypothesis[j] = '?'
        print(hypothesis)
print("\n")
print("Final Hypothesis:")
print(hypothesis)
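# The contents of sedData.csv are not listed in this manual. As a sketch only,
# the program assumes attribute columns followed by a yes/no target in the
# last column, e.g. EnjoySport-style rows such as:
#   sunny,warm,normal,strong,warm,same,yes
#   sunny,warm,high,strong,warm,same,yes
#   rainy,cold,high,strong,warm,change,no
#   sunny,warm,high,strong,cool,change,yes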
# --- Program 2: Candidate Elimination Algorithm ---
# Filename: candidate_elimination.py
"""
Implements the Candidate Elimination Algorithm to find the most general
and most specific hypotheses that are consistent with the training data.
"""
import numpy as np
import pandas as pd
data = pd.read_csv('ML2.CSV') # Corrected CSV name
concepts = np.array(data)[:, :-1]
print("Instances are:\n", concepts)
target = np.array(data)[:, -1]
def learn(concepts, target):
    specific_h = concepts[0].copy()
    print("\nInitialization of specific_h & general_h")
    print("Specific boundary:\n", specific_h)
    general_h = [["?" for _ in range(len(specific_h))] for _ in range(len(specific_h))]
    print("General boundary:\n", general_h)
    for i, h in enumerate(concepts):
        print("\nInstance", i + 1, "is", h)
        if target[i] == "yes":
            print("Instance is positive")
            for x in range(len(specific_h)):
                if h[x] != specific_h[x]:
                    specific_h[x] = '?'
                    general_h[x][x] = '?'
        elif target[i] == "no":
            print("Instance is negative")
            for x in range(len(specific_h)):
                if h[x] != specific_h[x]:
                    general_h[x][x] = specific_h[x]
                else:
                    general_h[x][x] = '?'
        print("Specific boundary after instance", i + 1, ":\n", specific_h)
        print("General boundary after instance", i + 1, ":\n", general_h)
        print("\n")
    # Remove fully general rows (all '?'), whatever the number of attributes
    unrestricted = ['?'] * len(specific_h)
    general_h = [row for row in general_h if row != unrestricted]
    return specific_h, general_h
s_final, g_final = learn(concepts, target)
print("Final specific-h:\n", s_final)
print("Final general_h:\n", g_final)
# --- Program 3: Decision Tree ---
# Filename: decision_tree.py
"""
Implements a simple decision tree using scikit-learn.
"""
from sklearn.tree import DecisionTreeClassifier
import numpy as np
X = np.array([[1, 1, 1],
              [1, 0, 1],
              [0, 1, 0],
              [0, 0, 1],
              [1, 1, 0]])
y = np.array([1, 1, 0, 0, 1])
tree = DecisionTreeClassifier(criterion='entropy') # Using criterion entropy
tree.fit(X, y)
new_sample = np.array([[1, 0, 1]])
predicted_class = tree.predict(new_sample)
print("Predicted class:", predicted_class[0])
# --- Program 4: Linear Regression ---
# Filename: linear_regression.py
"""
Performs linear regression on a given dataset using scikit-learn.
"""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import numpy as np
data = pd.read_csv("lab3.csv") # Changed filename to lab3.csv
print(data)
diameter = data['diameter'].values.reshape(-1, 1)
price = data['price'].values
model = LinearRegression()
model.fit(diameter, price)
intercept = model.intercept_
slope = model.coef_[0]
print("Intercept:", intercept)
print("Slope:", slope)
predictions = model.predict(diameter)
mse = mean_squared_error(price, predictions)
print("Mean Squared Error:", mse)
plt.scatter(diameter, price, color='blue')
plt.plot(diameter, predictions, color='red')
plt.xlabel('Diameter')
plt.ylabel('Price')
plt.title("Linear Regression")
plt.show()
new_diameter = np.array([20]).reshape(-1, 1) # Corrected: Reshape the input
future_price = model.predict(new_diameter)
print("Predicted price for a diameter of 20:", future_price[0])
# --- Program 5: Logistic Regression ---
# Filename: logistic_regression.py
"""
Performs logistic regression on a given dataset using scikit-learn.
"""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
data = pd.read_csv('labs.csv')
print(data)
X = data[['studyhours']]
y = data['examresult']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LogisticRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)
new_data = pd.DataFrame({'studyhours': [1, 7, 9]})
new_predictions = model.predict(new_data)
print("New predictions:")
for i, prediction in enumerate(new_predictions):
    print("Instance {}: Predicted Result: {}".format(i + 1, prediction))
# --- Program 6: Binary Classifier (a second Logistic Regression example) ---
# Filename: binary_classifier.py
"""
Performs binary classification (using Logistic Regression) on a dataset.
"""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
data = pd.read_csv('lab3.csv')
print(data)
X = data[['studyhours']]
Y = data['examresult'] # Corrected target variable name
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)  # Corrected test_size
model = LogisticRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: ", accuracy)
new_data = pd.DataFrame({'studyhours': [1, 7, 9]})
new_predictions = model.predict(new_data)
print("New predictions:")
for i, prediction in enumerate(new_predictions):
    print("Instance {}: Predicted Result: {}".format(i + 1, prediction))
# --- Program 7: Bias, Variance, Cross-Validation ---
# Filename: bias_variance_cv.py
"""
Demonstrates how to calculate bias and variance using cross-validation
for a linear regression model.
"""
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, LogisticRegression  # Added LogisticRegression import
from statistics import mean, stdev
data = pd.read_csv("wineQT.csv")
X_set = data.drop('quality', axis=1) # Corrected to X_set
y_set = data['quality']
# Linear Regression
model = LinearRegression()
scores = cross_val_score(model, X_set, y_set, cv=10)
print("Linear Regression Scores:", scores)
print("Linear Regression Bias (Mean):", mean(scores))
print("Linear Regression Variance (StDev):", stdev(scores))
# K-fold Cross Validation (K-list wasn't used correctly in the images)
# Here's a correct way to show how bias and variance change with different folds
k_values = [2, 5, 10, 20] # Example values for K
bias_scores = []
variance_scores = []
for k in k_values:
    model = LinearRegression()  # Create a new model for each K
    scores = cross_val_score(model, X_set, y_set, cv=k)
    bias_scores.append(mean(scores))
    variance_scores.append(stdev(scores))
print("\nLinear Regression K-Fold Validation Results:")
for i in range(len(k_values)):
    print(f"K={k_values[i]}: Bias={bias_scores[i]:.4f}, Variance={variance_scores[i]:.4f}")
# --- Program 8: K-Nearest Neighbors (KNN) ---
# Filename: knn.py
"""
Implements the K-Nearest Neighbors algorithm on the Iris dataset.
"""
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
iris = load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Correct predictions:")
for i in range(len(y_test)):
    if y_test[i] == y_pred[i]:
        print("True label:", iris.target_names[y_test[i]],
              "- Predicted label:", iris.target_names[y_pred[i]])
print("\nWrong Predictions:")
for i in range(len(y_test)):
    if y_test[i] != y_pred[i]:
        print("True label:", iris.target_names[y_test[i]],
              "- Predicted label:", iris.target_names[y_pred[i]])
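# Optional sketch: KNeighborsClassifier defaults to k=5; varying k shows the
# usual trade-off between flexible and smoother decision boundaries.
for k in (1, 3, 5, 7):
    model_k = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    print("k =", k, "accuracy =", accuracy_score(y_test, model_k.predict(X_test)))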
# --- Program 9: Locally Weighted Regression ---
# Filename: locally_weighted_regression.py
"""
Implements Locally Weighted Regression.
"""
import numpy as np
import matplotlib.pyplot as plt
def lwr(x_train, y_train, x_test, tau):
    y_pred = np.zeros(len(x_test))
    for i, test_point in enumerate(x_test):
        # Gaussian kernel weights: training points near the query point get more influence
        weights = np.exp(-((x_train - test_point) ** 2) / (2 * tau ** 2))
        X = np.vstack([np.ones_like(x_train), x_train]).T
        W = np.diag(weights)
        try:
            # Weighted normal equation: theta = (X^T W X)^-1 X^T W y
            theta = np.linalg.inv(X.T @ W @ X) @ (X.T @ (W @ y_train))
            y_pred[i] = np.array([1, test_point]) @ theta
        except np.linalg.LinAlgError:
            print("Singular matrix encountered. Adjusting tau or the data may be needed.")
            y_pred[i] = 0  # fallback default value
    return y_pred
np.random.seed(42)
x_train = np.linspace(0, 10, 100)
y_train = 2 * np.sin(x_train) + np.random.normal(0, 0.2, 100)
x_test = np.linspace(0, 10, 50)
tau = 0.1 # Corrected tau value (0.01 was likely too small)
y_pred = lwr(x_train, y_train, x_test, tau)
plt.scatter(x_train, y_train, color='blue')
plt.plot(x_test, y_pred, color='red')
plt.title("Locally Weighted Regression")
plt.xlabel('x')
plt.ylabel('y')
plt.show()
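# Optional sketch: a larger bandwidth gives a smoother, higher-bias fit.
# The value tau = 1.0 below is chosen only for illustration.
y_pred_smooth = lwr(x_train, y_train, x_test, tau=1.0)
plt.scatter(x_train, y_train, color='blue')
plt.plot(x_test, y_pred, color='red', label='tau = 0.1')
plt.plot(x_test, y_pred_smooth, color='green', label='tau = 1.0')
plt.title("Effect of bandwidth tau")
plt.legend()
plt.show()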
# --- Program 10: Naive Bayes ---
# Filename: naive_bayes.py
"""
Implements Naive Bayes classification on the Iris dataset.
"""
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score
iris = load_iris()
X, y = iris.data, iris.target
target_names = iris.target_names
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)
y_pred = nb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
print('Accuracy:', accuracy)
print("Precision:", precision)
# predict for new values
new_data = [[5.1, 3.5, 1.4, 0.2], [6.9, 3.2, 5.7, 2.3]] # Example Data
new_prediction = nb_classifier.predict(new_data)
print("New predictions:", [target_names[prediction] for prediction in new_prediction])
# --- Program 11: EM and K-Means Clustering ---
# Filename: em_kmeans.py
"""
Compares EM (Gaussian Mixture Model) and K-Means clustering on
a heart disease dataset.
"""
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
# Load the data, handling potential errors
try:
    data = pd.read_csv("heart.csv")  # Make sure 'heart.csv' is in the same directory
except FileNotFoundError:
    print("Error: The file 'heart.csv' was not found.")
    exit()
except pd.errors.EmptyDataError:
    print("Error: The file 'heart.csv' is empty.")
    exit()
except pd.errors.ParserError:
    print("Error: The file 'heart.csv' could not be parsed. Check the format.")
    exit()
features = ['trestbps', 'chol'] # Features for clustering
X = data[features]
# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# K-Means Clustering
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
kmeans.fit(X_scaled)
kmeans_labels = kmeans.labels_
# EM Clustering (Gaussian Mixture Model)
em = GaussianMixture(n_components=2, random_state=42)
em.fit(X_scaled)
em_labels = em.predict(X_scaled)
# Create a colormap
colormap = np.array(['red', 'green'])
# Create the plot
plt.figure(figsize=(14, 5))
# Original Data
plt.subplot(1, 3, 1)
plt.scatter(X['trestbps'], X['chol'], c=colormap[data['target']], s=40)
plt.title('Original Data')
plt.xlabel('trestbps')
plt.ylabel('chol')
# K-Means Clustering
plt.subplot(1, 3, 2)
plt.scatter(X['trestbps'], X['chol'], c=colormap[kmeans_labels], s=40)
plt.title('K-Means Clustering')
plt.xlabel('trestbps')
plt.ylabel('chol')
# EM Clustering
plt.subplot(1, 3, 3)
plt.scatter(X['trestbps'], X['chol'], c=colormap[em_labels], s=40)
plt.title('EM Clustering')
plt.xlabel('trestbps')
plt.ylabel('chol')
plt.show()
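# Optional comparison sketch (assumes both clusterings above ran on X_scaled):
# the silhouette score summarises each clustering in one number, higher is better.
from sklearn.metrics import silhouette_score
print("K-Means silhouette score:", silhouette_score(X_scaled, kmeans_labels))
print("EM (GMM) silhouette score:", silhouette_score(X_scaled, em_labels))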
# --- Program 12: Exploratory Data Analysis (EDA) ---
# Filename: exploratory_data_analysis.py
"""
Performs Exploratory Data Analysis on the Iris dataset using pandas and matplotlib.
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load data, with error handling
try:
    df = pd.read_csv('IRIS.csv')  # Change filename to IRIS.csv
except FileNotFoundError:
    print("Error: The file 'IRIS.csv' was not found.")
    exit()
except pd.errors.EmptyDataError:
    print("Error: The file 'IRIS.csv' is empty.")
    exit()
except pd.errors.ParserError:
    print("Error: The file 'IRIS.csv' could not be parsed.")
    exit()
print("Exploratory data analysis of iris dataset:\n")
print("First few rows:\n", df.head())
print("\nData information:\n", df.info())
print("\nMissing values:\n", df.isnull().sum())
print("\nColumn names:\n", df.columns)
print("\nValue counts for 'species':\n", df['species'].value_counts())
print("\nData types of each column:\n", df.dtypes)
print("\nCorrelation matrix:\n", df.corr(numeric_only=True)) # numeric_only added
# --- Program 13: Bayesian Network ---
# Filename: bayesian_network.py
"""
Constructs a Bayesian network for diagnosing heart disease using pgmpy.
"""
import numpy as np
import pandas as pd
from pgmpy.models import BayesianModel  # called BayesianNetwork in newer pgmpy releases
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination
# Load the data, handling possible errors
try:
    heartDisease = pd.read_csv('heart.csv')  # Replace with correct path
except FileNotFoundError:
    print("Error: The file 'heart.csv' was not found.")
    exit()
except pd.errors.EmptyDataError:
    print("Error: The file 'heart.csv' is empty.")
    exit()
except pd.errors.ParserError:
    print("Error: The file 'heart.csv' could not be parsed.")
    exit()
heartDisease = heartDisease.replace('?', np.nan)
print("Few examples from dataset are:")
print(heartDisease.head())
model = BayesianModel([('age', 'trestbps'), ('age', 'fbs'), ('sex', 'trestbps'),
                       ('exang', 'trestbps'), ('trestbps', 'heartdisease'),
                       ('fbs', 'heartdisease'), ('heartdisease', 'restecg'),
                       ('heartdisease', 'thalach'), ('heartdisease', 'chol')])
print("\nLearning CPD using Maximum Likelihood Estimators")
model.fit(heartDisease, estimator=MaximumLikelihoodEstimator)
print("\nInferencing with Bayesian Network!")
HeartDisease_infer = VariableElimination(model)
print("\n1. Probability of Heart Disease given age = 30:")
q = HeartDisease_infer.query(variables=['heartdisease'], evidence={'age': 30})
print(q)  # recent pgmpy versions return a single factor rather than a dict
print("\n2. Probability of Heart Disease given cholesterol = 100:")
q = HeartDisease_infer.query(variables=['heartdisease'], evidence={'chol': 100})
print(q)
# --- Program 14: Support Vector Machine (SVM) ---
# Filename: svm_classification.py
"""
Implements Support Vector Machine (SVM) classification on the Iris dataset.
"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
# Load data
iris = load_iris()
X = iris.data[:, :2] # Use only the first two features for visualization
y = iris.target
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
# Scale data
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# Train SVM classifier
svm_classifier = SVC(kernel='linear', C=1.0, random_state=0)
svm_classifier.fit(X_train, y_train)
# Make predictions
y_pred_train = svm_classifier.predict(X_train)
y_pred_test = svm_classifier.predict(X_test)
# Calculate accuracy
accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
print("Training accuracy:", accuracy_train)
print("Testing accuracy:", accuracy_test)
# Plot decision boundary (adapted from example)
def plot_decision_boundary(classifier, X, y):
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                         np.arange(y_min, y_max, 0.02))
    Z = classifier.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, alpha=0.4)
    plt.scatter(X[:, 0], X[:, 1], c=y, marker='o', edgecolors='k')
    plt.xlabel("Feature 1")
    plt.ylabel("Feature 2")
    plt.title("Decision Boundary")
    plt.show()
plot_decision_boundary(svm_classifier, X_train, y_train)
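# Optional variant (sketch): a non-linear RBF kernel can separate the scaled
# two-feature Iris data differently than the linear kernel used above.
rbf_classifier = SVC(kernel='rbf', gamma='scale', C=1.0, random_state=0)
rbf_classifier.fit(X_train, y_train)
print("RBF testing accuracy:", accuracy_score(y_test, rbf_classifier.predict(X_test)))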
# --- Program 15: Principal Component Analysis (PCA) ---
# Filename: pca_analysis.py
"""
Demonstrates Principal Component Analysis (PCA) using scikit-learn.
"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
# Example 1 (Simple array)
X = np.array([[1, 2, 3],
              [4, 5, 6],
              [7, 8, 9]])
pca = PCA(n_components=2)
pca.fit(X)
X_transformed = pca.transform(X)
print("Original data:\n", X)
print("\nTransformed data:\n", X_transformed)
# Example 2 (Iris dataset)
iris = load_iris()
X = iris.data
y = iris.target
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
plt.figure(figsize=(8, 6))
for i in range(len(iris.target_names)):
    plt.scatter(X_pca[y == i, 0], X_pca[y == i, 1], label=iris.target_names[i])
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA of Iris Dataset')
plt.legend()
plt.show()
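# A small addition (sketch): report how much variance each principal component
# captures for the Iris projection above.
print("Explained variance ratio:", pca.explained_variance_ratio_)
print("Total variance captured:", sum(pca.explained_variance_ratio_))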