FREQUENCY DISTRIBUTION
import numpy as np
import pandas as pd
def make_frequency_distribution(data, user_input=None, extra=True):
    """
    Build a grouped frequency distribution table for numeric data.

    The interval from the start value to the end value is split into
    equal-width classes. The start value, end value, range, number of
    classes and class width are printed before the table is returned.

    Args:
        data (numpy.array): data containing records; must not be empty.
        user_input (tuple, optional):
            (start_value, end_value, total_classes). When given, these
            replace the data minimum, the data maximum and the default
            class count. Records outside [start_value, end_value] are
            not counted. Defaults to None.
        extra (bool, optional): add the class midpoint, relative
            frequency and cumulative frequency columns.
            Defaults to True.

    Returns:
        pandas.DataFrame: required frequency distribution.

    Raises:
        ValueError: if data is empty or total_classes is less than 1.
    """
    ## total number of observations
    length = len(data)
    if length == 0:
        raise ValueError("data must contain at least one record")

    if user_input is None:
        ## lowest and highest number in the data
        lowest = min(data)
        highest = max(data)
        ## default number of classes: square root of the sample size
        total_classes = int(np.sqrt(length))
    else:
        lowest, highest, total_classes = user_input

    if total_classes < 1:
        raise ValueError("total_classes must be at least 1")

    ## range of the data
    range_ = highest - lowest

    print(f"Start value: {lowest}")
    print(f"End value: {highest}")
    print(f"Range: {range_}")
    print(f"Total Number of Classes: {total_classes}")

    ## calculate width
    width = range_ / total_classes

    ## exact class boundaries are used for counting; the rounded copies
    ## are only for display. Counting against rounded boundaries could
    ## push the first or last boundary inside the data range and drop
    ## the minimum or maximum record.
    exact_edges = np.linspace(lowest, highest, total_classes + 1)
    class_intervals = np.round(exact_edges, 3)

    print(f"Class Width = {np.round(width, 3)}", end="\n\n")

    ## calculate frequency for each class
    hist, _ = np.histogram(data, bins=exact_edges)

    ## frequency table
    df = pd.DataFrame(
        {
            "Class Intervals": [
                f"{first} - under {second}"
                for first, second in zip(class_intervals,
                                         class_intervals[1:])
            ],
            "Frequency": hist,
        }
    )

    if extra:
        ## class midpoint, taken from the displayed boundaries directly
        ## instead of parsing them back out of the label strings
        df["Class Midpoint"] = (
            class_intervals[:-1] + class_intervals[1:]
        ) / 2
        ## relative frequency
        df["Relative Frequency"] = df["Frequency"] / df["Frequency"].sum()
        ## cumulative frequency
        df["Cumulative Frequency"] = df["Frequency"].cumsum()

    return df
## data: 37 recorded test scores
test_scores = np.array(
    [
        52, 92, 84, 74, 65, 55, 78, 95, 62, 72,
        64, 74, 82, 94, 71, 79, 73, 94, 77, 53,
        77, 87, 97, 57, 72, 89, 76, 91, 86, 99,
        71, 73, 58, 76, 33, 78, 69,
    ]
)
## let the function pick the start value, end value and class count
make_frequency_distribution(test_scores)
Start value: 33
End value: 99
Range: 66
Total Number of Classes: 6
Class Width = 11.0
     Class Intervals  Frequency  Class Midpoint  Relative Frequency  Cumulative Frequency
0  33.0 - under 44.0          1            38.5            0.027027                     1
1  44.0 - under 55.0          2            49.5            0.054054                     3
2  55.0 - under 66.0          6            60.5            0.162162                     9
3  66.0 - under 77.0         11            71.5            0.297297                    20
4  77.0 - under 88.0          9            82.5            0.243243                    29
5  88.0 - under 99.0          8            93.5            0.216216                    37
HYPOTHESIS TESTING
PROBABILITY DISTRIBUTION
REGRESSION
import numpy as np
import matplotlib.pyplot as plt
def generate_data(size=100, seed=None):
    """
    Generate synthetic data with a noisy linear relationship.

    The independent variable is uniform on [0, 30). The dependent
    variable is 2 * x + 10 plus normally distributed noise scaled by 5.

    Args:
        size (int, optional): number of observations. Defaults to 100.
        seed (int, optional): when given, seeds NumPy's global random
            generator so the output is reproducible. Defaults to None.

    Returns:
        tuple: (independent_variable, dependent_variable) arrays, each
        of length size.
    """
    if seed is not None:
        np.random.seed(seed)
    # Independent variable with variability
    independent_variable = np.random.rand(size) * 30
    # Noise for variability in the dependent variable
    noise = np.random.randn(size) * 5
    # Dependent variable with a linear relationship
    dependent_variable = 2 * independent_variable + 10 + noise
    return independent_variable, dependent_variable
def calculate_regression_coefficients(x, y):
    """
    Fit the least-squares line y = slope * x + intercept.

    Args:
        x (array-like): values of the independent variable.
        y (array-like): values of the dependent variable, same shape
            as x.

    Returns:
        tuple: (slope, intercept) of the fitted line.

    Raises:
        ValueError: if x and y differ in shape, are empty, or if all
            x values are identical (the slope is then undefined).
    """
    # Accept lists and tuples as well as arrays.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if x.shape != y.shape:
        raise ValueError("x and y must have the same shape")
    if x.size == 0:
        raise ValueError("x and y must not be empty")

    x_mean, y_mean = np.mean(x), np.mean(y)
    x_centered = x - x_mean
    numerator = np.sum(x_centered * (y - y_mean))
    denominator = np.sum(x_centered**2)
    # A zero denominator means x has no spread, so no unique line exists.
    if denominator == 0:
        raise ValueError("x must contain at least two distinct values")

    slope = numerator / denominator
    intercept = y_mean - slope * x_mean
    return slope, intercept
def create_regression_line(x, slope, intercept):
    """
    Evaluate the line slope * x + intercept at x.

    Args:
        x: point(s) at which to evaluate the line.
        slope: slope of the line.
        intercept: intercept of the line.

    Returns:
        The value(s) of the line at x.
    """
    scaled = slope * x
    return scaled + intercept
def visualize_data_and_regression(x, y, slope, intercept):
    """
    Show a scatterplot of the data with the regression line on top.

    Args:
        x: values of the independent variable.
        y: values of the dependent variable.
        slope: slope of the regression line.
        intercept: intercept of the regression line.
    """
    # Fitted values of the line at each observed x.
    fitted_values = slope * x + intercept

    plt.scatter(x, y, alpha=0.7, label='Data Points')
    plt.plot(x, fitted_values, color='red', label='Regression Line')

    plt.xlabel('Independent Variable')
    plt.ylabel('Dependent Variable')
    plt.title('Scatterplot with Regression Line')
    plt.legend()
    plt.show()
def predict(slope, intercept, x):
    """
    Predict the dependent variable from a fitted line.

    Args:
        slope: slope of the fitted line.
        intercept: intercept of the fitted line.
        x: value(s) of the independent variable.

    Returns:
        slope * x + intercept.
    """
    prediction = slope * x
    prediction = prediction + intercept
    return prediction
def evaluate_model(actual, predicted):
    """
    Compute the mean squared error and R-squared of predictions.

    Args:
        actual (array-like): observed values.
        predicted (array-like): predicted values.

    Returns:
        tuple: (mse, r_squared). r_squared is NaN when every value in
        actual is identical, because the total sum of squares is then
        zero and R-squared is undefined.

    Raises:
        ValueError: if actual is empty.
    """
    # Accept lists and tuples as well as arrays.
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)
    if actual.size == 0:
        raise ValueError("actual must not be empty")

    # Squared residuals are shared by both metrics; compute them once.
    squared_errors = (actual - predicted) ** 2
    mse = np.mean(squared_errors)

    residual_ss = np.sum(squared_errors)
    total_ss = np.sum((actual - np.mean(actual)) ** 2)
    if total_ss == 0:
        r_squared = float("nan")
    else:
        r_squared = 1 - (residual_ss / total_ss)
    return mse, r_squared
def draw_conclusions(slope, intercept, mse, r_squared):
    """
    Print the fitted coefficients and fit metrics, one per line.

    Args:
        slope: slope of the fitted line.
        intercept: intercept of the fitted line.
        mse: mean squared error of the predictions.
        r_squared: R-squared of the predictions.
    """
    report = (
        ("Slope", slope),
        ("Intercept", intercept),
        ("Mean Squared Error (MSE)", mse),
        ("R-squared", r_squared),
    )
    for label, value in report:
        print(f"{label}: {value}")
# Main Program: generate data, fit the line, plot it, then score the fit
independent_variable, dependent_variable = generate_data(seed=42)

slope, intercept = calculate_regression_coefficients(
    independent_variable, dependent_variable
)

visualize_data_and_regression(
    independent_variable, dependent_variable, slope, intercept
)

# Score the fit on the same data the line was fitted to
predictions = predict(slope, intercept, independent_variable)
mse, r_squared = evaluate_model(dependent_variable, predictions)

draw_conclusions(slope, intercept, mse, r_squared)
CORRELATION
import numpy as np
import matplotlib.pyplot as plt
# Generate Data
def generate_data(size=100, seed=None):
    """
    Generate two linearly related variables with random scatter.

    The first variable is uniform on [0, 30). The second is the first
    plus normally distributed noise scaled by 10.

    Args:
        size (int, optional): number of observations. Defaults to 100.
        seed (int, optional): when given, seeds NumPy's global random
            generator so the output is reproducible. Defaults to None.

    Returns:
        tuple: (variable1, variable2) arrays, each of length size.
    """
    if seed is not None:
        np.random.seed(seed)
    # First variable with variability
    variable1 = np.random.rand(size) * 30
    # Second variable with variability and a linear relationship
    variable2 = variable1 + np.random.randn(size) * 10
    return variable1, variable2
# Calculate Correlation Coefficient
def calculate_correlation_coefficient(x, y):
    """
    Compute the Pearson correlation coefficient of x and y.

    Args:
        x (array-like): observations of the first variable.
        y (array-like): observations of the second variable.

    Returns:
        float: the correlation coefficient, or NaN when either
        variable has zero variance (the coefficient is undefined).
    """
    covariance_matrix = np.cov(x, y)
    # Take both variances from the same covariance matrix so the
    # numerator and denominator share one normalization. Mixing np.cov
    # (divides by n - 1 by default) with np.std (divides by n by
    # default) inflates the result by n / (n - 1) and can exceed 1.
    variance_product = covariance_matrix[0, 0] * covariance_matrix[1, 1]
    if variance_product == 0:
        return float("nan")
    correlation_coefficient = covariance_matrix[0, 1] / np.sqrt(
        variance_product
    )
    return correlation_coefficient
# Create Scatter plot
def create_scatter_plot(x, y):
    """
    Show a scatterplot of y against x with a grid.

    Args:
        x: values plotted on the horizontal axis (Variable 1).
        y: values plotted on the vertical axis (Variable 2).
    """
    plt.scatter(x, y, alpha=0.7)

    # Axis labels, title and grid
    plt.xlabel('Variable 1')
    plt.ylabel('Variable 2')
    plt.title('Scatterplot of Variable 1 vs Variable 2')
    plt.grid(True)

    plt.show()
# Interpret the correlation coefficient
def interpret_correlation_coefficient(correlation_coefficient):
    """
    Describe the direction of a correlation by the sign of its coefficient.

    Args:
        correlation_coefficient: the correlation coefficient to classify.

    Returns:
        str: "positive" for a coefficient above zero, "negative" for one
        below zero, and "no apparent" otherwise.
    """
    if correlation_coefficient > 0:
        return "positive"
    if correlation_coefficient < 0:
        return "negative"
    # Neither above nor below zero: exactly zero, or a NaN coefficient.
    return "no apparent"
# Experiment: generate data, measure the correlation, plot and describe it
seed = 42  # Change the seed to observe different scenarios
variable1, variable2 = generate_data(seed=seed)
correlation_coefficient = calculate_correlation_coefficient(
    variable1, variable2
)
create_scatter_plot(variable1, variable2)
print(f"Pearson Correlation Coefficient: {correlation_coefficient}")
# Turn the sign of the coefficient into a description for the report line
correlation_type = interpret_correlation_coefficient(correlation_coefficient)
print(f"The scatterplot shows a {correlation_type} correlation.")
ONE-WAY ANOVA - COMPLETELY RANDOMIZED DESIGN
import pandas as pd
import numpy as np
import math
from scipy import stats
import scipy
import statsmodels.api as sm
from statsmodels.formula.api import ols
from matplotlib import pyplot as plt
# Load the data set from an Excel workbook into a pandas DataFrame.
# NOTE(review): '/content/...' looks like a hosted-notebook (Colab-style)
# path - confirm the file exists at this location in the target environment.
data=pd.read_excel('/content/tm.xlsx')
# Bare expression: presumably here so a notebook cell displays the
# DataFrame; it has no visible effect when run as a plain script.
data