FHA - Python Program Unit 1-4

The document provides Python code for various statistical analyses, including frequency distribution, regression, correlation, and one-way ANOVA. It includes functions for generating data, calculating regression coefficients, visualizing data, and evaluating models. Additionally, it demonstrates how to interpret correlation coefficients and perform ANOVA using data from an Excel file.


FREQUENCY DISTRIBUTION

import numpy as np
import pandas as pd
def make_frequency_distribution(data, user_input=None, extra=True):
    """
    Function to make a frequency distribution.

    Args:
        data (numpy.array): data containing records.
        user_input (tuple, optional): user input for
            start_value, end_value, total_classes. Defaults to None.
        extra (bool, optional): whether to add extra columns such as
            class midpoint, relative frequency and cumulative frequency.

    Returns:
        pandas.DataFrame: the required frequency distribution.
    """
    ## total number of observations
    length = len(data)

    ## lowest and highest number in the data
    lowest = min(data)
    highest = max(data)

    ## total number of classes
    if user_input is None:
        total_classes = int(np.sqrt(length))
    else:
        lowest, highest, total_classes = user_input

    ## range of the data
    range_ = highest - lowest

    print(f"Start value: {lowest}")
    print(f"End value: {highest}")
    print(f"Range: {range_}")
    print(f"Total Number of Classes: {total_classes}")

    ## class width
    width = range_ / total_classes

    ## list of all class boundaries
    class_intervals = [
        np.round(start, 3)
        for start in np.linspace(lowest, highest, total_classes + 1)
    ]
    print(f"Class Width = {np.round(width, 3)}", end="\n\n")

    ## frequency of each class (np.histogram includes the right edge
    ## in the last bin, so the maximum value falls in the final class)
    hist, _ = np.histogram(data, bins=class_intervals)

    ## frequency table
    df = pd.DataFrame(
        {
            "Class Intervals": [
                f"{first} - under {second}"
                for first, second in zip(class_intervals, class_intervals[1:])
            ],
            "Frequency": hist,
        }
    )

    if extra:
        ## class midpoint: average of the two boundary values parsed
        ## back out of the "Class Intervals" string
        df["Class Midpoint"] = df["Class Intervals"].apply(
            lambda x: (float(x.split(' ')[0]) + float(x.split(' ')[-1])) / 2
        )

        ## relative frequency
        df["Relative Frequency"] = df["Frequency"] / df["Frequency"].sum()

        ## cumulative frequency
        df["Cumulative Frequency"] = df["Frequency"].cumsum()

    return df

## data
test_scores = np.array([
52, 92, 84, 74, 65, 55, 78, 95, 62,
72, 64, 74, 82, 94, 71, 79, 73, 94,
77, 53, 77, 87, 97, 57, 72, 89, 76,
91, 86, 99, 71, 73, 58, 76, 33, 78, 69
])

## without specifying user input
make_frequency_distribution(test_scores)
Start value: 33
End value: 99
Range: 66
Total Number of Classes: 6
Class Width = 11.0

   Class Intervals     Frequency  Class Midpoint  Relative Frequency  Cumulative Frequency
0  33.0 - under 44.0           1            38.5            0.027027                     1
1  44.0 - under 55.0           2            49.5            0.054054                     3
2  55.0 - under 66.0           6            60.5            0.162162                     9
3  66.0 - under 77.0          11            71.5            0.297297                    20
4  77.0 - under 88.0           9            82.5            0.243243                    29
5  88.0 - under 99.0           8            93.5            0.216216                    37
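
With user_input supplied, the same function uses caller-chosen class boundaries instead of deriving them from the data. A minimal usage sketch (the start value 30, end value 100 and class count 7 are illustrative choices, not values from the original program):

## with a user-specified start value, end value and number of classes
make_frequency_distribution(test_scores, user_input=(30, 100, 7))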

HYPOTHESIS TESTING
PROBABILITY DISTRIBUTION
REGRESSION

import numpy as np
import matplotlib.pyplot as plt

def generate_data(size=100, seed=None):
    if seed is not None:
        np.random.seed(seed)
    independent_variable = np.random.rand(size) * 30  # Independent variable with variability
    noise = np.random.randn(size) * 5  # Noise for variability in the dependent variable
    dependent_variable = 2 * independent_variable + 10 + noise  # Dependent variable with a linear relationship
    return independent_variable, dependent_variable

def calculate_regression_coefficients(x, y):
    x_mean, y_mean = np.mean(x), np.mean(y)
    numerator = np.sum((x - x_mean) * (y - y_mean))
    denominator = np.sum((x - x_mean)**2)
    slope = numerator / denominator
    intercept = y_mean - slope * x_mean
    return slope, intercept

def create_regression_line(x, slope, intercept):
    return slope * x + intercept

def visualize_data_and_regression(x, y, slope, intercept):
    plt.scatter(x, y, alpha=0.7, label='Data Points')
    regression_line = create_regression_line(x, slope, intercept)
    plt.plot(x, regression_line, color='red', label='Regression Line')
    plt.title('Scatterplot with Regression Line')
    plt.xlabel('Independent Variable')
    plt.ylabel('Dependent Variable')
    plt.legend()
    #plt.grid(True)
    plt.show()

def predict(slope, intercept, x):
    return slope * x + intercept

def evaluate_model(actual, predicted):
    mse = np.mean((actual - predicted)**2)
    r_squared = 1 - (np.sum((actual - predicted)**2)
                     / np.sum((actual - np.mean(actual))**2))
    return mse, r_squared

def draw_conclusions(slope, intercept, mse, r_squared):
    print(f"Slope: {slope}")
    print(f"Intercept: {intercept}")
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"R-squared: {r_squared}")

# Main Program
independent_variable, dependent_variable = generate_data(seed=42)
slope, intercept = calculate_regression_coefficients(independent_variable,
                                                     dependent_variable)
visualize_data_and_regression(independent_variable, dependent_variable,
                              slope, intercept)
predictions = predict(slope, intercept, independent_variable)
mse, r_squared = evaluate_model(dependent_variable, predictions)
draw_conclusions(slope, intercept, mse, r_squared)
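
As a quick sanity check on calculate_regression_coefficients, the same least-squares fit can be recovered from NumPy's polyfit (an illustrative cross-check, not part of the original program):

## cross-check: a degree-1 np.polyfit returns [slope, intercept]
## estimated by the same least-squares criterion
check_slope, check_intercept = np.polyfit(independent_variable,
                                          dependent_variable, 1)
print(f"np.polyfit slope: {check_slope}, intercept: {check_intercept}")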


CORRELATION

import numpy as np
import matplotlib.pyplot as plt

# Generate Data
def generate_data(size=100, seed=None):
    if seed is not None:
        np.random.seed(seed)
    variable1 = np.random.rand(size) * 30  # First variable with variability
    variable2 = variable1 + np.random.randn(size) * 10  # Second variable with variability and a linear relationship
    return variable1, variable2

# Calculate Correlation Coefficient
def calculate_correlation_coefficient(x, y):
    ## np.cov uses ddof=1 (sample covariance) by default, so the
    ## standard deviations must also use ddof=1 for a correct Pearson r
    covariance_matrix = np.cov(x, y)
    correlation_coefficient = covariance_matrix[0, 1] / (
        np.std(x, ddof=1) * np.std(y, ddof=1)
    )
    return correlation_coefficient

# Create Scatter plot
def create_scatter_plot(x, y):
    plt.scatter(x, y, alpha=0.7)
    plt.title('Scatterplot of Variable 1 vs Variable 2')
    plt.xlabel('Variable 1')
    plt.ylabel('Variable 2')
    plt.grid(True)
    plt.show()

# Interpret Correlation Coefficient
def interpret_correlation_coefficient(correlation_coefficient):
    if correlation_coefficient > 0:
        correlation_type = "positive"
    elif correlation_coefficient < 0:
        correlation_type = "negative"
    else:
        correlation_type = "no apparent"
    return correlation_type

# Experiment
seed = 42  # change the seed to observe different scenarios
variable1, variable2 = generate_data(seed=seed)
correlation_coefficient = calculate_correlation_coefficient(variable1, variable2)
create_scatter_plot(variable1, variable2)
print(f"Pearson Correlation Coefficient: {correlation_coefficient}")
correlation_type = interpret_correlation_coefficient(correlation_coefficient)
print(f"The scatterplot shows a {correlation_type} correlation.")
ONE WAY ANOVA - COMPLETELY RANDOMIZED DESIGN

import pandas as pd
import numpy as np
import math
from scipy import stats
import scipy
import statsmodels.api as sm
from statsmodels.formula.api import ols
from matplotlib import pyplot as plt

## load the treatment data from the Excel file
data = pd.read_excel('/content/tm.xlsx')
data
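
The listing stops after loading the data, but the statsmodels imports above point to the intended analysis. A minimal sketch of the one-way ANOVA that would follow, assuming the Excel sheet contains a numeric response column named 'value' and a treatment column named 'group' (both column names are assumptions, not taken from tm.xlsx):

## fit the one-way ANOVA model: response explained by treatment group
## (column names 'value' and 'group' are assumed for illustration)
model = ols('value ~ C(group)', data=data).fit()
anova_table = sm.stats.anova_lm(model, typ=1)
print(anova_table)

## equivalent F-test with scipy.stats, grouping the response by treatment
groups = [g['value'].values for _, g in data.groupby('group')]
f_stat, p_value = stats.f_oneway(*groups)
print(f"F = {f_stat}, p = {p_value}")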
