A DATA SCIENCE PROJECT
GLOBAL SUPER-STORE
AND
SALES DATA
TASK 2(1)
import pandas as pd
from google.colab import files
import matplotlib.pyplot as plt
import seaborn as sns
# Prompt for a file upload in Colab and read the first uploaded file as a CSV.
uploaded = files.upload()
data = pd.read_csv(list(uploaded.keys())[0], encoding='ISO-8859-1')

# Show a preview of the loaded table. (Previously the literal string "data"
# was printed twice and the table itself was never shown.)
print("data")
print(data.head())

# Columns are selected by label; a DataFrame has no region()/profit()/sales()
# methods, and `data.product-category()` is not a valid attribute access.
# NOTE(review): 'Region' and 'Category' are assumed column names -- confirm
# them against the header row of the uploaded CSV.
print(data['Region'])
print(data['Category'])
print(data['Profit'])
print(data['Sales'])

print("Any missing values")
print(data.isnull().sum())   # number of missing cells per column
print(data.notnull())        # boolean mask: True where a cell has a value

# Remove fully duplicated rows before the statistics below are computed.
data = data.drop_duplicates()
print("Handling outliers")
# print() rejects unknown keyword arguments, so `print(Q1=...)` raised a
# TypeError and never bound Q1/Q3. Assign first, then print.
# numeric_only=True restricts the quantiles to numeric columns; text columns
# cannot be ranked this way.
Q1 = data.quantile(0.25, numeric_only=True)
Q3 = data.quantile(0.75, numeric_only=True)
print(Q1)
print(Q3)
# Inter-quartile range per numeric column. This section only reports the
# values; no rows are filtered out of `data` here.
IQR = Q3 - Q1
print(IQR)
# Summary statistics for the two measures of interest.
print("Descriptive Statistics")
sales_profit = data[['Sales', 'Profit']]
print(sales_profit.describe())

# Spread of each measure: variance first, then standard deviation.
spread_reports = (
    ("Sales Variance:", data['Sales'].var),
    ("Sales Standard Deviation:", data['Sales'].std),
    ("Profit Variance:", data['Profit'].var),
    ("Profit Standard Deviation:", data['Profit'].std),
)
for label, compute in spread_reports:
    print(label, compute())

# Pairwise correlation of every numeric column, then only the Sales and
# Profit columns of that matrix are shown.
print("correlation")
corr_matrix = data.corr(numeric_only=True)
sales_profit_corr = corr_matrix[['Sales', 'Profit']]
print(sales_profit_corr)
# Histogram of Sales with a kernel-density overlay.
# (Python's boolean literal is `True`; lowercase `true` is an undefined name.)
sns.histplot(data['Sales'], kde=True)
plt.title("Sales Distribution")
plt.show()

# Box plot of Profit. The column is passed through the `x=` keyword; the
# previous `x-data['Profit']` was a subtraction from an undefined name.
sns.boxplot(x=data['Profit'])
plt.title("Profit Boxplot")
plt.show()

# Annotated heatmap of the correlations between the numeric columns.
sns.heatmap(data.corr(numeric_only=True), annot=True, cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.show()
SALES DATA
TASK 2(2)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Ask for a file upload and load the first uploaded file as a CSV.
uploaded = files.upload()
first_uploaded_name = list(uploaded)[0]
df = pd.read_csv(first_uploaded_name, encoding='ISO-8859-1')

# First rows of the table.
print("The data:")
preview = df.head()
display(preview)

# Column overview as reported by DataFrame.info().
print("Dataset Information")
df.info()

# Summary statistics as reported by DataFrame.describe().
print("Statistical Summary")
summary = df.describe()
display(summary)
# Count fully duplicated rows, report the count, then drop them.
# (`df,duplicates()` was a typo for the DataFrame.duplicated() method.)
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows:{duplicates}")
df = df.drop_duplicates()

# The f-string previously had a stray quote before the replacement field,
# which is a syntax error.
print(f"Missing values before cleaning:{df.isnull().sum()}")

# Fill gaps in numeric columns with each column's mean.
df = df.fillna(df.select_dtypes(include='number').mean())
# Fill gaps in Region and Date with the most frequent value of that column
# (`fd` was a typo for `df`).
df['Region'] = df['Region'].fillna(df['Region'].mode()[0])
df['Date'] = df['Date'].fillna(df['Date'].mode()[0])
print(f"Missing values after cleaning:{df.isnull().sum()}")

# Parse Date as day-month-year; values that do not match the format raise.
df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y')

print("Data after cleaning:")
display(df.head())
FIGURE_SIZE = (8, 6)

# Scatter plot: Profit against Discount.
plt.figure(figsize=FIGURE_SIZE)
sns.scatterplot(data=df, x='Discount', y='Profit', color='orange')
plt.title('Profit vs Discount')
plt.xlabel('Discount')
plt.ylabel('Profit')
plt.show()

# Bar chart: total Sales summed per Region.
plt.figure(figsize=FIGURE_SIZE)
sales_grouped_by_region = df.groupby('Region')['Sales']
region_sales = sales_grouped_by_region.sum()
region_sales.plot(kind='bar', color='green')
plt.title('Sales by region')
plt.ylabel('Total Sales')
plt.show()

# Heatmap: correlations between the numeric columns, with cell annotations.
plt.figure(figsize=FIGURE_SIZE)
numeric_columns = df.select_dtypes(include='number')
numeric_corr = numeric_columns.corr()
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Predictors are Profit and Discount; the target is Sales.
feature_columns = ['Profit', 'Discount']
X = df[feature_columns]
Y = df['Sales']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
split_parts = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_test, Y_train, Y_test = split_parts

# Fit an ordinary linear regression on the training rows and predict the
# held-out rows.
model = LinearRegression()
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)

# Report the error and goodness of fit on the held-out rows.
mse_value = mean_squared_error(Y_test, Y_pred)
r2_value = r2_score(Y_test, Y_pred)
print(f"Mean Squared Error:{mse_value:.3f}")
print(f"R-squared Score:{r2_value:.2f}")