6/16/23, 8:41 PM Credit_Card_fraud_detection using ML - Jupyter Notebook
In [24]:
!pip install scikit-learn
Requirement already satisfied: scikit-learn in c:\python39\lib\site-packages (1.2.2)
Requirement already satisfied: numpy>=1.17.3 in c:\python39\lib\site-packages (from scikit-lear
n) (1.24.1)
Requirement already satisfied: scipy>=1.3.2 in c:\python39\lib\site-packages (from scikit-lear
n) (1.10.1)
Requirement already satisfied: joblib>=1.1.1 in c:\python39\lib\site-packages (from scikit-lear
n) (1.2.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\python39\lib\site-packages (from scik
it-learn) (3.1.0)
In [2]:
#IMPORT THE REQUIRED LIBRARIES
In [72]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler as SS
In [8]:
#LOAD THE DATASET USING PANDAS LIBRARY
In [9]:
dataset=pd.read_csv('creditcard.csv')
In [10]:
#VIEWING THE DATASET USING head() and tail()
In [11]:
dataset.head()
Out[11]:
V4 V5 V6 V7 V8 V9 ... V21 V22 V23 V24 V25 V2
55 -0.338321 0.462388 0.239599 0.098698 0.363787 ... -0.018307 0.277838 -0.110474 0.066928 0.128539 -0.18911
54 0.060018 -0.082361 -0.078803 0.085102 -0.255425 ... -0.225775 -0.638672 0.101288 -0.339846 0.167170 0.12589
80 -0.503198 1.800499 0.791461 0.247676 -1.514654 ... 0.247998 0.771679 0.909412 -0.689281 -0.327642 -0.13909
91 -0.010309 1.247203 0.237609 0.377436 -1.387024 ... -0.108300 0.005274 -0.190321 -1.175575 0.647376 -0.22192
34 -0.407193 0.095921 0.592941 -0.270533 0.817739 ... -0.009431 0.798278 -0.137458 0.141267 -0.206010 0.50229
localhost:8888/notebooks/Credit_Card_fraud_detection using ML.ipynb# 1/12
6/16/23, 8:41 PM Credit_Card_fraud_detection using ML - Jupyter Notebook
In [12]:
dataset.tail()
Out[12]:
Time V1 V2 V3 V4 V5 V6 V7 V8 V9 ...
284802 172786.0 -11.881118 10.071785 -9.834783 -2.066656 -5.364473 -2.606837 -4.918215 7.305334 1.914428 ...
284803 172787.0 -0.732789 -0.055080 2.035030 -0.738589 0.868229 1.058415 0.024330 0.294869 0.584800 ...
284804 172788.0 1.919565 -0.301254 -3.249640 -0.557828 2.630515 3.031260 -0.296827 0.708417 0.432454 ...
284805 172788.0 -0.240440 0.530483 0.702510 0.689799 -0.377961 0.623708 -0.686180 0.679145 0.392087 ...
284806 172792.0 -0.533413 -0.189733 0.703337 -0.506271 -0.012546 -0.649617 1.577006 -0.414650 0.486180 ...
5 rows × 31 columns
In [13]:
#VIEW THE SHAPE OF THE DATASET
In [14]:
dataset.shape
Out[14]:
(284807, 31)
In [15]:
#CHECKING FOR NULL VALUES IN THE DATASET
localhost:8888/notebooks/Credit_Card_fraud_detection using ML.ipynb# 2/12
6/16/23, 8:41 PM Credit_Card_fraud_detection using ML - Jupyter Notebook
In [35]:
dataset.isnull().sum()
Out[35]:
Time 0
V1 0
V2 0
V3 0
V4 0
V5 0
V6 0
V7 0
V8 0
V9 0
V10 0
V11 0
V12 0
V13 0
V14 0
V15 0
V16 0
V17 0
V18 0
V19 0
V20 0
V21 0
V22 0
V23 0
V24 0
V25 0
V26 0
V27 0
V28 0
Amount 0
Class 0
dtype: int64
In [16]:
#INFORMATION ABOUT DATASET FEATURES
localhost:8888/notebooks/Credit_Card_fraud_detection using ML.ipynb# 3/12
6/16/23, 8:41 PM Credit_Card_fraud_detection using ML - Jupyter Notebook
In [19]:
dataset.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Time 284807 non-null float64
1 V1 284807 non-null float64
2 V2 284807 non-null float64
3 V3 284807 non-null float64
4 V4 284807 non-null float64
5 V5 284807 non-null float64
6 V6 284807 non-null float64
7 V7 284807 non-null float64
8 V8 284807 non-null float64
9 V9 284807 non-null float64
10 V10 284807 non-null float64
11 V11 284807 non-null float64
12 V12 284807 non-null float64
13 V13 284807 non-null float64
14 V14 284807 non-null float64
15 V15 284807 non-null float64
16 V16 284807 non-null float64
17 V17 284807 non-null float64
18 V18 284807 non-null float64
19 V19 284807 non-null float64
20 V20 284807 non-null float64
21 V21 284807 non-null float64
22 V22 284807 non-null float64
23 V23 284807 non-null float64
24 V24 284807 non-null float64
25 V25 284807 non-null float64
26 V26 284807 non-null float64
27 V27 284807 non-null float64
28 V28 284807 non-null float64
29 Amount 284807 non-null float64
30 Class 284807 non-null int64
dtypes: float64(30), int64(1)
memory usage: 67.4 MB
In [ ]:
#STATISTICAL MEASURES OF THE DATASET FEATURES
In [20]:
dataset.describe()
Out[20]:
Time V1 V2 V3 V4 V5 V6
count 284807.000000 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.8480
mean 94813.859575 1.168375e-15 3.416908e-16 -1.379537e-15 2.074095e-15 9.604066e-16 1.487313e-15 -5.556
std 47488.145955 1.958696e+00 1.651309e+00 1.516255e+00 1.415869e+00 1.380247e+00 1.332271e+00 1.2370
min 0.000000 -5.640751e+01 -7.271573e+01 -4.832559e+01 -5.683171e+00 -1.137433e+02 -2.616051e+01 -4.3557
25% 54201.500000 -9.203734e-01 -5.985499e-01 -8.903648e-01 -8.486401e-01 -6.915971e-01 -7.682956e-01 -5.540
50% 84692.000000 1.810880e-02 6.548556e-02 1.798463e-01 -1.984653e-02 -5.433583e-02 -2.741871e-01 4.010
75% 139320.500000 1.315642e+00 8.037239e-01 1.027196e+00 7.433413e-01 6.119264e-01 3.985649e-01 5.704
max 172792.000000 2.454930e+00 2.205773e+01 9.382558e+00 1.687534e+01 3.480167e+01 7.330163e+01 1.2058
8 rows × 31 columns
localhost:8888/notebooks/Credit_Card_fraud_detection using ML.ipynb# 4/12
6/16/23, 8:41 PM Credit_Card_fraud_detection using ML - Jupyter Notebook
In [21]:
#COUNTING THE TARGET VALUES OF VALID AND FRAUD TRANSACTIONS
In [22]:
dataset['Class'].value_counts()
Out[22]:
0 284315
1 492
Name: Class, dtype: int64
In [23]:
#SEPERATING VALID AND FRAUD TRANSACTIONS FOR ANALYSIS
In [24]:
real=dataset[dataset.Class==0]
fraud=dataset[dataset.Class==1]
In [25]:
#VIEW THE SHAPES OF VALID AND FRAUD TRANSACTIONS DATA
In [27]:
print(real.shape,fraud.shape)
(284315, 31) (492, 31)
In [ ]:
#STATISTICAL MEASURES ON VALID AND FRAUD DATA - AMOUNT COLUMN
In [28]:
real.Amount.describe()
Out[28]:
count 284315.000000
mean 88.291022
std 250.105092
min 0.000000
25% 5.650000
50% 22.000000
75% 77.050000
max 25691.160000
Name: Amount, dtype: float64
In [29]:
fraud.Amount.describe()
Out[29]:
count 492.000000
mean 122.211321
std 256.683288
min 0.000000
25% 1.000000
50% 9.250000
75% 105.890000
max 2125.870000
Name: Amount, dtype: float64
localhost:8888/notebooks/Credit_Card_fraud_detection using ML.ipynb# 5/12
6/16/23, 8:41 PM Credit_Card_fraud_detection using ML - Jupyter Notebook
In [30]:
#CONVERTING THE UNBALANCED DATASET TO EQUAL SIZE
In [31]:
real_trans=real.sample(n=492)
In [32]:
#SHAPE OF CONVERTED VALID TRANSACTIONS DATA
In [42]:
print(real_trans.shape)
(492, 31)
In [33]:
#CONCATENATING THE NEW VALID TRANSACTIONS DATA AND FRAUD DATA TO FORM NEW DATASET
In [34]:
new_dataset=pd.concat([real_trans,fraud],axis=0)
In [35]:
new_dataset.shape
Out[35]:
(984, 31)
localhost:8888/notebooks/Credit_Card_fraud_detection using ML.ipynb# 6/12
6/16/23, 8:41 PM Credit_Card_fraud_detection using ML - Jupyter Notebook
In [45]:
print(new_dataset)
Time V1 V2 V3 V4 V5 V6 \
175010 122163.0 -0.417411 0.160325 0.603045 -2.121531 0.351510 -0.465216
169964 119951.0 0.138774 0.905687 -2.094182 0.130775 3.505227 3.309589
50495 44516.0 -1.808419 -1.330914 1.886407 0.880290 1.535955 -1.565265
180946 124758.0 1.202082 -1.952323 -3.845221 -1.479496 0.658750 -0.957501
191993 129489.0 -0.237305 -0.187512 0.749235 -1.503247 0.127986 -0.541237
... ... ... ... ... ... ... ...
279863 169142.0 -1.927883 1.125653 -4.518331 1.749293 -1.566487 -2.010494
280143 169347.0 1.378559 1.289381 -5.004247 1.411850 0.442581 -1.326536
280149 169351.0 -0.676143 1.126366 -2.213700 0.468308 -1.120541 -0.003346
281144 169966.0 -3.113832 0.585864 -5.399730 1.817092 -0.840618 -2.943548
281674 170348.0 1.991976 0.158476 -2.583441 0.408670 1.151147 -0.096695
V7 V8 V9 ... V21 V22 V23 \
175010 0.117327 0.026004 -0.864069 ... 0.299839 0.757664 -0.264823
169964 0.629652 0.652173 -1.248424 ... 0.146853 0.526173 -0.288015
50495 -1.188445 0.097411 0.761794 ... 0.073893 0.519721 -0.106133
180946 1.595032 -0.676322 0.734134 ... 0.590924 0.752840 -0.898634
191993 0.112686 -0.033567 -0.825444 ... -0.295908 -0.422410 -0.046887
... ... ... ... ... ... ... ...
279863 -0.882850 0.697211 -2.064945 ... 0.778584 -0.319189 0.639419
280143 -1.413170 0.248525 -1.127396 ... 0.370612 0.028234 -0.145640
280149 -2.234739 1.210158 -0.652250 ... 0.751826 0.834108 0.190944
281144 -2.208002 1.058733 -1.632333 ... 0.583276 -0.269209 -0.456108
281674 0.223050 -0.068384 0.577829 ... -0.164350 -0.295135 -0.072173
V24 V25 V26 V27 V28 Amount Class
175010 0.372770 0.064349 -0.201284 0.122504 0.176890 29.00 0
169964 0.709037 -0.090712 1.137600 0.147682 0.175931 18.35 0
50495 0.439863 -1.101343 -0.752362 0.379485 0.226426 9.99 0
180946 -1.299067 0.794971 0.225243 -0.184362 -0.021596 547.17 0
191993 -0.683232 -0.603309 0.525136 0.121051 0.172332 15.68 0
... ... ... ... ... ... ... ...
279863 -0.294885 0.537503 0.788395 0.292680 0.147968 390.00 1
280143 -0.081049 0.521875 0.739467 0.389152 0.186637 0.76 1
280149 0.032070 -0.739695 0.471111 0.385107 0.194361 77.89 1
281144 -0.183659 -0.328168 0.606116 0.884876 -0.253700 245.00 1
281674 -0.450261 0.313267 -0.289617 0.002988 -0.015309 42.53 1
[984 rows x 31 columns]
In [36]:
#DIVIDING THE DATA INTO DEPENDENT AND INDEPENDENT VARIABLES
In [37]:
X=new_dataset.iloc[:,:-1]
y=new_dataset['Class']
localhost:8888/notebooks/Credit_Card_fraud_detection using ML.ipynb# 7/12
6/16/23, 8:41 PM Credit_Card_fraud_detection using ML - Jupyter Notebook
In [38]:
print(X)
print(y)
Time V1 V2 V3 V4 V5 V6 \
263860 161146.0 -4.412310 2.156901 -2.183607 -1.832658 1.023210 -0.591397
249255 154316.0 -0.204154 0.910504 0.734187 -0.332162 0.466750 -0.986063
187770 127683.0 -1.154785 0.819386 1.497317 -0.554252 -0.387083 0.308089
106821 70142.0 -1.769494 0.864547 1.015048 -1.010349 -0.687608 -1.248479
205828 135929.0 -1.325690 1.369450 -0.801471 -0.269350 0.006770 -0.888574
... ... ... ... ... ... ... ...
279863 169142.0 -1.927883 1.125653 -4.518331 1.749293 -1.566487 -2.010494
280143 169347.0 1.378559 1.289381 -5.004247 1.411850 0.442581 -1.326536
280149 169351.0 -0.676143 1.126366 -2.213700 0.468308 -1.120541 -0.003346
281144 169966.0 -3.113832 0.585864 -5.399730 1.817092 -0.840618 -2.943548
281674 170348.0 1.991976 0.158476 -2.583441 0.408670 1.151147 -0.096695
V7 V8 V9 ... V20 V21 V22 \
263860 2.158228 -0.801932 2.032907 ... 0.375634 -0.420755 0.632763
249255 1.097691 -0.222299 -0.154412 ... -0.126150 -0.172224 -0.422991
187770 0.499359 0.542985 -0.628085 ... 0.202487 0.019352 -0.237784
106821 -0.022216 0.844541 -0.189338 ... -0.533993 -0.041783 -0.741824
205828 0.194742 0.871417 -0.286819 ... -0.278280 0.168760 0.479242
... ... ... ... ... ... ... ...
279863 -0.882850 0.697211 -2.064945 ... 1.252967 0.778584 -0.319189
280143 -1.413170 0.248525 -1.127396 ... 0.226138 0.370612 0.028234
280149 -2.234739 1.210158 -0.652250 ... 0.247968 0.751826 0.834108
281144 -2.208002 1.058733 -1.632333 ... 0.306271 0.583276 -0.269209
281674 0.223050 -0.068384 0.577829 ... -0.017652 -0.164350 -0.295135
V23 V24 V25 V26 V27 V28 Amount
263860 -0.268117 0.260142 0.838517 0.019231 -1.027425 0.277112 65.00
249255 -0.088608 0.008278 -0.309919 -0.543578 0.147403 0.160488 4.99
187770 -0.010109 -0.416879 0.388135 -0.408190 -0.083486 0.009354 115.84
106821 0.022315 0.325420 -0.305823 -0.101252 -0.338723 -0.090417 34.00
205828 -0.072601 -0.081796 -0.222106 0.560114 0.030049 0.046617 19.99
... ... ... ... ... ... ... ...
279863 0.639419 -0.294885 0.537503 0.788395 0.292680 0.147968 390.00
280143 -0.145640 -0.081049 0.521875 0.739467 0.389152 0.186637 0.76
280149 0.190944 0.032070 -0.739695 0.471111 0.385107 0.194361 77.89
281144 -0.456108 -0.183659 -0.328168 0.606116 0.884876 -0.253700 245.00
281674 -0.072173 -0.450261 0.313267 -0.289617 0.002988 -0.015309 42.53
[984 rows x 30 columns]
263860 0
249255 0
187770 0
106821 0
205828 0
..
279863 1
280143 1
280149 1
281144 1
281674 1
Name: Class, Length: 984, dtype: int64
In [ ]:
#DATA STANDARDIZATION USING STABDARDSCALER LIBRARY
In [39]:
#data standardization - downscaling the values in range between -1 to 1
localhost:8888/notebooks/Credit_Card_fraud_detection using ML.ipynb# 8/12
6/16/23, 8:41 PM Credit_Card_fraud_detection using ML - Jupyter Notebook
In [44]:
scaler=SS()
scaler.fit(X)
standard_X=scaler.fit_transform(X)
In [93]:
print(standard_X)
[[ 0.73322128 0.34335831 -0.42202697 ... 0.02103477 0.3256881
-0.30057369]
[ 0.68729364 0.44310778 -0.22495517 ... 0.04602693 0.3234414
-0.3384124 ]
[-0.87895937 0.09388704 -0.81630711 ... 0.27611502 0.44169748
-0.3681149 ]
...
[ 1.71298328 0.29695591 -0.16660809 ... 0.28169634 0.36660485
-0.1268709 ]
[ 1.7257525 -0.14023319 -0.30951551 ... 0.7777682 -0.68270955
0.46685937]
[ 1.73368394 0.77547162 -0.42251596 ... -0.09759741 -0.1244213
-0.25250253]]
In [45]:
#SPLITTING THE DATASET INTO TRAINING AND TESTING
In [46]:
x_train,x_test,y_train,y_test=train_test_split(standard_X,y,test_size=0.1,stratify=y,random_state=42)
In [92]:
print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)
(885, 30) (99, 30) (885,) (99,)
In [ ]:
#MODEL TRAINING - LOGISTIC REGRESSION
In [47]:
classifier=LogisticRegression()
classifier.fit(x_train,y_train)
Out[47]:
▾ LogisticRegression
LogisticRegression()
In [48]:
#CHECKING THE ACCURACY ON TRAINING DATA
In [49]:
train_acc=classifier.predict(x_train)
train_acc_score=accuracy_score(train_acc,y_train)
print("The Accuracy on training data is :",train_acc_score)
The Accuracy on training data is : 0.9468926553672317
localhost:8888/notebooks/Credit_Card_fraud_detection using ML.ipynb# 9/12
6/16/23, 8:41 PM Credit_Card_fraud_detection using ML - Jupyter Notebook
In [50]:
#PREDICT THE OUTPUTS USING TEST DATA ON THE MODEL
In [51]:
y_pred=classifier.predict(x_test)
In [52]:
print(y_pred)
[1 1 0 0 0 1 1 1 1 0 1 0 1 0 1 1 1 0 0 1 1 1 1 0 0 1 0 1 1 0 1 0 1 1 0 0 1
0 0 1 1 0 1 1 0 1 0 1 0 1 1 1 1 0 0 1 1 1 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0
0 0 1 0 0 0 1 0 1 1 0 1 0 0 1 1 1 0 0 0 0 1 1 0 1]
In [97]:
print(y_test)
198868 1
100928 0
163310 0
6707 0
269672 0
..
259266 0
30100 1
123238 1
127259 0
262560 1
Name: Class, Length: 99, dtype: int64
In [53]:
#CHECKING THE ACCURACY OF PREDICTED OUTPUTS BY THE MODEL
In [55]:
accuracy=accuracy_score(y_pred,y_test)
print("The Accuracy Score of the model is : ",accuracy)
The Accuracy Score of the model is : 0.9393939393939394
In [56]:
#LET US PREDICT THE LABLES OF UNKNOWN DATA IN THE DATASET
In [57]:
input_data=[166205.0,-1.359807134, -0.072781173,2.536346738,1.378155224,-0.33832077,
0.462387778,0.239598554,0.098697901,0.3637869,0.090794172,-0.551599533,
-0.617800856,-0.991389847,-0.311169354,1.468176972,-0.470400525,
0.207971242,0.02579058,0.40399296,0.251412098,-0.018306778,0.277837576,
-0.11047391,0.066928075,0.128539358,-0.189114844,0.133558377,-0.21053053,149.62]
In [58]:
#CHANGING THIS INPUT DATA INTO NUMPY ARRAY
In [60]:
input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped=input_data_as_numpy_array.reshape(1,-1)
localhost:8888/notebooks/Credit_Card_fraud_detection using ML.ipynb# 10/12
6/16/23, 8:41 PM Credit_Card_fraud_detection using ML - Jupyter Notebook
In [61]:
#STANDARDIZE THE RESHAPED ARRAY DATA
In [63]:
stand_input_data=scaler.transform(input_data_reshaped)
print(stand_input_data)
[[ 1.59794256 0.18874539 -0.52107157 0.98176529 -0.2739496 0.28883819
0.66683349 0.51291361 -0.04356215 0.7037809 0.65072108 -0.88932951
0.53775104 -0.89727889 0.68345496 1.53137371 0.45734066 0.58741521
0.47587055 0.03307182 0.04601447 -0.14486552 0.22806328 -0.07255594
0.17773751 0.14912712 -0.46094652 0.06586425 -0.54428761 0.17595024]]
C:\python39\lib\site-packages\sklearn\base.py:439: UserWarning: X does not have valid feature n
ames, but StandardScaler was fitted with feature names
warnings.warn(
In [64]:
#PREDICT THE USER OF THIS UNKNOWN DATA
In [69]:
prediction_label=classifier.predict(stand_input_data)
In [71]:
if prediction_label:
print("Fraud Transaction..............👎")
else:
print("Valid Transaction...............👍")
Valid Transaction...............👍
In [73]:
#-----------LET US FIT THE RandomForestClassifier ON THE SAME DATA AND CHECK THE ACCURACY AND RESLUT----------
In [77]:
rfc=RandomForestClassifier()
rfc.fit(x_train,y_train)
pred_rfc=rfc.predict(x_test)
print("Predicted lables using RFC\n",pred_rfc)
Predicted lables using RFC
[1 1 0 0 0 1 1 1 1 0 1 0 1 0 1 1 1 0 0 1 1 1 1 0 0 1 0 1 1 0 1 0 1 1 0 0 1
0 0 1 1 0 1 0 0 1 0 1 0 1 1 1 1 0 0 1 1 1 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0
0 0 1 0 0 0 1 0 1 1 0 1 0 0 1 1 1 0 0 0 0 1 1 0 1]
In [78]:
acc_score_rfc=accuracy_score(pred_rfc,y_test)
print("Accuracy Score using RFC is :",acc_score_rfc)
Accuracy Score using RFC is : 0.9292929292929293
In [79]:
prediction_label_rfc=rfc.predict(stand_input_data)
localhost:8888/notebooks/Credit_Card_fraud_detection using ML.ipynb# 11/12
6/16/23, 8:41 PM Credit_Card_fraud_detection using ML - Jupyter Notebook
In [80]:
if prediction_label_rfc:
print("Fraud Transaction..............👎")
else:
print("Valid Transaction...............👍")
Valid Transaction...............👍
In [ ]:
localhost:8888/notebooks/Credit_Card_fraud_detection using ML.ipynb# 12/12