Skip to content
Snippets Groups Projects
Commit eed5e315 authored by Konrad Firley's avatar Konrad Firley
Browse files

Delete notebook_2.ipynb -> Not needed

parent 62e255fa
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id:58fa7892 tags:
# 4. Modellierung
%% Cell type:markdown id:409bf0ce tags:
## 4.1 Import von relevanten Modulen
%% Cell type:code id:ce52edf1 tags:
``` python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
sns.set()
%matplotlib inline
```
%% Cell type:markdown id:2e9db144 tags:
## 4.2 Daten einlesen
%% Cell type:code id:83961ee0 tags:
``` python
data = pd.read_csv('dataset_dummies.csv') # file is generated in notebook_1
```
%% Cell type:code id:72081129 tags:
``` python
data.head()
```
%% Output
policy_csl_250/500 policy_csl_500/1000 insured_sex_MALE \
0 1 0 1
1 1 0 1
2 0 0 0
3 1 0 0
4 0 1 1
insured_education_level_College insured_education_level_High School \
0 0 0
1 0 0
2 0 0
3 0 0
4 0 0
insured_education_level_JD insured_education_level_MD \
0 0 1
1 0 1
2 0 0
3 0 0
4 0 0
insured_education_level_Masters insured_education_level_PhD \
0 0 0
1 0 0
2 0 1
3 0 1
4 0 0
insured_occupation_armed-forces ... capital-gains capital-loss \
0 0 ... 53300 0
1 0 ... 0 0
2 0 ... 35100 0
3 1 ... 48900 -62400
4 0 ... 66000 -46000
number_of_vehicles_involved bodily_injuries witnesses injury_claim \
0 1 1 2 6510
1 1 0 0 780
2 3 2 3 7700
3 1 1 2 6340
4 1 0 1 1300
property_claim vehicle_claim fraud_reported pct_paid_insurance
0 13020 52080 1 0.986035
1 780 3510 1 0.605523
2 3850 23100 0 0.942280
3 6340 50720 1 0.968454
4 650 4550 0 0.846154
[5 rows x 74 columns]
%% Cell type:markdown id:c139e7ce tags:
## 4.3 Datenvorbereitung für die Modellierung
%% Cell type:code id:a5c7e329 tags:
``` python
# Separate the label column from the predictor columns
features = data.drop(columns='fraud_reported')
target = data['fraud_reported']
```
%% Cell type:code id:bf93a421 tags:
``` python
# Split data in training and test datasets (20 % held out for testing, fixed seed so the split is repeatable)
# NOTE(review): the split is not stratified although fraud cases are the minority class — consider stratify=target
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=365)
```
%% Cell type:code id:92201392 tags:
``` python
x_train.head()
```
%% Output
policy_csl_250/500 policy_csl_500/1000 insured_sex_MALE \
908 1 0 1
591 0 1 0
836 0 0 0
145 0 0 0
606 0 1 0
insured_education_level_College insured_education_level_High School \
908 0 0
591 0 0
836 0 0
145 0 0
606 0 0
insured_education_level_JD insured_education_level_MD \
908 0 1
591 0 0
836 1 0
145 0 0
606 0 0
insured_education_level_Masters insured_education_level_PhD \
908 0 0
591 0 0
836 0 0
145 0 0
606 0 0
insured_occupation_armed-forces ... umbrella_limit capital-gains \
908 0 ... 0 52600
591 1 ... 0 0
836 0 ... 0 52100
145 0 ... 0 0
606 0 ... 0 0
capital-loss number_of_vehicles_involved bodily_injuries witnesses \
908 0 1 1 0
591 0 1 2 1
836 0 1 0 1
145 -57900 1 2 1
606 -66200 1 0 3
injury_claim property_claim vehicle_claim pct_paid_insurance
908 500 500 4500 0.636364
591 7270 21810 50890 0.993748
836 21330 7110 56880 0.988279
145 7640 15280 76400 0.994966
606 5750 5750 46000 0.982609
[5 rows x 73 columns]
%% Cell type:code id:20040790 tags:
``` python
# Standardise the features: the scaler is fitted on the training split only
# and the same fitted transformation is then applied to the test split
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
```
%% Cell type:code id:64342d4d tags:
``` python
# distribution of target in train data
y_train.value_counts()
```
%% Output
0 596
1 204
Name: fraud_reported, dtype: int64
%% Cell type:code id:5ee57584 tags:
``` python
# distribution of target in test data
y_test.value_counts()
```
%% Output
0 157
1 43
Name: fraud_reported, dtype: int64
%% Cell type:markdown id:73297e79 tags:
## 4.4 Modellierung und Evaluation
%% Cell type:markdown id:c9b07ea0 tags:
### 4.4.1 Logistische Regression
%% Cell type:code id:6b330394 tags:
``` python
logreg = LogisticRegression()
logreg.fit(x_train, y_train)
```
%% Output
LogisticRegression()
%% Cell type:code id:fa3ef374 tags:
``` python
# train data
print(classification_report(y_train, logreg.predict(x_train)))
```
%% Output
precision recall f1-score support
0 0.92 0.93 0.92 596
1 0.79 0.75 0.77 204
accuracy 0.89 800
macro avg 0.85 0.84 0.85 800
weighted avg 0.88 0.89 0.88 800
%% Cell type:code id:86ea265b tags:
``` python
# Train data: predict once and reuse the result for all three metrics (printed in percent)
logreg_train_pred = logreg.predict(x_train)
print('Accuracy:', accuracy_score(y_train, logreg_train_pred)*100)
print('Precision:', precision_score(y_train, logreg_train_pred)*100)
print('Recall:', recall_score(y_train, logreg_train_pred)*100)
```
%% Output
Accuracy: 88.5
Precision: 78.8659793814433
Recall: 75.0
%% Cell type:code id:172d0b95 tags:
``` python
# test data
print(classification_report(y_test, logreg.predict(x_test)))
```
%% Output
precision recall f1-score support
0 0.91 0.91 0.91 157
1 0.67 0.65 0.66 43
accuracy 0.85 200
macro avg 0.79 0.78 0.78 200
weighted avg 0.85 0.85 0.85 200
%% Cell type:code id:8a1e6fd9 tags:
``` python
# Test data: predict once and reuse the result for all three metrics (printed in percent)
logreg_test_pred = logreg.predict(x_test)
print('Accuracy:', accuracy_score(y_test, logreg_test_pred)*100)
print('Precision:', precision_score(y_test, logreg_test_pred)*100)
print('Recall:', recall_score(y_test, logreg_test_pred)*100)
```
%% Output
Accuracy: 85.5
Precision: 66.66666666666666
Recall: 65.11627906976744
%% Cell type:code id:c48c8e65 tags:
``` python
tn, fp, fn, tp = confusion_matrix(y_test, logreg.predict(x_test)).ravel()
print(tn, fp, fn, tp)
```
%% Output
143 14 15 28
%% Cell type:code id:584079bf tags:
``` python
# confusion_matrix puts the true labels on the rows and the predicted labels
# on the columns, so the heatmap's y-axis is "real" and its x-axis "predicted"
cm = confusion_matrix(y_test, logreg.predict(x_test))
sns.heatmap(cm, annot=True, cmap='terrain', fmt='g')
plt.xlabel('Predicted data')
plt.ylabel('Real data')
plt.show()
```
%% Output
%% Cell type:code id:f96ea8be tags:
``` python
logreg.intercept_
```
%% Output
array([-1.87199801])
%% Cell type:code id:2950b467 tags:
``` python
logreg.coef_
```
%% Output
array([[ 5.80290586e-02, -2.37429629e-01, -3.18975936e-02,
1.03264901e-01, 1.57395846e-02, 1.54412249e-01,
9.98550038e-02, 1.01734974e-01, 1.65847423e-01,
7.04430787e-02, 2.21983720e-01, 2.75034627e-01,
6.08030524e-02, -8.57181314e-02, 6.39780307e-02,
-1.53624306e-01, -4.61802683e-02, 1.51000581e-01,
-5.51566389e-02, 5.90062519e-02, -8.83889618e-03,
-8.20083809e-03, -1.42216723e-01, 1.90073855e-02,
-2.16204990e-01, -4.62653155e-01, 7.95035036e-01,
5.66640168e-01, -3.05200290e-01, -2.31887634e-01,
-8.43891138e-02, -9.43983125e-02, -3.03973994e-01,
-9.70096772e-02, -2.00177795e-01, -5.26322421e-02,
-1.90789105e-02, -1.19299001e-01, -2.22788639e-01,
8.18201612e-03, 2.03631238e-01, 3.87350193e-01,
4.07693675e-01, -2.79170918e-02, 2.25397475e-01,
1.85538456e-01, -1.95996214e-01, -1.87436352e-01,
-2.87031385e-01, 1.41595895e-01, -1.11191352e-01,
-1.74200227e+00, -1.66185713e+00, -1.21492255e+00,
1.24360304e-01, 1.64944689e-01, 2.58711282e-01,
3.23758889e-01, 1.30618426e-03, -1.22778321e-01,
-1.01884582e-02, 5.54932851e-02, 5.82757827e-02,
3.61968828e-01, -1.20848644e-01, -2.42543595e-01,
-1.93472781e-01, 8.04217694e-02, 5.86457472e-02,
-1.71694568e-01, -3.64878309e-02, 1.53029355e-01,
3.50253316e-03]])
%% Cell type:markdown id:5c21a230 tags:
### 4.4.2 Entscheidungsbaum
%% Cell type:code id:3c2caa28 tags:
``` python
# Fit a decision tree with default hyperparameters on the scaled training data
# NOTE(review): no random_state is passed, so the fitted tree (and the scores below) may differ between runs — consider fixing a seed
tree = DecisionTreeClassifier()
tree.fit(x_train, y_train)
```
%% Output
DecisionTreeClassifier()
%% Cell type:code id:d5102a31 tags:
``` python
# train data
print(classification_report(y_train, tree.predict(x_train)))
```
%% Output
precision recall f1-score support
0 1.00 1.00 1.00 596
1 1.00 1.00 1.00 204
accuracy 1.00 800
macro avg 1.00 1.00 1.00 800
weighted avg 1.00 1.00 1.00 800
%% Cell type:code id:dc4b1761 tags:
``` python
# Train data: predict once and reuse the result for all three metrics (printed in percent)
tree_train_pred = tree.predict(x_train)
print('Accuracy:', accuracy_score(y_train, tree_train_pred)*100)
print('Precision:', precision_score(y_train, tree_train_pred)*100)
print('Recall:', recall_score(y_train, tree_train_pred)*100)
```
%% Output
Accuracy: 100.0
Precision: 100.0
Recall: 100.0
%% Cell type:code id:fff555ef tags:
``` python
# test data
print(classification_report(y_test, tree.predict(x_test)))
```
%% Output
precision recall f1-score support
0 0.83 0.85 0.84 157
1 0.39 0.35 0.37 43
accuracy 0.74 200
macro avg 0.61 0.60 0.61 200
weighted avg 0.73 0.74 0.74 200
%% Cell type:code id:afdad593 tags:
``` python
# Test data: predict once and reuse the result for all three metrics (printed in percent)
tree_test_pred = tree.predict(x_test)
print('Accuracy:', accuracy_score(y_test, tree_test_pred)*100)
print('Precision:', precision_score(y_test, tree_test_pred)*100)
print('Recall:', recall_score(y_test, tree_test_pred)*100)
```
%% Output
Accuracy: 74.5
Precision: 39.473684210526315
Recall: 34.883720930232556
%% Cell type:code id:44062e93 tags:
``` python
# confusion_matrix puts the true labels on the rows and the predicted labels
# on the columns, so the heatmap's y-axis is "real" and its x-axis "predicted"
cm = confusion_matrix(y_test, tree.predict(x_test))
sns.heatmap(cm, annot=True, cmap='terrain', fmt='g')
plt.xlabel('Predicted data')
plt.ylabel('Real data')
plt.show()
```
%% Output
%% Cell type:markdown id:e867e082 tags:
### 4.4.3 Random Forest
%% Cell type:code id:fbd0cd13 tags:
``` python
# Fit a random forest with default hyperparameters on the scaled training data
# NOTE(review): no random_state is passed, so the fitted forest (and the scores below) may differ between runs — consider fixing a seed
forest = RandomForestClassifier()
forest.fit(x_train, y_train)
```
%% Output
RandomForestClassifier()
%% Cell type:code id:37a0c75e tags:
``` python
# train data
print(classification_report(y_train, forest.predict(x_train)))
```
%% Output
precision recall f1-score support
0 1.00 1.00 1.00 596
1 1.00 1.00 1.00 204
accuracy 1.00 800
macro avg 1.00 1.00 1.00 800
weighted avg 1.00 1.00 1.00 800
%% Cell type:code id:4192d83b tags:
``` python
# Train data: predict once and reuse the result for all three metrics (printed in percent)
forest_train_pred = forest.predict(x_train)
print('Accuracy:', accuracy_score(y_train, forest_train_pred)*100)
print('Precision:', precision_score(y_train, forest_train_pred)*100)
print('Recall:', recall_score(y_train, forest_train_pred)*100)
```
%% Output
Accuracy: 100.0
Precision: 100.0
Recall: 100.0
%% Cell type:code id:1461c7a0 tags:
``` python
# test data
print(classification_report(y_test, forest.predict(x_test)))
```
%% Output
precision recall f1-score support
0 0.79 0.94 0.85 157
1 0.23 0.07 0.11 43
accuracy 0.75 200
macro avg 0.51 0.50 0.48 200
weighted avg 0.67 0.75 0.69 200
%% Cell type:code id:04b758a6 tags:
``` python
# Test data: predict once and reuse the result for all three metrics (printed in percent)
forest_test_pred = forest.predict(x_test)
print('Accuracy:', accuracy_score(y_test, forest_test_pred)*100)
print('Precision:', precision_score(y_test, forest_test_pred)*100)
print('Recall:', recall_score(y_test, forest_test_pred)*100)
```
%% Output
Accuracy: 75.0
Precision: 23.076923076923077
Recall: 6.976744186046512
%% Cell type:code id:1c713122 tags:
``` python
# confusion_matrix puts the true labels on the rows and the predicted labels
# on the columns, so the heatmap's y-axis is "real" and its x-axis "predicted"
cm = confusion_matrix(y_test, forest.predict(x_test))
sns.heatmap(cm, annot=True, cmap='terrain', fmt='g')
plt.xlabel('Predicted data')
plt.ylabel('Real data')
plt.show()
```
%% Output
%% Cell type:markdown id:d4e8e96f tags:
### 4.4.4 Support Vector Machine
%% Cell type:code id:4bd111b7 tags:
``` python
svc = SVC()
svc.fit(x_train, y_train)
```
%% Output
SVC()
%% Cell type:code id:57b9a951 tags:
``` python
# train data
print(classification_report(y_train, svc.predict(x_train)))
```
%% Output
precision recall f1-score support
0 0.92 0.99 0.95 596
1 0.95 0.75 0.84 204
accuracy 0.93 800
macro avg 0.94 0.87 0.90 800
weighted avg 0.93 0.93 0.92 800
%% Cell type:code id:83962b23 tags:
``` python
# Train data: predict once and reuse the result for all three metrics (printed in percent)
svc_train_pred = svc.predict(x_train)
print('Accuracy:', accuracy_score(y_train, svc_train_pred)*100)
print('Precision:', precision_score(y_train, svc_train_pred)*100)
print('Recall:', recall_score(y_train, svc_train_pred)*100)
```
%% Output
Accuracy: 92.625
Precision: 95.03105590062113
Recall: 75.0
%% Cell type:code id:ffe4ac08 tags:
``` python
# test data
print(classification_report(y_test, svc.predict(x_test)))
```
%% Output
precision recall f1-score support
0 0.85 0.99 0.91 157
1 0.89 0.37 0.52 43
accuracy 0.85 200
macro avg 0.87 0.68 0.72 200
weighted avg 0.86 0.85 0.83 200
%% Cell type:code id:99f76a05 tags:
``` python
# Test data: predict once and reuse the result for all three metrics (printed in percent)
svc_test_pred = svc.predict(x_test)
print('Accuracy:', accuracy_score(y_test, svc_test_pred)*100)
print('Precision:', precision_score(y_test, svc_test_pred)*100)
print('Recall:', recall_score(y_test, svc_test_pred)*100)
```
%% Output
Accuracy: 85.5
Precision: 88.88888888888889
Recall: 37.2093023255814
%% Cell type:code id:f7ea05f4 tags:
``` python
# confusion_matrix puts the true labels on the rows and the predicted labels
# on the columns, so the heatmap's y-axis is "real" and its x-axis "predicted"
cm = confusion_matrix(y_test, svc.predict(x_test))
sns.heatmap(cm, annot=True, cmap='terrain', fmt='g')
plt.xlabel('Predicted data')
plt.ylabel('Real data')
plt.show()
```
%% Output
%% Cell type:markdown id:d9d7678f tags:
# 5. Deployment
%% Cell type:code id:6c6ba1e6 tags:
``` python
# Select one scaled person of the dataset
sample_df = x_test[72]
```
%% Cell type:code id:6b73ff09 tags:
``` python
# Features of the selected sample
sample_df
```
%% Output
array([ 1.35903462, -0.65270587, 1.10833761, -0.37363236, -0.42828957,
2.30200187, -0.41181385, -0.40137644, -0.38655567, -0.27958383,
-0.30478874, -0.28730468, -0.24413654, -0.24124895, -0.31207962,
-0.26636529, -0.26636529, -0.30478874, -0.26636529, -0.28217394,
-0.28984624, 3.79270555, -0.18328047, -0.22941573, -0.24983394,
-0.24124895, -0.22331316, 5.06622805, -0.21707238, -0.24699789,
-0.23241869, -0.24413654, -0.23833416, -0.23833416, -0.23833416,
-0.22021079, -0.25264558, -0.22021079, -0.19044535, -0.2353911 ,
-0.24983394, 2.19986728, -0.47248449, -0.46255869, -0.40973554,
-0.43033148, -0.27958383, -0.82502865, -0.31926223, -0.91370804,
-0.6352234 , -0.74390729, 1.60356745, -0.29488391, -0.51752183,
-0.29738086, -0.50780078, -0.65660263, -0.6644106 , -0.67419986,
-1.6511054 , -1.04810348, 0.18475885, -0.48560679, -0.92537512,
0.963709 , 1.11630666, -1.18253256, 0.45167913, 0.85886085,
0.85043965, 0.74218584, 0.10204472])
%% Cell type:code id:05ec844f tags:
``` python
# Execute prediction with the fitted SVC
# The single sample is wrapped in a list so that a batch containing one row
# is passed to predict (presumably because predict expects 2-D input — confirm)
sample_pred = svc.predict([sample_df])
```
%% Cell type:code id:ac328f85 tags:
``` python
# Interpret the result
def check_prediction(pred):
    """Print a human-readable verdict for a single prediction.

    Args:
        pred: Indexable prediction result (for example the array returned
            by a classifier's ``predict`` for one sample). Only the first
            element is inspected.

    Prints ``"Fraud."`` when the first element equals 1 and ``"No Fraud."``
    for any other value. Nothing is returned.
    """
    if pred[0] == 1:
        print("Fraud.")
    else:
        print("No Fraud.")
```
%% Cell type:code id:fc24f4ee tags:
``` python
# Call the prediction check function
check_prediction(sample_pred)
```
%% Output
Fraud.
%% Cell type:code id:c91f2802 tags:
``` python
```
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please to comment