knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
library(knitr)
library(dplyr)
library(readxl)
library(lubridate)
library(tidyr)
library(openxlsx)
library(stringr)
library(plotly)
library(RColorBrewer)
library(rpart.plot)
library(rpart)
library(caret)
library(stats)
library(nnet)
library(dummies)
library(randomForest)
library(skimr)
library(ipred)
library(gbm)
library(FNN)
library(rmdformats)
library(aplpack)
library(psych)
library(tidyr)
library(corrplot)
library(tibble)
library(kableExtra)
library(VIM)
library(mice)
library(car)
library(GGally)
library(devtools)
library(paletteer)
colores= c("#CD6889","#528B8B", "#458B74","#838B8B","#53868B", "#8B475D", "#8B636C", "#8B8682", "#CD7054","#8B0000", "#E9967A", "#483D8B", "#2F4F4F","#F08080", "#A2B5CD", "#CDBE70", "#EEA2AD", "#B4CDCD")
BASE=read.csv("train_SJC.csv", stringsAsFactors = FALSE)
names(BASE)=c("ClaimNumber","DateTimeOfAccident", "DateReported", "Age", "Gender", "MaritalStatus" , "DependentChildren" ,"DependentsOther" , "WeeklyWages" ,"PartTimeFullTime" ,"HoursWorkedPerWeek", "DaysWorkedPerWeek", "ClaimDescription", "InitialIncurredCalimsCost", "UltimateIncurredClaimCost")
BASE=BASE[2:nrow(BASE),]
BASE = BASE %>% mutate(Age=as.numeric(Age),
DependentChildren=as.factor(as.numeric(DependentChildren)),
DependentsOther=as.factor(as.numeric(DependentsOther)),
DaysWorkedPerWeek=as.factor(as.numeric(DaysWorkedPerWeek)),
WeeklyWages=as.numeric(WeeklyWages),
HoursWorkedPerWeek=as.numeric(HoursWorkedPerWeek),
ClaimDescription=as.character(ClaimDescription),
InitialIncurredCalimsCost=as.numeric(InitialIncurredCalimsCost),
UltimateIncurredClaimCost=as.numeric(UltimateIncurredClaimCost),
DateTimeOfAccident=as.POSIXct(DateTimeOfAccident,
format="%Y-%m-%dT%H:%M:%S" ),
DateReported=as.POSIXct(DateReported, format="%Y-%m-%dT%H:%M:%S"),
days_Report=round(as.numeric(difftime(DateReported,DateTimeOfAccident,
units = "days")))) %>%
filter(days_Report>=0,
MaritalStatus != "",
Gender!="U")
summary(BASE)
## ClaimNumber DateTimeOfAccident DateReported
## Length:36146 Min. :1988-01-01 09:00:00 Min. :1988-01-10 00:00:00
## Class :character 1st Qu.:1992-07-03 12:15:00 1st Qu.:1992-08-10 00:00:00
## Mode :character Median :1996-12-28 07:00:00 Median :1997-02-10 00:00:00
## Mean :1996-12-30 15:58:37 Mean :1997-02-07 06:58:21
## 3rd Qu.:2001-07-05 07:15:00 3rd Qu.:2001-08-20 18:00:00
## Max. :2005-12-31 10:00:00 Max. :2006-09-23 00:00:00
##
## Age Gender MaritalStatus DependentChildren
## Min. :13.00 Length:36146 Length:36146 0 :33868
## 1st Qu.:23.00 Class :character Class :character 2 : 924
## Median :32.00 Mode :character Mode :character 1 : 858
## Mean :33.79 3 : 353
## 3rd Qu.:43.00 4 : 103
## Max. :79.00 5 : 34
## (Other): 6
## DependentsOther WeeklyWages PartTimeFullTime HoursWorkedPerWeek
## 0:35828 Min. : 1.0 Length:36146 Min. : 0.00
## 1: 297 1st Qu.: 200.0 Class :character 1st Qu.: 38.00
## 2: 15 Median : 393.3 Mode :character Median : 38.00
## 3: 6 Mean : 416.4 Mean : 37.77
## 3rd Qu.: 500.0 3rd Qu.: 40.00
## Max. :7497.0 Max. :640.00
## NA's :56 NA's :49
## DaysWorkedPerWeek ClaimDescription InitialIncurredCalimsCost
## 1: 119 Length:36146 Min. : 1
## 2: 336 Class :character 1st Qu.: 700
## 3: 956 Mode :character Median : 2000
## 4: 995 Mean : 7748
## 5:32969 3rd Qu.: 9500
## 6: 563 Max. :830000
## 7: 208
## UltimateIncurredClaimCost days_Report
## Min. : 122 Min. : 0.00
## 1st Qu.: 926 1st Qu.: 13.00
## Median : 3376 Median : 22.00
## Mean : 10956 Mean : 38.68
## 3rd Qu.: 8189 3rd Qu.: 40.00
## Max. :4027136 Max. :1088.00
##
str(BASE)
## 'data.frame': 36146 obs. of 16 variables:
## $ ClaimNumber : chr "WC8205482" "WC6922469" "WC5442654" "WC9796897" ...
## $ DateTimeOfAccident : POSIXct, format: "2002-04-09 07:00:00" "1999-01-07 11:00:00" ...
## $ DateReported : POSIXct, format: "2002-07-05" "1999-01-20" ...
## $ Age : num 48 43 30 41 36 50 39 56 49 30 ...
## $ Gender : chr "M" "F" "M" "M" ...
## $ MaritalStatus : chr "M" "M" "U" "S" ...
## $ DependentChildren : Factor w/ 9 levels "0","1","2","3",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ DependentsOther : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ WeeklyWages : num 500 509 709 555 377 ...
## $ PartTimeFullTime : chr "F" "F" "F" "F" ...
## $ HoursWorkedPerWeek : num 38 37.5 38 38 38 38 38 40 38 37 ...
## $ DaysWorkedPerWeek : Factor w/ 7 levels "1","2","3","4",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ ClaimDescription : chr "LIFTING TYRE INJURY TO RIGHT ARM AND WRIST INJURY" "STEPPED AROUND CRATES AND TRUCK TRAY FRACTURE LEFT FOREARM" "CUT ON SHARP EDGE CUT LEFT THUMB" "DIGGING LOWER BACK LOWER BACK STRAIN" ...
## $ InitialIncurredCalimsCost: num 1500 5500 1700 15000 2800 500 500 500 925 1500 ...
## $ UltimateIncurredClaimCost: num 4748 6326 2294 17786 4014 ...
## $ days_Report : num 87 13 20 29 29 80 10 38 32 15 ...
kbl(summary(BASE[,!sapply(BASE,is.character)])) %>%
kable_paper(bootstrap_options = "striped", full_width = F)
DateTimeOfAccident | DateReported | Age | DependentChildren | DependentsOther | WeeklyWages | HoursWorkedPerWeek | DaysWorkedPerWeek | InitialIncurredCalimsCost | UltimateIncurredClaimCost | days_Report | |
---|---|---|---|---|---|---|---|---|---|---|---|
Min. :1988-01-01 09:00:00 | Min. :1988-01-10 00:00:00 | Min. :13.00 | 0 :33868 | 0:35828 | Min. : 1.0 | Min. : 0.00 | 1: 119 | Min. : 1 | Min. : 122 | Min. : 0.00 | |
1st Qu.:1992-07-03 12:15:00 | 1st Qu.:1992-08-10 00:00:00 | 1st Qu.:23.00 | 2 : 924 | 1: 297 | 1st Qu.: 200.0 | 1st Qu.: 38.00 | 2: 336 | 1st Qu.: 700 | 1st Qu.: 926 | 1st Qu.: 13.00 | |
Median :1996-12-28 07:00:00 | Median :1997-02-10 00:00:00 | Median :32.00 | 1 : 858 | 2: 15 | Median : 393.3 | Median : 38.00 | 3: 956 | Median : 2000 | Median : 3376 | Median : 22.00 | |
Mean :1996-12-30 15:58:37 | Mean :1997-02-07 06:58:21 | Mean :33.79 | 3 : 353 | 3: 6 | Mean : 416.4 | Mean : 37.77 | 4: 995 | Mean : 7748 | Mean : 10956 | Mean : 38.68 | |
3rd Qu.:2001-07-05 07:15:00 | 3rd Qu.:2001-08-20 18:00:00 | 3rd Qu.:43.00 | 4 : 103 | NA | 3rd Qu.: 500.0 | 3rd Qu.: 40.00 | 5:32969 | 3rd Qu.: 9500 | 3rd Qu.: 8189 | 3rd Qu.: 40.00 | |
Max. :2005-12-31 10:00:00 | Max. :2006-09-23 00:00:00 | Max. :79.00 | 5 : 34 | NA | Max. :7497.0 | Max. :640.00 | 6: 563 | Max. :830000 | Max. :4027136 | Max. :1088.00 | |
NA | NA | NA | (Other): 6 | NA | NA’s :56 | NA’s :49 | 7: 208 | NA | NA | NA |
sapply(BASE[,sapply(BASE,is.character)],n_distinct)
## ClaimNumber Gender MaritalStatus PartTimeFullTime
## 29438 2 3 2
## ClaimDescription
## 20581
sapply(select(BASE, c("MaritalStatus", "Gender", "PartTimeFullTime")),table)
## $MaritalStatus
##
## M S U
## 15159 17448 3539
##
## $Gender
##
## F M
## 8250 27896
##
## $PartTimeFullTime
##
## F P
## 32887 3259
aggr(BASE[,1:15],
#col= c('blue', 'red'), #Cambiar colores de visualización
numbers = TRUE, #incorporar en que porcentajes nos encontramos
sortVars = TRUE,#ordena las variables faltantes de mayor a menor
labels = names(BASE[,1:15]), #colocar todas las etiquetas de las variables
cex.axis = 0.5,#tamaño de la fuente de las ejes
gap = 1, #para que el "hueco" entre las graficas se disminulla
ylab = c("Histograma de NAs", "Patrón"),# nombre de los ejes y de los graficas se puede hacer lo mismo para el eje de las x con "xlab"
)
##
## Variables sorted by number of missings:
## Variable Count
## WeeklyWages 0.001549272
## HoursWorkedPerWeek 0.001355613
## ClaimNumber 0.000000000
## DateTimeOfAccident 0.000000000
## DateReported 0.000000000
## Age 0.000000000
## Gender 0.000000000
## MaritalStatus 0.000000000
## DependentChildren 0.000000000
## DependentsOther 0.000000000
## PartTimeFullTime 0.000000000
## DaysWorkedPerWeek 0.000000000
## ClaimDescription 0.000000000
## InitialIncurredCalimsCost 0.000000000
## UltimateIncurredClaimCost 0.000000000
BASE =BASE %>% mutate(A.OCURRENCIA=factor(as.numeric(format(DateTimeOfAccident,'%Y'))),
M.OCURRENCIA=factor(as.numeric(format(DateTimeOfAccident,'%m')))) %>%
select(-c("ClaimNumber", "DateReported", "ClaimDescription", "InitialIncurredCalimsCost","DateTimeOfAccident") ) %>% na.omit()
apply(X = is.na(BASE), MARGIN = 2, FUN = sum)
## Age Gender MaritalStatus
## 0 0 0
## DependentChildren DependentsOther WeeklyWages
## 0 0 0
## PartTimeFullTime HoursWorkedPerWeek DaysWorkedPerWeek
## 0 0 0
## UltimateIncurredClaimCost days_Report A.OCURRENCIA
## 0 0 0
## M.OCURRENCIA
## 0
plot_ly(data = BASE,
type = "box",
y = ~UltimateIncurredClaimCost,name="Costo del siniestro"
)
BASE=BASE %>% filter(UltimateIncurredClaimCost<max(UltimateIncurredClaimCost))
plot_ly(data = BASE,
type = "box",
y = ~UltimateIncurredClaimCost, name="Costo del siniestro"
)
summary(BASE)
## Age Gender MaritalStatus DependentChildren
## Min. :13.00 Length:36040 Length:36040 0 :33769
## 1st Qu.:23.00 Class :character Class :character 2 : 922
## Median :32.00 Mode :character Mode :character 1 : 856
## Mean :33.79 3 : 351
## 3rd Qu.:43.00 4 : 102
## Max. :79.00 5 : 34
## (Other): 6
## DependentsOther WeeklyWages PartTimeFullTime HoursWorkedPerWeek
## 0:35722 Min. : 1.0 Length:36040 Min. : 0.00
## 1: 297 1st Qu.: 200.0 Class :character 1st Qu.: 38.00
## 2: 15 Median : 393.3 Mode :character Median : 38.00
## 3: 6 Mean : 416.4 Mean : 37.77
## 3rd Qu.: 500.0 3rd Qu.: 40.00
## Max. :7497.0 Max. :640.00
##
## DaysWorkedPerWeek UltimateIncurredClaimCost days_Report A.OCURRENCIA
## 1: 119 Min. : 121.9 Min. : 0.00 2001 : 2071
## 2: 335 1st Qu.: 926.5 1st Qu.: 13.00 1994 : 2043
## 3: 953 Median : 3373.1 Median : 22.00 2004 : 2028
## 4: 992 Mean : 10851.0 Mean : 38.66 1997 : 2025
## 5:32871 3rd Qu.: 8188.4 3rd Qu.: 40.00 1988 : 2024
## 6: 562 Max. :865770.6 Max. :1088.00 1996 : 2018
## 7: 208 (Other):23831
## M.OCURRENCIA
## 5 : 3285
## 3 : 3177
## 8 : 3135
## 10 : 3125
## 11 : 3093
## 7 : 3077
## (Other):17148
BASE2=BASE %>% select("Age","Gender","MaritalStatus" ,"DependentChildren", "DependentsOther", "WeeklyWages","PartTimeFullTime","HoursWorkedPerWeek" , "DaysWorkedPerWeek","A.OCURRENCIA","M.OCURRENCIA","UltimateIncurredClaimCost", "days_Report" ) %>% na.omit()
str(BASE2)
## 'data.frame': 36040 obs. of 13 variables:
## $ Age : num 48 43 30 41 36 50 39 56 49 30 ...
## $ Gender : chr "M" "F" "M" "M" ...
## $ MaritalStatus : chr "M" "M" "U" "S" ...
## $ DependentChildren : Factor w/ 9 levels "0","1","2","3",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ DependentsOther : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ WeeklyWages : num 500 509 709 555 377 ...
## $ PartTimeFullTime : chr "F" "F" "F" "F" ...
## $ HoursWorkedPerWeek : num 38 37.5 38 38 38 38 38 40 38 37 ...
## $ DaysWorkedPerWeek : Factor w/ 7 levels "1","2","3","4",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ A.OCURRENCIA : Factor w/ 18 levels "1988","1989",..: 15 12 9 18 3 12 14 13 7 18 ...
## $ M.OCURRENCIA : Factor w/ 12 levels "1","2","3","4",..: 4 1 3 6 8 6 7 3 3 12 ...
## $ UltimateIncurredClaimCost: num 4748 6326 2294 17786 4014 ...
## $ days_Report : num 87 13 20 29 29 80 10 38 32 15 ...
## - attr(*, "na.action")= 'omit' Named int [1:105] 237 387 443 492 536 589 623 629 630 637 ...
## ..- attr(*, "names")= chr [1:105] "237" "387" "443" "492" ...
pairs(BASE2[,sapply(BASE2,is.numeric)],col ="cadetblue" ,
pch=19, cex=0.5, upper.panel=NULL, cex.labels=1, main='Matriz de dispersión', las=1 )
pairs(BASE2[,sapply(BASE2,is.numeric)] %>% filter(HoursWorkedPerWeek<=100),col ="cadetblue" ,
pch=19, cex=0.5, upper.panel=NULL, cex.labels=1, main='Matriz de dispersión las horas semanales trabajadas <100', las=1 )
corrplot.mixed(cor(BASE2[,sapply(BASE2,is.numeric)]), lower.col = "black", number.cex = .7,
diag = "u", tl.pos="lt" )
columnas.num=as.vector(which(sapply(BASE2,is.numeric)))
fig_c=c()
for(i in 1:length(columnas.num)){
BASE_FIGURAS=BASE2[,sapply(BASE2,is.numeric)]# %>% filter(HoursWorkedPerWeek<=100)
nombre=names(BASE_FIGURAS[i])
BASE_FIGURAS=rename(BASE_FIGURAS,i = colnames (BASE_FIGURAS) [i])
fig_c[i] <- BASE_FIGURAS %>% plot_ly(
y = ~i,
type = 'violin',
box = list(
visible = T
),
meanline = list(
visible = T
),
x0 = nombre
) %>%
layout(
yaxis = list(
title = "",
zeroline = F
)
)
}
fig_c=Filter(Negate(is.null), fig_c)
subplot(fig_c, nrows = round(length(fig_c)/2,0))%>%
layout( title = list(text = "gráfico de violin"))
columnas.char=as.vector(which(sapply(BASE2,is.numeric)))
fig_c=c()
for(i in columnas.char){
BASE_FIGURAS=BASE2
BASE_FIGURAS=rename(BASE_FIGURAS,i = colnames (BASE_FIGURAS) [i])
BASE_fig=BASE_FIGURAS %>% group_by(A.OCURRENCIA) %>% summarise(promedio= round(mean(i, na.rm = T),0))
fig_c[i]= plot_ly(BASE_fig,
x =~A.OCURRENCIA, y =~promedio, type = 'scatter', mode = 'lines',
name = names(BASE2)[i],
line = list(color = colores[i], width = 2), text =~promedio, textposition = 'auto' ) %>%
layout(title=names(BASE_FIGURAS)[i], yaxis = list(title = 'Conteo'), barmode = 'group')
}
fig_c=Filter(Negate(is.null), fig_c)
subplot(fig_c, nrows = round(length(fig_c)/2,0))%>%
layout( title = list(text = "Promedio por años de ocurrencia"))
columnas.char=as.vector(which(sapply(BASE2,is.numeric)))
columnas.char=columnas.char[-1]
fig_c=c()
for(i in columnas.char){
BASE_FIGURAS=BASE2
BASE_FIGURAS=rename(BASE_FIGURAS,i = colnames (BASE_FIGURAS) [i])
BASE_fig=BASE_FIGURAS %>% group_by(Age) %>% summarise(promedio= round(mean(i, na.rm = T),0))
fig_c[i]= plot_ly(BASE_fig,
x =~Age, y =~promedio, type = 'scatter', mode = 'lines',
name = names(BASE2)[i],
line = list(color = colores[i], width = 2), text =~promedio, textposition = 'auto' ) %>%
layout(title=names(BASE_FIGURAS)[i], yaxis = list(title = 'Conteo'), barmode = 'group')
}
fig_c=Filter(Negate(is.null), fig_c)
subplot(fig_c, nrows = round(length(fig_c)/2,0))%>%
layout( title = list(text = "Promedio por edad"))
columnas.char=as.vector(which(sapply(BASE2,is.numeric)))
columnas.char=columnas.char[-1]
fig_c=c()
for(i in columnas.char){
BASE_FIGURAS=BASE2
BASE_FIGURAS=rename(BASE_FIGURAS,i = colnames (BASE_FIGURAS) [i])
BASE_fig=BASE_FIGURAS %>% group_by(M.OCURRENCIA) %>% summarise(promedio= round(mean(i, na.rm = T),0))
fig_c[i]= plot_ly(BASE_fig,
x =~M.OCURRENCIA, y =~promedio, type = 'scatter', mode = 'lines',
name = names(BASE2)[i],
line = list(color = colores[i], width = 2), text =~promedio, textposition = 'auto' ) %>%
layout(title=names(BASE_FIGURAS)[i], yaxis = list(title = 'Conteo'), barmode = 'group')
}
fig_c=Filter(Negate(is.null), fig_c)
subplot(fig_c, nrows = round(length(fig_c)/2,0))%>%
layout( title = list(text = "Promedio por meses de ocurrencia"))
BASE_FIGURAS=BASE2[,sapply(BASE2,is.numeric)] %>% select(-c("UltimateIncurredClaimCost"))
fig_c=c()
for(i in 1:ncol(BASE_FIGURAS)){
BASE_fig=BASE2 %>% group_by(BASE_FIGURAS[i]) %>% summarise(promedio= round(mean(UltimateIncurredClaimCost),0))
BASE_fig=rename(BASE_fig,i = colnames (BASE_fig) [1])
fig_c[i]= plot_ly(BASE_fig,
x =~i, y =~promedio, type = 'scatter', mode = 'lines',
name = names(BASE_FIGURAS)[i],
line = list(color = colores[length(fig_c)+6], width = 2), text =~promedio, textposition = 'auto' ) %>%
layout(title=names(BASE_FIGURAS)[i])
}
subplot(fig_c, nrows = round(length(fig_c)/2,0))%>%
layout( title = list(text = "Costo medio de las reclamaciones"))
BASE_FIGURAS=BASE2[,!sapply(BASE2,is.numeric)]
fig_c=c()
for(i in 1:ncol(BASE_FIGURAS)){
BASE_fig=as.data.frame(table(BASE_FIGURAS[,i]))
BASE_fig=rename(BASE_fig,i = colnames (BASE_fig) [1])
fig_c[i]= plot_ly(BASE_fig,
x =~i, y =~Freq, type = 'bar', name = names(BASE_FIGURAS)[i],
marker = list(color =colores), text =~Freq, textposition = 'auto' ) %>%
layout(title=names(BASE_FIGURAS)[i], yaxis = list(title = 'Conteo'), barmode = 'group')
}
subplot(fig_c, nrows = round(length(fig_c)/3,0))%>%
layout( title = list(text = "Gráfico de frecuencias"))
BASE_FIGURAS=BASE2[,!sapply(BASE2,is.numeric)]
BASE_FIGURAS=BASE_FIGURAS[,-c(ncol(BASE_FIGURAS),ncol(BASE_FIGURAS)-1)]
fig_c=c()
for(i in 1:ncol(BASE_FIGURAS)){
BASE_fig=BASE2 %>% group_by(BASE_FIGURAS[i]) %>% summarise(promedio= round(mean(UltimateIncurredClaimCost),0))
BASE_fig=rename(BASE_fig,i = colnames (BASE_fig) [1])
fig_c[i]= plot_ly(BASE_fig,
x =~i, y =~promedio, type = 'bar', name = names(BASE_FIGURAS)[i],
marker = list(color =colores), text =~promedio, textposition = 'auto' ) %>%
layout(title=names(BASE_FIGURAS)[i], yaxis = list(title = 'Conteo'), barmode = 'group')
}
BASE_FIGURAS=BASE2[,!sapply(BASE2,is.numeric)]
BASE_FIGURAS=BASE_FIGURAS[,c(ncol(BASE_FIGURAS),ncol(BASE_FIGURAS)-1)]
for(i in 1:ncol(BASE_FIGURAS)){
BASE_fig=BASE2 %>% group_by(BASE_FIGURAS[i]) %>% summarise(promedio= round(mean(UltimateIncurredClaimCost),0))
BASE_fig=rename(BASE_fig,i = colnames (BASE_fig) [1])
fig_c[length(fig_c)+1]= plot_ly(BASE_fig,
x =~i, y =~promedio, type = 'scatter', mode = 'lines',
name = names(BASE_FIGURAS)[i],
line = list(color = colores[length(fig_c)+6], width = 2), text =~promedio, textposition = 'auto' ) %>%
layout(title=names(BASE_FIGURAS)[i])
}
subplot(fig_c, nrows = round(length(fig_c)/3,0))%>%
layout( title = list(text = "Costo medio de las reclamaciones"))
BASE_FIGURAS=BASE2[,!sapply(BASE2,is.numeric)]
fig_c=c()
for(i in 1:ncol(BASE_FIGURAS)){
BASE_fig=BASE2 %>% group_by(BASE_FIGURAS[i]) %>% summarise(promedio= round(mean(WeeklyWages),0))
BASE_fig=rename(BASE_fig,i = colnames (BASE_fig) [1])
fig_c[i]= plot_ly(BASE_fig,
x =~i, y =~promedio, type = 'bar', name = names(BASE_FIGURAS)[i],
marker = list(color =colores), text =~promedio, textposition = 'auto' ) %>%
layout(title=names(BASE_FIGURAS)[i], yaxis = list(title = 'Conteo'), barmode = 'group')
}
subplot(fig_c, nrows = round(length(fig_c)/3,0))%>%
layout( title = list(text = "Salario promedio semanal"))
BASE3=BASE2 %>% select("Age","Gender","MaritalStatus" ,"DependentChildren", "DependentsOther", "WeeklyWages","PartTimeFullTime","HoursWorkedPerWeek" , "DaysWorkedPerWeek","A.OCURRENCIA","M.OCURRENCIA","UltimateIncurredClaimCost", "days_Report" ) %>% filter( HoursWorkedPerWeek<=112, DependentChildren %in% c(0:6) ) %>%
mutate(A.OCURRENCIA=as.numeric(as.character(A.OCURRENCIA)),
M.OCURRENCIA=as.numeric(M.OCURRENCIA),
DaysWorkedPerWeek=as.numeric(DaysWorkedPerWeek),
DependentChildren=as.numeric(DependentChildren),
DependentsOther=as.numeric(DependentsOther))
BASE3=dummy.data.frame(BASE3,names = c("Gender"), sep=".")
BASE3=dummy.data.frame(BASE3,names = c("MaritalStatus"), sep=".")
BASE3=dummy.data.frame(BASE3,names = c("PartTimeFullTime"), sep="." )
BASE3=BASE3 %>% select(-c("PartTimeFullTime.P","MaritalStatus.U", "Gender.F"))
columnas.num=as.vector(which(names(BASE3) %in% names(BASE2[,sapply(BASE2,is.numeric)])))
kable(summary(BASE3[,columnas.num]))
Age | WeeklyWages | HoursWorkedPerWeek | UltimateIncurredClaimCost | days_Report | |
---|---|---|---|---|---|
Min. :13.00 | Min. : 1.0 | Min. : 0.00 | Min. : 121.9 | Min. : 0.00 | |
1st Qu.:23.00 | 1st Qu.: 200.0 | 1st Qu.:38.00 | 1st Qu.: 925.7 | 1st Qu.: 13.00 | |
Median :32.00 | Median : 393.3 | Median :38.00 | Median : 3372.9 | Median : 22.00 | |
Mean :33.79 | Mean : 416.4 | Mean :37.49 | Mean : 10838.7 | Mean : 38.63 | |
3rd Qu.:43.00 | 3rd Qu.: 500.0 | 3rd Qu.:40.00 | 3rd Qu.: 8190.0 | 3rd Qu.: 40.00 | |
Max. :79.00 | Max. :7497.0 | Max. :93.00 | Max. :865770.6 | Max. :1088.00 |
columnas.num=as.vector(which(names(BASE3) %in% names(BASE2[,sapply(BASE2,is.numeric)])))
fig_c=c()
for(i in 1:length(columnas.num)){
BASE_FIGURAS=as.data.frame(BASE3[,columnas.num])# %>% filter(HoursWorkedPerWeek<=100)
nombre=names(BASE_FIGURAS[i])
BASE_FIGURAS=rename(BASE_FIGURAS,i = colnames (BASE_FIGURAS) [i])
fig_c[i] <-BASE_FIGURAS %>% plot_ly(
y = ~i,
type = 'violin',
box = list(
visible = T
),
meanline = list(
visible = T
),
x0 = nombre
) %>%
layout(
yaxis = list(
title = "",
zeroline = F
)
)
}
fig_c=Filter(Negate(is.null), fig_c)
subplot(fig_c, nrows =2)%>%
layout( title = list(text = ""))
BASE_FIGURAS=BASE2 %>% select("Age","Gender","MaritalStatus" ,"DependentChildren", "DependentsOther", "WeeklyWages","PartTimeFullTime","HoursWorkedPerWeek" , "DaysWorkedPerWeek","A.OCURRENCIA","M.OCURRENCIA","UltimateIncurredClaimCost", "days_Report" ) %>% filter( HoursWorkedPerWeek<=105, DependentChildren %in% c(0:6) )
BASE_FIGURAS=BASE_FIGURAS[,!sapply(BASE_FIGURAS,is.numeric)]
BASE_FIGURAS=BASE_FIGURAS[,-c(ncol(BASE_FIGURAS),ncol(BASE_FIGURAS)-1)]
fig_c=c()
for(i in 1:ncol(BASE_FIGURAS)){
BASE_fig=BASE3 %>% group_by(BASE_FIGURAS[i]) %>% summarise(promedio= round(mean(UltimateIncurredClaimCost),0))
BASE_fig=rename(BASE_fig,i = colnames (BASE_fig) [1])
fig_c[i]= plot_ly(BASE_fig,
x =~i, y =~promedio, type = 'bar', name = names(BASE_FIGURAS)[i],
marker = list(color =colores), text =~promedio, textposition = 'auto' ) %>%
layout(title=names(BASE_FIGURAS)[i], yaxis = list(title = 'Conteo'), barmode = 'group')
}
BASE_FIGURAS=BASE3 %>% select("A.OCURRENCIA","M.OCURRENCIA")
for(i in 1:ncol(BASE_FIGURAS)){
BASE_fig=BASE3 %>% group_by(BASE_FIGURAS[i]) %>% summarise(promedio= round(mean(UltimateIncurredClaimCost),0))
BASE_fig=rename(BASE_fig,i = colnames (BASE_fig) [1])
fig_c[length(fig_c)+1]= plot_ly(BASE_fig,
x =~i, y =~promedio, type = 'scatter', mode = 'lines',
name = names(BASE_FIGURAS)[i],
line = list(color = colores[length(fig_c)+6], width = 2), text =~promedio, textposition = 'auto' ) %>%
layout(title=names(BASE_FIGURAS)[i])
}
names(BASE3)
## [1] "Age" "Gender.M"
## [3] "MaritalStatus.M" "MaritalStatus.S"
## [5] "DependentChildren" "DependentsOther"
## [7] "WeeklyWages" "PartTimeFullTime.F"
## [9] "HoursWorkedPerWeek" "DaysWorkedPerWeek"
## [11] "A.OCURRENCIA" "M.OCURRENCIA"
## [13] "UltimateIncurredClaimCost" "days_Report"
subplot(fig_c, nrows = round(length(fig_c)/3,0))%>%
layout( title = list(text = "Costo medio de las reclamaciones"))
BASE.Z=BASE3 %>% mutate(WeeklyWages= as.vector(scale(WeeklyWages)),
HoursWorkedPerWeek=as.vector(scale(HoursWorkedPerWeek)),
DaysWorkedPerWeek=as.vector(scale(DaysWorkedPerWeek)),
UltimateIncurredClaimCost=as.vector(scale(UltimateIncurredClaimCost)),
days_Report=as.vector(scale(days_Report))
)
columnas.num=as.vector(which(names(BASE3) %in% names(BASE2[,sapply(BASE2,is.numeric)])))
kable(summary(BASE.Z[,columnas.num]))
Age | WeeklyWages | HoursWorkedPerWeek | UltimateIncurredClaimCost | days_Report | |
---|---|---|---|---|---|
Min. :13.00 | Min. :-1.70277 | Min. :-5.36174 | Min. :-0.37966 | Min. :-0.64163 | |
1st Qu.:23.00 | 1st Qu.:-0.88709 | 1st Qu.: 0.07275 | 1st Qu.:-0.35119 | 1st Qu.:-0.42570 | |
Median :32.00 | Median :-0.09477 | Median : 0.07275 | Median :-0.26449 | Median :-0.27621 | |
Mean :33.79 | Mean : 0.00000 | Mean : 0.00000 | Mean : 0.00000 | Mean : 0.00000 | |
3rd Qu.:43.00 | 3rd Qu.: 0.34258 | 3rd Qu.: 0.35877 | 3rd Qu.:-0.09384 | 3rd Qu.: 0.02276 | |
Max. :79.00 | Max. :29.02254 | Max. : 7.93845 | Max. :30.28758 | Max. :17.42971 |
OUT=quantile(BASE3$UltimateIncurredClaimCost, 0.75)+1.5*IQR(BASE3$UltimateIncurredClaimCost)
BASE_NOT_OUT=BASE3 %>% filter(UltimateIncurredClaimCost<=OUT)
columnas.num=as.vector(which(names(BASE3) %in% names(BASE2[,sapply(BASE2,is.numeric)])))
kable(summary(BASE_NOT_OUT[,columnas.num]))
Age | WeeklyWages | HoursWorkedPerWeek | UltimateIncurredClaimCost | days_Report | |
---|---|---|---|---|---|
Min. :13.00 | Min. : 1.0 | Min. : 0.00 | Min. : 121.9 | Min. : 0.00 | |
1st Qu.:23.00 | 1st Qu.: 200.0 | 1st Qu.:38.00 | 1st Qu.: 789.4 | 1st Qu.: 13.00 | |
Median :31.00 | Median : 376.0 | Median :38.00 | Median : 2424.9 | Median : 22.00 | |
Mean :33.16 | Mean : 396.8 | Mean :37.46 | Mean : 4011.1 | Mean : 37.43 | |
3rd Qu.:42.00 | 3rd Qu.: 500.0 | 3rd Qu.:40.00 | 3rd Qu.: 6063.7 | 3rd Qu.: 40.00 | |
Max. :79.00 | Max. :7497.0 | Max. :93.00 | Max. :19082.4 | Max. :1088.00 |
#PARTICIÓN DE BASES
set.seed(2021)
entrenamiento=createDataPartition(BASE3$UltimateIncurredClaimCost, p=0.7, list = F)
#Escenario 1 y 2
BASE_ENTRENAMIENTO3=BASE3[entrenamiento,]
BASE_VAL_3=BASE3[-entrenamiento,]
#Escenario 3
BASE_ENTRENAMIENTO.Z=BASE.Z[entrenamiento,]
BASE_VAL_Z=BASE.Z[-entrenamiento,]
#Escenario 4
set.seed(2021)
entrenamiento=createDataPartition(BASE_NOT_OUT$UltimateIncurredClaimCost, p=0.7, list = F)
BASE_ENTRENAMIENTO.NOT_OUT=BASE_NOT_OUT[entrenamiento,]
BASE_VAL_NOT_OUT=BASE_NOT_OUT[-entrenamiento,]
mod= lm(UltimateIncurredClaimCost ~., data=BASE_ENTRENAMIENTO3 )
plot(mod)
summary(mod)
##
## Call:
## lm(formula = UltimateIncurredClaimCost ~ ., data = BASE_ENTRENAMIENTO3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -103030 -9190 -4505 423 859834
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.557e+06 7.207e+04 -21.596 < 2e-16 ***
## Age 1.883e+02 1.734e+01 10.864 < 2e-16 ***
## Gender.M -1.679e+03 4.341e+02 -3.868 0.00011 ***
## MaritalStatus.M -3.964e+03 6.338e+02 -6.254 4.06e-10 ***
## MaritalStatus.S -4.400e+03 6.287e+02 -6.999 2.64e-12 ***
## DependentChildren 1.004e+03 3.531e+02 2.845 0.00444 **
## DependentsOther 1.069e+04 1.734e+03 6.163 7.25e-10 ***
## WeeklyWages 1.498e+01 8.266e-01 18.126 < 2e-16 ***
## PartTimeFullTime.F -2.580e+03 8.486e+02 -3.040 0.00236 **
## HoursWorkedPerWeek -2.780e+01 3.760e+01 -0.739 0.45966
## DaysWorkedPerWeek -1.071e+02 4.638e+02 -0.231 0.81744
## A.OCURRENCIA 7.768e+02 3.608e+01 21.527 < 2e-16 ***
## M.OCURRENCIA 7.204e+01 5.110e+01 1.410 0.15859
## days_Report 1.687e+01 2.937e+00 5.745 9.30e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27670 on 25197 degrees of freedom
## Multiple R-squared: 0.06576, Adjusted R-squared: 0.06528
## F-statistic: 136.4 on 13 and 25197 DF, p-value: < 2.2e-16
pred= as.vector(predict(mod,BASE_VAL_3))
real=as.vector(BASE_VAL_3$UltimateIncurredClaimCost)
#MAE
mean(abs(pred-real),na.rm = TRUE)
## [1] 11393.7
#R-Cuadrado
(1-(sum((real-pred)^2)/sum((real-mean(real))^2)))*100
## [1] 6.034909
POCO_SIG=as.vector(which(names(BASE_ENTRENAMIENTO3) %in% c("Age","MaritalStatus.M","MaritalStatus.S","DependentChildren",
"DependentsOther", "WeeklyWages", "PartTimeFullTime.F",
"A.OCURRENCIA", "days_Report", "UltimateIncurredClaimCost")))
mod2= lm(UltimateIncurredClaimCost ~., data=BASE_ENTRENAMIENTO3[,POCO_SIG])
plot(mod2)
summary(mod2)
##
## Call:
## lm(formula = UltimateIncurredClaimCost ~ ., data = BASE_ENTRENAMIENTO3[,
## POCO_SIG])
##
## Residuals:
## Min 1Q Median 3Q Max
## -100544 -9186 -4538 433 859561
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.565e+06 7.198e+04 -21.737 < 2e-16 ***
## Age 1.924e+02 1.728e+01 11.135 < 2e-16 ***
## MaritalStatus.M -3.943e+03 6.336e+02 -6.223 4.95e-10 ***
## MaritalStatus.S -4.386e+03 6.284e+02 -6.980 3.03e-12 ***
## DependentChildren 9.962e+02 3.531e+02 2.822 0.00478 **
## DependentsOther 1.056e+04 1.734e+03 6.087 1.16e-09 ***
## WeeklyWages 1.455e+01 7.929e-01 18.347 < 2e-16 ***
## PartTimeFullTime.F -3.673e+03 6.248e+02 -5.878 4.20e-09 ***
## A.OCURRENCIA 7.803e+02 3.605e+01 21.642 < 2e-16 ***
## days_Report 1.726e+01 2.936e+00 5.877 4.23e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27680 on 25201 degrees of freedom
## Multiple R-squared: 0.06506, Adjusted R-squared: 0.06473
## F-statistic: 194.9 on 9 and 25201 DF, p-value: < 2.2e-16
pred= as.vector(predict(mod2,BASE_VAL_3[,POCO_SIG]))
real=as.vector(BASE_VAL_3$UltimateIncurredClaimCost)
#MAE
mean(abs(pred-real),na.rm = TRUE)
## [1] 11401.28
#R-Cuadrado
(1-(sum((real-pred)^2)/sum((real-mean(real))^2)))*100
## [1] 5.992989
mod= lm(UltimateIncurredClaimCost ~., data=BASE_ENTRENAMIENTO.Z )
plot(mod)
summary(mod)
##
## Call:
## lm(formula = UltimateIncurredClaimCost ~ ., data = BASE_ENTRENAMIENTO.Z)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.6500 -0.3256 -0.1596 0.0150 30.4612
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.534e+01 2.556e+00 -21.648 < 2e-16 ***
## Age 6.672e-03 6.142e-04 10.864 < 2e-16 ***
## Gender.M -5.949e-02 1.538e-02 -3.868 0.00011 ***
## MaritalStatus.M -1.404e-01 2.245e-02 -6.254 4.06e-10 ***
## MaritalStatus.S -1.559e-01 2.227e-02 -6.999 2.64e-12 ***
## DependentChildren 3.559e-02 1.251e-02 2.845 0.00444 **
## DependentsOther 3.786e-01 6.143e-02 6.163 7.25e-10 ***
## WeeklyWages 1.295e-01 7.145e-03 18.126 < 2e-16 ***
## PartTimeFullTime.F -9.141e-02 3.006e-02 -3.040 0.00236 **
## HoursWorkedPerWeek -6.886e-03 9.313e-03 -0.739 0.45966
## DaysWorkedPerWeek -2.074e-03 8.985e-03 -0.231 0.81744
## A.OCURRENCIA 2.752e-02 1.278e-03 21.527 < 2e-16 ***
## M.OCURRENCIA 2.552e-03 1.810e-03 1.410 0.15859
## days_Report 3.599e-02 6.264e-03 5.745 9.30e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9803 on 25197 degrees of freedom
## Multiple R-squared: 0.06576, Adjusted R-squared: 0.06528
## F-statistic: 136.4 on 13 and 25197 DF, p-value: < 2.2e-16
pred= as.vector(predict(mod,BASE_VAL_Z))
estand_pred=pred*sd(BASE3$UltimateIncurredClaimCost)+mean(BASE3$UltimateIncurredClaimCost)
estand_real=BASE_VAL_Z$UltimateIncurredClaimCost*sd(BASE3$UltimateIncurredClaimCost)+
mean(BASE3$UltimateIncurredClaimCost)
#MAE
mean(abs(estand_pred-estand_real), na.rm = TRUE)
## [1] 11393.7
#R-Cuadrado
(1-(sum((estand_real-estand_pred)^2)/sum((estand_real-mean(estand_real))^2)))*100
## [1] 6.034909
mod= lm(UltimateIncurredClaimCost ~., data=BASE_ENTRENAMIENTO.NOT_OUT)
plot(mod)
summary(mod)
##
## Call:
## lm(formula = UltimateIncurredClaimCost ~ ., data = BASE_ENTRENAMIENTO.NOT_OUT)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29015 -2705 -1229 2026 16480
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.982e+05 1.085e+04 -18.261 < 2e-16 ***
## Age 3.527e+01 2.615e+00 13.489 < 2e-16 ***
## Gender.M -7.310e+02 6.542e+01 -11.174 < 2e-16 ***
## MaritalStatus.M -1.052e+03 9.783e+01 -10.752 < 2e-16 ***
## MaritalStatus.S -1.338e+03 9.654e+01 -13.861 < 2e-16 ***
## DependentChildren 1.087e+01 5.378e+01 0.202 0.83986
## DependentsOther 8.105e+02 2.668e+02 3.037 0.00239 **
## WeeklyWages 3.882e+00 1.238e-01 31.362 < 2e-16 ***
## PartTimeFullTime.F -6.519e+02 1.302e+02 -5.008 5.53e-07 ***
## HoursWorkedPerWeek -1.111e+01 5.820e+00 -1.908 0.05637 .
## DaysWorkedPerWeek 7.490e+01 7.077e+01 1.058 0.28989
## A.OCURRENCIA 1.007e+02 5.433e+00 18.530 < 2e-16 ***
## M.OCURRENCIA -1.281e+00 7.638e+00 -0.168 0.86679
## days_Report 1.575e+00 4.791e-01 3.289 0.00101 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3862 on 22042 degrees of freedom
## Multiple R-squared: 0.1141, Adjusted R-squared: 0.1136
## F-statistic: 218.4 on 13 and 22042 DF, p-value: < 2.2e-16
pred= as.vector(predict(mod,BASE_VAL_NOT_OUT))
real=BASE_VAL_NOT_OUT$UltimateIncurredClaimCost
#MAE
mean(abs(pred-real), na.rm = TRUE)
## [1] 3000.209
#R-Cuadrado
(1-(sum((real-pred)^2)/sum((real-mean(real))^2)))*100
## [1] 11.91927
mod_tree=rpart(UltimateIncurredClaimCost ~., data=BASE_ENTRENAMIENTO3)
prp(mod_tree, type = 2,nn = T, fallen.leaves = T,
faclen = 4, varlen = 8, shadow.col = "gray")
mod_tree
## n= 25211
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 25211 2.065161e+13 10910.540
## 2) A.OCURRENCIA< 1999.5 16822 7.045954e+12 6975.721 *
## 3) A.OCURRENCIA>=1999.5 8389 1.282294e+13 18800.810
## 6) WeeklyWages< 523.075 5386 5.077628e+12 13266.120 *
## 7) WeeklyWages>=523.075 3003 7.284409e+12 28727.490 *
mod_tree$cptable
## CP nsplit rel error xerror xstd
## 1 0.03790115 0 1.0000000 1.0001181 0.07211989
## 2 0.02231791 1 0.9620988 0.9625563 0.07095114
## 3 0.01000000 2 0.9397809 0.9417381 0.07045817
pred_tree=as.vector(predict(mod_tree,BASE_VAL_3))
real=BASE_VAL_3$UltimateIncurredClaimCost
#MAE
mean(abs(pred_tree-real),na.rm = TRUE)
## [1] 11264.64
#R-Cuadrado
(1-(sum((real-pred_tree)^2)/sum((real-mean(real))^2)))*100
## [1] 6.157961
mod_tree=rpart(UltimateIncurredClaimCost ~., data=BASE_ENTRENAMIENTO3[,POCO_SIG])
prp(mod_tree, type = 2,nn = T, fallen.leaves = T,
faclen = 4, varlen = 8, shadow.col = "gray")
mod_tree$cptable
## CP nsplit rel error xerror xstd
## 1 0.03790115 0 1.0000000 1.0000951 0.07212042
## 2 0.02231791 1 0.9620988 0.9622218 0.07093867
## 3 0.01000000 2 0.9397809 0.9420644 0.07044865
pred_tree=as.vector(predict(mod_tree,BASE_VAL_3[,POCO_SIG]))
real=BASE_VAL_3$UltimateIncurredClaimCost
#MAE
mean(abs(pred_tree-real),na.rm = TRUE)
## [1] 11264.64
#R-Cuadrado
(1-(sum((real-pred_tree)^2)/sum((real-mean(real))^2)))*100
## [1] 6.157961
mod_tree=rpart(UltimateIncurredClaimCost ~., data=BASE_ENTRENAMIENTO.Z)
prp(mod_tree, type = 2,nn = T, fallen.leaves = T,
faclen = 4, varlen = 8, shadow.col = "gray")
mod_tree$cptable
## CP nsplit rel error xerror xstd
## 1 0.03790115 0 1.0000000 1.0000794 0.07211553
## 2 0.02231791 1 0.9620988 0.9623013 0.07094538
## 3 0.01000000 2 0.9397809 0.9417319 0.07044848
pred_tree=as.vector(predict(mod_tree,BASE_VAL_Z))
estand_pred=pred_tree*sd(BASE3$UltimateIncurredClaimCost)+mean(BASE3$UltimateIncurredClaimCost)
estand_real=BASE_VAL_Z$UltimateIncurredClaimCost*sd(BASE3$UltimateIncurredClaimCost)+mean(BASE3$UltimateIncurredClaimCost)
#MAE
mean(abs(estand_pred-estand_real), na.rm = TRUE)
## [1] 11264.64
#R-Cuadrado
(1-(sum((estand_real-estand_pred)^2)/sum((estand_real-mean(estand_real))^2)))*100
## [1] 6.157961
mod_tree=rpart(UltimateIncurredClaimCost ~., data=BASE_ENTRENAMIENTO.NOT_OUT)
prp(mod_tree, type = 2,nn = T, fallen.leaves = T,
faclen = 4, varlen = 8, shadow.col = "gray")
mod_tree$cptable
## CP nsplit rel error xerror xstd
## 1 0.06507587 0 1.0000000 1.0000816 0.01284651
## 2 0.01666612 1 0.9349241 0.9363668 0.01220443
## 3 0.01565740 2 0.9182580 0.9255196 0.01212229
## 4 0.01103526 3 0.9026006 0.9041626 0.01192961
## 5 0.01000000 5 0.8805301 0.8864486 0.01169398
pred_tree=as.vector(predict(mod_tree,BASE_VAL_NOT_OUT))
real=BASE_VAL_NOT_OUT$UltimateIncurredClaimCost
# MAE
mean(abs(pred_tree-real), na.rm = TRUE)
## [1] 3012.907
#R-Cuadrado
(1-(sum((real-pred_tree)^2)/sum((real-mean(real))^2)))*100
## [1] 10.82735
set.seed(2021)
mod_Neuronal= nnet(UltimateIncurredClaimCost~., data=BASE_ENTRENAMIENTO.NEURONAL,
size=14, decay=0.1, maxit=10000, linout=T)
source_url("https://gist.githubusercontent.com/fawda123/7471137/raw/466c1474d0a505ff044412703516c34f1a4684a5/nnet_plot_update.r")
plot(mod_Neuronal, max.sp = T)
Para que el modelo de red neuronal converja necesita 1.040 ieraciones,
maximo=max(BASE3$UltimateIncurredClaimCost)
pred_NEU= as.vector(predict(mod_Neuronal, BASE_VAL_MEURONAL))*maximo
real=BASE_VAL_MEURONAL$UltimateIncurredClaimCost*maximo
#MAE
mean(abs(pred_NEU-real) ,na.rm = TRUE)
## [1] 11298.16
#R-Cuadrado
(1-(sum((real-pred_NEU)^2)/sum((real-mean(real))^2)))*100
## [1] 5.495078
set.seed(2021)
mod_Neuronal= nnet(UltimateIncurredClaimCost/maximo ~., data=BASE_ENTRENAMIENTO3[,POCO_SIG],
size=6, decay=0.1, maxit=10000, linout=T)
source_url("https://gist.githubusercontent.com/fawda123/7471137/raw/466c1474d0a505ff044412703516c34f1a4684a5/nnet_plot_update.r")
plot(mod_Neuronal, max.sp = T)
pred_NEU= as.vector(predict(mod_Neuronal, BASE_VAL_3[,POCO_SIG]))*maximo
real=BASE_VAL_3$UltimateIncurredClaimCost
#MAE
mean(abs(pred_NEU-real),na.rm = TRUE)
## [1] 11364.42
#R-Cuadrado
(1-(sum((real-pred_NEU)^2)/sum((real-mean(real))^2)))*100
## [1] 5.29271
set.seed(2021)
mod_Neuronal= nnet(UltimateIncurredClaimCost ~., data=BASE_ENTRENAMIENTO.Z,
size=6, decay=0.1, maxit=1000, linout=T)
pred_NEU= as.vector(predict(mod_Neuronal, BASE_VAL_Z))
estand_pred=pred_NEU*sd(BASE3$UltimateIncurredClaimCost)+mean(BASE3$UltimateIncurredClaimCost)
estand_real=BASE_VAL_Z$UltimateIncurredClaimCost*sd(BASE3$UltimateIncurredClaimCost)+mean(BASE3$UltimateIncurredClaimCost)
#MAE
mean(abs(estand_pred-estand_real), na.rm = TRUE)
## [1] 11247.53
#R-Cuadrado
(1-(sum((estand_real-estand_pred)^2)/sum((estand_real-mean(estand_real))^2)))*100
## [1] 5.485392
plot(mod_Neuronal, max.sp = T)
set.seed(2021)
mod_Neuronal= nnet(UltimateIncurredClaimCost/maximo ~., data=BASE_ENTRENAMIENTO.NOT_OUT,
size=6, decay=0.1, maxit=10000, linout=T)
plot(mod_Neuronal, max.sp = T)
pred_NEU= as.vector(predict(mod_Neuronal, BASE_VAL_NOT_OUT))*maximo
real= BASE_VAL_NOT_OUT$UltimateIncurredClaimCost
#MAE
mean(abs(pred_NEU-real), na.rm = TRUE)
## [1] 2974.487
#R-Cuadrado
(1-(sum((real-pred_NEU)^2)/sum((real-mean(real))^2)))*100
## [1] 11.76277
set.seed(2021)
baggig.fit= bagging(UltimateIncurredClaimCost ~., data=BASE_ENTRENAMIENTO3)
baggig.fit
##
## Bagging regression trees with 25 bootstrap replications
##
## Call: bagging.data.frame(formula = UltimateIncurredClaimCost ~ ., data = BASE_ENTRENAMIENTO3)
pred_bagging=as.vector(predict(baggig.fit,BASE_VAL_3))
real=BASE_VAL_3$UltimateIncurredClaimCost
#MAE
mean(abs(pred_bagging-real), na.rm = TRUE)
## [1] 11234.32
#R-Cuadrado
(1-(sum((real-pred_bagging)^2)/sum((real-mean(real))^2)))*100
## [1] 6.412791
#calculate variable importance
VI <- data.frame(var= row.names(varImp(baggig.fit)), imp=varImp(baggig.fit))
#sort variable importance descending
VI_plot <- VI[order(VI$Overall, decreasing=TRUE),]
kable(VI_plot)
var | Overall | |
---|---|---|
WeeklyWages | WeeklyWages | 0.0994027 |
A.OCURRENCIA | A.OCURRENCIA | 0.0495111 |
Age | Age | 0.0441905 |
days_Report | days_Report | 0.0230375 |
MaritalStatus.S | MaritalStatus.S | 0.0187173 |
DependentsOther | DependentsOther | 0.0081145 |
HoursWorkedPerWeek | HoursWorkedPerWeek | 0.0062675 |
M.OCURRENCIA | M.OCURRENCIA | 0.0036592 |
DependentChildren | DependentChildren | 0.0027026 |
MaritalStatus.M | MaritalStatus.M | 0.0022443 |
DaysWorkedPerWeek | DaysWorkedPerWeek | 0.0008063 |
Gender.M | Gender.M | 0.0003364 |
PartTimeFullTime.F | PartTimeFullTime.F | 0.0002007 |
set.seed(2021)
baggig.fit= bagging(UltimateIncurredClaimCost ~., data=BASE_ENTRENAMIENTO3[,POCO_SIG])
baggig.fit
##
## Bagging regression trees with 25 bootstrap replications
##
## Call: bagging.data.frame(formula = UltimateIncurredClaimCost ~ ., data = BASE_ENTRENAMIENTO3[,
## POCO_SIG])
pred_bagging=as.vector(predict(baggig.fit,BASE_VAL_3[,POCO_SIG]))
real=BASE_VAL_3$UltimateIncurredClaimCost
#MAE
mean(abs(pred_bagging-real), na.rm = TRUE)
## [1] 11234.34
#R-Cuadrado
(1-(sum((real-pred_bagging)^2)/sum((real-mean(real))^2)))*100
## [1] 6.376642
#calculate variable importance
VI <- data.frame(var= row.names(varImp(baggig.fit)), imp=varImp(baggig.fit))
#sort variable importance descending
VI_plot <- VI[order(VI$Overall, decreasing=TRUE),]
kable(VI_plot)
var | Overall | |
---|---|---|
WeeklyWages | WeeklyWages | 0.1110390 |
A.OCURRENCIA | A.OCURRENCIA | 0.0497348 |
Age | Age | 0.0465381 |
days_Report | days_Report | 0.0222232 |
MaritalStatus.S | MaritalStatus.S | 0.0197420 |
DependentsOther | DependentsOther | 0.0086510 |
DependentChildren | DependentChildren | 0.0030683 |
MaritalStatus.M | MaritalStatus.M | 0.0012196 |
PartTimeFullTime.F | PartTimeFullTime.F | 0.0004726 |
set.seed(2021)
baggig.fit= bagging(UltimateIncurredClaimCost ~., data=BASE_ENTRENAMIENTO.Z)
baggig.fit
##
## Bagging regression trees with 25 bootstrap replications
##
## Call: bagging.data.frame(formula = UltimateIncurredClaimCost ~ ., data = BASE_ENTRENAMIENTO.Z)
pred_bagging=as.vector(predict(baggig.fit,BASE_VAL_Z))
estand_pred=pred_bagging*sd(BASE3$UltimateIncurredClaimCost)+mean(BASE3$UltimateIncurredClaimCost)
estand_real=BASE_VAL_Z$UltimateIncurredClaimCost*sd(BASE3$UltimateIncurredClaimCost)+mean(BASE3$UltimateIncurredClaimCost)
#MAE
mean(abs(estand_pred-estand_real), na.rm = TRUE)
## [1] 11234.32
#R-Cuadrado
(1-(sum((estand_real-estand_pred)^2)/sum((estand_real-mean(estand_real))^2)))*100
## [1] 6.412791
#calculate variable importance
VI <- data.frame(var= row.names(varImp(baggig.fit)), imp=varImp(baggig.fit))
#sort variable importance descending
VI_plot <- VI[order(VI$Overall, decreasing=TRUE),]
kable(VI_plot)
var | Overall | |
---|---|---|
WeeklyWages | WeeklyWages | 0.0994027 |
A.OCURRENCIA | A.OCURRENCIA | 0.0495111 |
Age | Age | 0.0441905 |
days_Report | days_Report | 0.0230375 |
MaritalStatus.S | MaritalStatus.S | 0.0187173 |
DependentsOther | DependentsOther | 0.0081145 |
HoursWorkedPerWeek | HoursWorkedPerWeek | 0.0062675 |
M.OCURRENCIA | M.OCURRENCIA | 0.0036592 |
DependentChildren | DependentChildren | 0.0027026 |
MaritalStatus.M | MaritalStatus.M | 0.0022443 |
DaysWorkedPerWeek | DaysWorkedPerWeek | 0.0008063 |
Gender.M | Gender.M | 0.0003364 |
PartTimeFullTime.F | PartTimeFullTime.F | 0.0002007 |
set.seed(2021)
baggig.fit= bagging(UltimateIncurredClaimCost ~., data=BASE_ENTRENAMIENTO.NOT_OUT)
baggig.fit
##
## Bagging regression trees with 25 bootstrap replications
##
## Call: bagging.data.frame(formula = UltimateIncurredClaimCost ~ ., data = BASE_ENTRENAMIENTO.NOT_OUT)
pred_bagging=as.vector(predict(baggig.fit,BASE_VAL_NOT_OUT))
real=BASE_VAL_NOT_OUT$UltimateIncurredClaimCost
#MAE
mean(abs(pred_bagging-real), na.rm = TRUE)
## [1] 3010.051
#R-Cuadrado
(1-(sum((real-pred_bagging)^2)/sum((real-mean(real))^2)))*100
## [1] 11.23597
#calculate variable importance
VI <- data.frame(var= row.names(varImp(baggig.fit)), imp=varImp(baggig.fit))
#sort variable importance descending
VI_plot <- VI[order(VI$Overall, decreasing=TRUE),]
kable(VI_plot)
var | Overall | |
---|---|---|
WeeklyWages | WeeklyWages | 0.1555472 |
A.OCURRENCIA | A.OCURRENCIA | 0.1410475 |
Age | Age | 0.0798281 |
MaritalStatus.S | MaritalStatus.S | 0.0494984 |
Gender.M | Gender.M | 0.0401547 |
HoursWorkedPerWeek | HoursWorkedPerWeek | 0.0202507 |
days_Report | days_Report | 0.0101590 |
PartTimeFullTime.F | PartTimeFullTime.F | 0.0044432 |
MaritalStatus.M | MaritalStatus.M | 0.0003162 |
DaysWorkedPerWeek | DaysWorkedPerWeek | 0.0000000 |
DependentChildren | DependentChildren | 0.0000000 |
DependentsOther | DependentsOther | 0.0000000 |
M.OCURRENCIA | M.OCURRENCIA | 0.0000000 |
set.seed(2021)
gbm.fit= gbm(UltimateIncurredClaimCost ~., data=BASE_ENTRENAMIENTO3)
## Distribution not specified, assuming gaussian ...
gbm.fit
## gbm(formula = UltimateIncurredClaimCost ~ ., data = BASE_ENTRENAMIENTO3)
## A gradient boosted model with gaussian loss function.
## 100 iterations were performed.
## There were 13 predictors of which 10 had non-zero influence.
pred_gbm=as.vector(predict(gbm.fit,BASE_VAL_3))
real=BASE_VAL_3$UltimateIncurredClaimCost
#MAE
mean(abs(pred_gbm-real), na.rm = TRUE)
## [1] 11096.3
#R-Cuadrado
(1-(sum((real-pred_gbm)^2)/sum((real-mean(real))^2)))*100
## [1] 6.994316
summary(
gbm.fit,
cBars = 13,
method = relative.influence, # also can use permutation.test.gbm
las = 2
)
## var rel.inf
## A.OCURRENCIA A.OCURRENCIA 38.3164846
## WeeklyWages WeeklyWages 36.6781550
## Age Age 10.8373733
## days_Report days_Report 6.7477085
## DependentsOther DependentsOther 3.8643538
## Gender.M Gender.M 0.8902729
## DependentChildren DependentChildren 0.7458376
## MaritalStatus.S MaritalStatus.S 0.7328978
## PartTimeFullTime.F PartTimeFullTime.F 0.6358794
## HoursWorkedPerWeek HoursWorkedPerWeek 0.5510370
## MaritalStatus.M MaritalStatus.M 0.0000000
## DaysWorkedPerWeek DaysWorkedPerWeek 0.0000000
## M.OCURRENCIA M.OCURRENCIA 0.0000000
set.seed(2021)
gbm.fit= gbm(UltimateIncurredClaimCost ~., data=BASE_ENTRENAMIENTO3[,POCO_SIG])
## Distribution not specified, assuming gaussian ...
gbm.fit
## gbm(formula = UltimateIncurredClaimCost ~ ., data = BASE_ENTRENAMIENTO3[,
## POCO_SIG])
## A gradient boosted model with gaussian loss function.
## 100 iterations were performed.
## There were 9 predictors of which 8 had non-zero influence.
pred_gbm=as.vector(predict(gbm.fit,BASE_VAL_3[,POCO_SIG]))
real=BASE_VAL_3$UltimateIncurredClaimCost
#MAE
mean(abs(pred_gbm-real), na.rm = TRUE)
## [1] 11102.62
#R_cuadrado
(1-(sum((real-pred_gbm)^2)/sum((real-mean(real))^2)))*100
## [1] 6.974192
summary(
gbm.fit,
cBars = 13,
method = relative.influence, # also can use permutation.test.gbm
las = 2
)
## var rel.inf
## A.OCURRENCIA A.OCURRENCIA 38.7258381
## WeeklyWages WeeklyWages 36.8715889
## Age Age 10.8700136
## days_Report days_Report 6.8378804
## DependentsOther DependentsOther 3.8749683
## PartTimeFullTime.F PartTimeFullTime.F 1.1490445
## MaritalStatus.S MaritalStatus.S 0.9224584
## DependentChildren DependentChildren 0.7482078
## MaritalStatus.M MaritalStatus.M 0.0000000
set.seed(2021)
gbm.fit= gbm(UltimateIncurredClaimCost ~., data=BASE_ENTRENAMIENTO.Z)
## Distribution not specified, assuming gaussian ...
gbm.fit
## gbm(formula = UltimateIncurredClaimCost ~ ., data = BASE_ENTRENAMIENTO.Z)
## A gradient boosted model with gaussian loss function.
## 100 iterations were performed.
## There were 13 predictors of which 10 had non-zero influence.
pred_gbm=as.vector(predict(gbm.fit,BASE_VAL_Z))
estand_pred=pred_gbm*sd(BASE3$UltimateIncurredClaimCost)+mean(BASE3$UltimateIncurredClaimCost)
estand_real=BASE_VAL_Z$UltimateIncurredClaimCost*sd(BASE3$UltimateIncurredClaimCost)+mean(BASE3$UltimateIncurredClaimCost)
#MAE
mean(abs(estand_pred-estand_real), na.rm = TRUE)
## [1] 11096.3
# R-Cuadrado
(1-(sum((estand_real-estand_pred)^2)/sum((estand_real-mean(estand_real))^2)))*100
## [1] 6.994316
summary(
gbm.fit,
cBars = 13,
method = relative.influence, # also can use permutation.test.gbm
las = 2
)
## var rel.inf
## A.OCURRENCIA A.OCURRENCIA 38.3164846
## WeeklyWages WeeklyWages 36.6781550
## Age Age 10.8373733
## days_Report days_Report 6.7477085
## DependentsOther DependentsOther 3.8643538
## Gender.M Gender.M 0.8902729
## DependentChildren DependentChildren 0.7458376
## MaritalStatus.S MaritalStatus.S 0.7328978
## PartTimeFullTime.F PartTimeFullTime.F 0.6358794
## HoursWorkedPerWeek HoursWorkedPerWeek 0.5510370
## MaritalStatus.M MaritalStatus.M 0.0000000
## DaysWorkedPerWeek DaysWorkedPerWeek 0.0000000
## M.OCURRENCIA M.OCURRENCIA 0.0000000
set.seed(2021)
gbm.fit= gbm(UltimateIncurredClaimCost ~., data=BASE_ENTRENAMIENTO.NOT_OUT)
## Distribution not specified, assuming gaussian ...
gbm.fit
## gbm(formula = UltimateIncurredClaimCost ~ ., data = BASE_ENTRENAMIENTO.NOT_OUT)
## A gradient boosted model with gaussian loss function.
## 100 iterations were performed.
## There were 13 predictors of which 7 had non-zero influence.
pred_gbm=as.vector(predict(gbm.fit,BASE_VAL_NOT_OUT))
real=BASE_VAL_NOT_OUT$UltimateIncurredClaimCost
#MAE
mean(abs(pred_gbm-real), na.rm = TRUE)
## [1] 2965.066
#R_cuadrado
(1-(sum((real-pred_gbm)^2)/sum((real-mean(real))^2)))*100
## [1] 13.61934
summary(
gbm.fit,
cBars = 13,
method = relative.influence, # also can use permutation.test.gbm
las = 2
)
## var rel.inf
## WeeklyWages WeeklyWages 56.806250
## A.OCURRENCIA A.OCURRENCIA 17.789311
## Age Age 11.157837
## Gender.M Gender.M 4.074334
## HoursWorkedPerWeek HoursWorkedPerWeek 3.999910
## days_Report days_Report 3.506309
## MaritalStatus.S MaritalStatus.S 2.666050
## MaritalStatus.M MaritalStatus.M 0.000000
## DependentChildren DependentChildren 0.000000
## DependentsOther DependentsOther 0.000000
## PartTimeFullTime.F PartTimeFullTime.F 0.000000
## DaysWorkedPerWeek DaysWorkedPerWeek 0.000000
## M.OCURRENCIA M.OCURRENCIA 0.000000