LIBRERIAS

knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
library(knitr)
library(dplyr)
library(readxl)
library(lubridate)
library(tidyr)
library(openxlsx)
library(stringr)
library(plotly)
library(RColorBrewer)
library(rpart.plot)
library(rpart)
library(caret)
library(stats)
library(nnet)
library(dummies)
library(randomForest)
library(skimr)
library(ipred)
library(gbm)
library(FNN)


library(rmdformats)
library(aplpack)
library(psych)
library(tidyr)
library(corrplot)
library(tibble)
library(kableExtra)
library(VIM)
library(mice)
library(car)
library(GGally)
library(devtools)
library(paletteer)

LECTURA DE LA BASE DE DATOS

colores= c("#CD6889","#528B8B", "#458B74","#838B8B","#53868B", "#8B475D", "#8B636C", "#8B8682", "#CD7054","#8B0000", "#E9967A", "#483D8B", "#2F4F4F","#F08080", "#A2B5CD", "#CDBE70", "#EEA2AD", "#B4CDCD")

BASE=read.csv("train_SJC.csv", stringsAsFactors = FALSE)
names(BASE)=c("ClaimNumber","DateTimeOfAccident", "DateReported",   "Age",  "Gender",   "MaritalStatus" ,   "DependentChildren" ,"DependentsOther"  ,   "WeeklyWages"   ,"PartTimeFullTime" ,"HoursWorkedPerWeek", "DaysWorkedPerWeek", "ClaimDescription", "InitialIncurredCalimsCost",    "UltimateIncurredClaimCost")
BASE=BASE[2:nrow(BASE),]


BASE = BASE %>% mutate(Age=as.numeric(Age),
                       DependentChildren=as.factor(as.numeric(DependentChildren)),
                       DependentsOther=as.factor(as.numeric(DependentsOther)),
                       DaysWorkedPerWeek=as.factor(as.numeric(DaysWorkedPerWeek)),
                       WeeklyWages=as.numeric(WeeklyWages),
                       HoursWorkedPerWeek=as.numeric(HoursWorkedPerWeek),
                       ClaimDescription=as.character(ClaimDescription),
                       InitialIncurredCalimsCost=as.numeric(InitialIncurredCalimsCost),
                       UltimateIncurredClaimCost=as.numeric(UltimateIncurredClaimCost),
                       DateTimeOfAccident=as.POSIXct(DateTimeOfAccident,
                                                     format="%Y-%m-%dT%H:%M:%S" ),
                       DateReported=as.POSIXct(DateReported, format="%Y-%m-%dT%H:%M:%S"), 
                       days_Report=round(as.numeric(difftime(DateReported,DateTimeOfAccident,
                                             units = "days")))) %>%  
  filter(days_Report>=0, 
         MaritalStatus != "",
         Gender!="U")

RESUMEN INICIAL

summary(BASE)
##  ClaimNumber        DateTimeOfAccident             DateReported                
##  Length:36146       Min.   :1988-01-01 09:00:00   Min.   :1988-01-10 00:00:00  
##  Class :character   1st Qu.:1992-07-03 12:15:00   1st Qu.:1992-08-10 00:00:00  
##  Mode  :character   Median :1996-12-28 07:00:00   Median :1997-02-10 00:00:00  
##                     Mean   :1996-12-30 15:58:37   Mean   :1997-02-07 06:58:21  
##                     3rd Qu.:2001-07-05 07:15:00   3rd Qu.:2001-08-20 18:00:00  
##                     Max.   :2005-12-31 10:00:00   Max.   :2006-09-23 00:00:00  
##                                                                                
##       Age           Gender          MaritalStatus      DependentChildren
##  Min.   :13.00   Length:36146       Length:36146       0      :33868    
##  1st Qu.:23.00   Class :character   Class :character   2      :  924    
##  Median :32.00   Mode  :character   Mode  :character   1      :  858    
##  Mean   :33.79                                         3      :  353    
##  3rd Qu.:43.00                                         4      :  103    
##  Max.   :79.00                                         5      :   34    
##                                                        (Other):    6    
##  DependentsOther  WeeklyWages     PartTimeFullTime   HoursWorkedPerWeek
##  0:35828         Min.   :   1.0   Length:36146       Min.   :  0.00    
##  1:  297         1st Qu.: 200.0   Class :character   1st Qu.: 38.00    
##  2:   15         Median : 393.3   Mode  :character   Median : 38.00    
##  3:    6         Mean   : 416.4                      Mean   : 37.77    
##                  3rd Qu.: 500.0                      3rd Qu.: 40.00    
##                  Max.   :7497.0                      Max.   :640.00    
##                  NA's   :56                          NA's   :49        
##  DaysWorkedPerWeek ClaimDescription   InitialIncurredCalimsCost
##  1:  119           Length:36146       Min.   :     1           
##  2:  336           Class :character   1st Qu.:   700           
##  3:  956           Mode  :character   Median :  2000           
##  4:  995                              Mean   :  7748           
##  5:32969                              3rd Qu.:  9500           
##  6:  563                              Max.   :830000           
##  7:  208                                                       
##  UltimateIncurredClaimCost  days_Report     
##  Min.   :    122           Min.   :   0.00  
##  1st Qu.:    926           1st Qu.:  13.00  
##  Median :   3376           Median :  22.00  
##  Mean   :  10956           Mean   :  38.68  
##  3rd Qu.:   8189           3rd Qu.:  40.00  
##  Max.   :4027136           Max.   :1088.00  
## 
str(BASE)
## 'data.frame':    36146 obs. of  16 variables:
##  $ ClaimNumber              : chr  "WC8205482" "WC6922469" "WC5442654" "WC9796897" ...
##  $ DateTimeOfAccident       : POSIXct, format: "2002-04-09 07:00:00" "1999-01-07 11:00:00" ...
##  $ DateReported             : POSIXct, format: "2002-07-05" "1999-01-20" ...
##  $ Age                      : num  48 43 30 41 36 50 39 56 49 30 ...
##  $ Gender                   : chr  "M" "F" "M" "M" ...
##  $ MaritalStatus            : chr  "M" "M" "U" "S" ...
##  $ DependentChildren        : Factor w/ 9 levels "0","1","2","3",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ DependentsOther          : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ WeeklyWages              : num  500 509 709 555 377 ...
##  $ PartTimeFullTime         : chr  "F" "F" "F" "F" ...
##  $ HoursWorkedPerWeek       : num  38 37.5 38 38 38 38 38 40 38 37 ...
##  $ DaysWorkedPerWeek        : Factor w/ 7 levels "1","2","3","4",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ ClaimDescription         : chr  "LIFTING TYRE INJURY TO RIGHT ARM AND WRIST INJURY" "STEPPED AROUND CRATES AND TRUCK TRAY FRACTURE LEFT FOREARM" "CUT ON SHARP EDGE CUT LEFT THUMB" "DIGGING LOWER BACK LOWER BACK STRAIN" ...
##  $ InitialIncurredCalimsCost: num  1500 5500 1700 15000 2800 500 500 500 925 1500 ...
##  $ UltimateIncurredClaimCost: num  4748 6326 2294 17786 4014 ...
##  $ days_Report              : num  87 13 20 29 29 80 10 38 32 15 ...
kbl(summary(BASE[,!sapply(BASE,is.character)])) %>%
  kable_paper(bootstrap_options = "striped", full_width = F)
DateTimeOfAccident DateReported Age DependentChildren DependentsOther WeeklyWages HoursWorkedPerWeek DaysWorkedPerWeek InitialIncurredCalimsCost UltimateIncurredClaimCost days_Report
Min. :1988-01-01 09:00:00 Min. :1988-01-10 00:00:00 Min. :13.00 0 :33868 0:35828 Min. : 1.0 Min. : 0.00 1: 119 Min. : 1 Min. : 122 Min. : 0.00
1st Qu.:1992-07-03 12:15:00 1st Qu.:1992-08-10 00:00:00 1st Qu.:23.00 2 : 924 1: 297 1st Qu.: 200.0 1st Qu.: 38.00 2: 336 1st Qu.: 700 1st Qu.: 926 1st Qu.: 13.00
Median :1996-12-28 07:00:00 Median :1997-02-10 00:00:00 Median :32.00 1 : 858 2: 15 Median : 393.3 Median : 38.00 3: 956 Median : 2000 Median : 3376 Median : 22.00
Mean :1996-12-30 15:58:37 Mean :1997-02-07 06:58:21 Mean :33.79 3 : 353 3: 6 Mean : 416.4 Mean : 37.77 4: 995 Mean : 7748 Mean : 10956 Mean : 38.68
3rd Qu.:2001-07-05 07:15:00 3rd Qu.:2001-08-20 18:00:00 3rd Qu.:43.00 4 : 103 NA 3rd Qu.: 500.0 3rd Qu.: 40.00 5:32969 3rd Qu.: 9500 3rd Qu.: 8189 3rd Qu.: 40.00
Max. :2005-12-31 10:00:00 Max. :2006-09-23 00:00:00 Max. :79.00 5 : 34 NA Max. :7497.0 Max. :640.00 6: 563 Max. :830000 Max. :4027136 Max. :1088.00
NA NA NA (Other): 6 NA NA’s :56 NA’s :49 7: 208 NA NA NA

Número de categorías por variable

sapply(BASE[,sapply(BASE,is.character)],n_distinct)
##      ClaimNumber           Gender    MaritalStatus PartTimeFullTime 
##            29438                2                3                2 
## ClaimDescription 
##            20581

NA’s ocultos en variables categóricas

sapply(select(BASE, c("MaritalStatus", "Gender", "PartTimeFullTime")),table)
## $MaritalStatus
## 
##     M     S     U 
## 15159 17448  3539 
## 
## $Gender
## 
##     F     M 
##  8250 27896 
## 
## $PartTimeFullTime
## 
##     F     P 
## 32887  3259

VALORES NA’S

aggr(BASE[,1:15],
     #col= c('blue', 'red'), #Cambiar colores de visualización 
     numbers = TRUE, #incorporar en que porcentajes nos encontramos
     sortVars = TRUE,#ordena las variables faltantes de mayor a menor 
     labels = names(BASE[,1:15]), #colocar todas las etiquetas de las variables  
     cex.axis = 0.5,#tamaño de la fuente de las ejes 
     gap = 1, #para que el "hueco" entre las graficas se disminulla  
     ylab = c("Histograma de NAs", "Patrón"),# nombre de los ejes y de los graficas se puede hacer lo mismo para el eje de las x con "xlab"

)     

## 
##  Variables sorted by number of missings: 
##                   Variable       Count
##                WeeklyWages 0.001549272
##         HoursWorkedPerWeek 0.001355613
##                ClaimNumber 0.000000000
##         DateTimeOfAccident 0.000000000
##               DateReported 0.000000000
##                        Age 0.000000000
##                     Gender 0.000000000
##              MaritalStatus 0.000000000
##          DependentChildren 0.000000000
##            DependentsOther 0.000000000
##           PartTimeFullTime 0.000000000
##          DaysWorkedPerWeek 0.000000000
##           ClaimDescription 0.000000000
##  InitialIncurredCalimsCost 0.000000000
##  UltimateIncurredClaimCost 0.000000000
BASE =BASE %>% mutate(A.OCURRENCIA=factor(as.numeric(format(DateTimeOfAccident,'%Y'))),
                      M.OCURRENCIA=factor(as.numeric(format(DateTimeOfAccident,'%m')))) %>% 
  select(-c("ClaimNumber", "DateReported", "ClaimDescription", "InitialIncurredCalimsCost","DateTimeOfAccident") ) %>% na.omit()
apply(X = is.na(BASE), MARGIN = 2, FUN = sum)
##                       Age                    Gender             MaritalStatus 
##                         0                         0                         0 
##         DependentChildren           DependentsOther               WeeklyWages 
##                         0                         0                         0 
##          PartTimeFullTime        HoursWorkedPerWeek         DaysWorkedPerWeek 
##                         0                         0                         0 
## UltimateIncurredClaimCost               days_Report              A.OCURRENCIA 
##                         0                         0                         0 
##              M.OCURRENCIA 
##                         0

DESCRIPCIÓN DE LA BASE DE DATOS

plot_ly(data = BASE,
        type = "box",
        y = ~UltimateIncurredClaimCost,name="Costo del siniestro"
        )
BASE=BASE %>% filter(UltimateIncurredClaimCost<max(UltimateIncurredClaimCost))
plot_ly(data = BASE,
        type = "box",
        y = ~UltimateIncurredClaimCost,  name="Costo del siniestro"
        )
summary(BASE)
##       Age           Gender          MaritalStatus      DependentChildren
##  Min.   :13.00   Length:36040       Length:36040       0      :33769    
##  1st Qu.:23.00   Class :character   Class :character   2      :  922    
##  Median :32.00   Mode  :character   Mode  :character   1      :  856    
##  Mean   :33.79                                         3      :  351    
##  3rd Qu.:43.00                                         4      :  102    
##  Max.   :79.00                                         5      :   34    
##                                                        (Other):    6    
##  DependentsOther  WeeklyWages     PartTimeFullTime   HoursWorkedPerWeek
##  0:35722         Min.   :   1.0   Length:36040       Min.   :  0.00    
##  1:  297         1st Qu.: 200.0   Class :character   1st Qu.: 38.00    
##  2:   15         Median : 393.3   Mode  :character   Median : 38.00    
##  3:    6         Mean   : 416.4                      Mean   : 37.77    
##                  3rd Qu.: 500.0                      3rd Qu.: 40.00    
##                  Max.   :7497.0                      Max.   :640.00    
##                                                                        
##  DaysWorkedPerWeek UltimateIncurredClaimCost  days_Report       A.OCURRENCIA  
##  1:  119           Min.   :   121.9          Min.   :   0.00   2001   : 2071  
##  2:  335           1st Qu.:   926.5          1st Qu.:  13.00   1994   : 2043  
##  3:  953           Median :  3373.1          Median :  22.00   2004   : 2028  
##  4:  992           Mean   : 10851.0          Mean   :  38.66   1997   : 2025  
##  5:32871           3rd Qu.:  8188.4          3rd Qu.:  40.00   1988   : 2024  
##  6:  562           Max.   :865770.6          Max.   :1088.00   1996   : 2018  
##  7:  208                                                       (Other):23831  
##   M.OCURRENCIA  
##  5      : 3285  
##  3      : 3177  
##  8      : 3135  
##  10     : 3125  
##  11     : 3093  
##  7      : 3077  
##  (Other):17148

DESCRIPCIÓN DE LOS DATOS

BASE2=BASE %>% select("Age","Gender","MaritalStatus" ,"DependentChildren", "DependentsOther", "WeeklyWages","PartTimeFullTime","HoursWorkedPerWeek" , "DaysWorkedPerWeek","A.OCURRENCIA","M.OCURRENCIA","UltimateIncurredClaimCost", "days_Report" ) %>% na.omit()


str(BASE2)
## 'data.frame':    36040 obs. of  13 variables:
##  $ Age                      : num  48 43 30 41 36 50 39 56 49 30 ...
##  $ Gender                   : chr  "M" "F" "M" "M" ...
##  $ MaritalStatus            : chr  "M" "M" "U" "S" ...
##  $ DependentChildren        : Factor w/ 9 levels "0","1","2","3",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ DependentsOther          : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ WeeklyWages              : num  500 509 709 555 377 ...
##  $ PartTimeFullTime         : chr  "F" "F" "F" "F" ...
##  $ HoursWorkedPerWeek       : num  38 37.5 38 38 38 38 38 40 38 37 ...
##  $ DaysWorkedPerWeek        : Factor w/ 7 levels "1","2","3","4",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ A.OCURRENCIA             : Factor w/ 18 levels "1988","1989",..: 15 12 9 18 3 12 14 13 7 18 ...
##  $ M.OCURRENCIA             : Factor w/ 12 levels "1","2","3","4",..: 4 1 3 6 8 6 7 3 3 12 ...
##  $ UltimateIncurredClaimCost: num  4748 6326 2294 17786 4014 ...
##  $ days_Report              : num  87 13 20 29 29 80 10 38 32 15 ...
##  - attr(*, "na.action")= 'omit' Named int [1:105] 237 387 443 492 536 589 623 629 630 637 ...
##   ..- attr(*, "names")= chr [1:105] "237" "387" "443" "492" ...
 pairs(BASE2[,sapply(BASE2,is.numeric)],col ="cadetblue" ,
       pch=19, cex=0.5, upper.panel=NULL, cex.labels=1, main='Matriz de dispersión', las=1 )

pairs(BASE2[,sapply(BASE2,is.numeric)] %>% filter(HoursWorkedPerWeek<=100),col ="cadetblue" ,
       pch=19, cex=0.5, upper.panel=NULL, cex.labels=1, main='Matriz de dispersión las horas semanales trabajadas <100', las=1 )

corrplot.mixed(cor(BASE2[,sapply(BASE2,is.numeric)]), lower.col = "black", number.cex = .7,
               diag = "u", tl.pos="lt" )

columnas.num=as.vector(which(sapply(BASE2,is.numeric)))

fig_c=c()
for(i in 1:length(columnas.num)){
  BASE_FIGURAS=BASE2[,sapply(BASE2,is.numeric)]# %>% filter(HoursWorkedPerWeek<=100)
  nombre=names(BASE_FIGURAS[i])
  BASE_FIGURAS=rename(BASE_FIGURAS,i = colnames (BASE_FIGURAS) [i])
fig_c[i] <- BASE_FIGURAS %>% plot_ly(
    y = ~i,
    type = 'violin',
    box = list(
      visible = T
    ),
    meanline = list(
      visible = T
    ),
    x0 = nombre
  ) %>%
  layout(
    yaxis = list(
      title = "",
      zeroline = F

    )
  )

}
fig_c=Filter(Negate(is.null), fig_c)
subplot(fig_c, nrows = round(length(fig_c)/2,0))%>% 
  layout( title = list(text = "gráfico de violin"))
columnas.char=as.vector(which(sapply(BASE2,is.numeric)))


fig_c=c()
for(i in columnas.char){
    BASE_FIGURAS=BASE2
  BASE_FIGURAS=rename(BASE_FIGURAS,i = colnames (BASE_FIGURAS) [i])
  BASE_fig=BASE_FIGURAS %>% group_by(A.OCURRENCIA) %>% summarise(promedio= round(mean(i, na.rm = T),0))
fig_c[i]= plot_ly(BASE_fig,
             x =~A.OCURRENCIA, y =~promedio, type = 'scatter', mode = 'lines',
             name = names(BASE2)[i],
             line = list(color = colores[i], width = 2), text =~promedio, textposition = 'auto' ) %>%
  layout(title=names(BASE_FIGURAS)[i], yaxis = list(title = 'Conteo'),  barmode = 'group')
}

fig_c=Filter(Negate(is.null), fig_c)
subplot(fig_c, nrows = round(length(fig_c)/2,0))%>% 
  layout( title = list(text = "Promedio por años de ocurrencia"))
columnas.char=as.vector(which(sapply(BASE2,is.numeric)))
columnas.char=columnas.char[-1]

fig_c=c()
for(i in columnas.char){
    BASE_FIGURAS=BASE2
  BASE_FIGURAS=rename(BASE_FIGURAS,i = colnames (BASE_FIGURAS) [i])
  BASE_fig=BASE_FIGURAS %>% group_by(Age) %>% summarise(promedio= round(mean(i, na.rm = T),0))
fig_c[i]= plot_ly(BASE_fig,
             x =~Age, y =~promedio, type = 'scatter', mode = 'lines',
             name = names(BASE2)[i],
             line = list(color = colores[i], width = 2), text =~promedio, textposition = 'auto' ) %>%
  layout(title=names(BASE_FIGURAS)[i], yaxis = list(title = 'Conteo'),  barmode = 'group')
}

fig_c=Filter(Negate(is.null), fig_c)
subplot(fig_c, nrows = round(length(fig_c)/2,0))%>% 
  layout( title = list(text = "Promedio por edad"))
columnas.char=as.vector(which(sapply(BASE2,is.numeric)))
columnas.char=columnas.char[-1]

fig_c=c()
for(i in columnas.char){
    BASE_FIGURAS=BASE2
  BASE_FIGURAS=rename(BASE_FIGURAS,i = colnames (BASE_FIGURAS) [i])
  BASE_fig=BASE_FIGURAS %>% group_by(M.OCURRENCIA) %>% summarise(promedio= round(mean(i, na.rm = T),0))
fig_c[i]= plot_ly(BASE_fig,
             x =~M.OCURRENCIA, y =~promedio, type = 'scatter', mode = 'lines',
             name = names(BASE2)[i],
             line = list(color = colores[i], width = 2), text =~promedio, textposition = 'auto' ) %>%
  layout(title=names(BASE_FIGURAS)[i], yaxis = list(title = 'Conteo'),  barmode = 'group')
}

fig_c=Filter(Negate(is.null), fig_c)
subplot(fig_c, nrows = round(length(fig_c)/2,0))%>% 
  layout( title = list(text = "Promedio por meses de ocurrencia"))
BASE_FIGURAS=BASE2[,sapply(BASE2,is.numeric)] %>% select(-c("UltimateIncurredClaimCost"))

fig_c=c()
for(i in 1:ncol(BASE_FIGURAS)){
    BASE_fig=BASE2 %>% group_by(BASE_FIGURAS[i]) %>% summarise(promedio= round(mean(UltimateIncurredClaimCost),0))
  BASE_fig=rename(BASE_fig,i = colnames (BASE_fig) [1])
fig_c[i]= plot_ly(BASE_fig,
             x =~i, y =~promedio, type = 'scatter', mode = 'lines',
             name = names(BASE_FIGURAS)[i],
             line = list(color = colores[length(fig_c)+6], width = 2), text =~promedio, textposition = 'auto' ) %>%
  layout(title=names(BASE_FIGURAS)[i])
}

subplot(fig_c, nrows = round(length(fig_c)/2,0))%>% 
  layout( title = list(text = "Costo medio de las reclamaciones"))
BASE_FIGURAS=BASE2[,!sapply(BASE2,is.numeric)]

fig_c=c()
for(i in 1:ncol(BASE_FIGURAS)){
  BASE_fig=as.data.frame(table(BASE_FIGURAS[,i]))
  BASE_fig=rename(BASE_fig,i = colnames (BASE_fig) [1])
fig_c[i]= plot_ly(BASE_fig,
             x =~i, y =~Freq, type = 'bar', name = names(BASE_FIGURAS)[i],
             marker = list(color =colores), text =~Freq, textposition = 'auto' ) %>%
  layout(title=names(BASE_FIGURAS)[i], yaxis = list(title = 'Conteo'),  barmode = 'group')
}

subplot(fig_c, nrows = round(length(fig_c)/3,0))%>% 
  layout( title = list(text = "Gráfico de frecuencias"))
BASE_FIGURAS=BASE2[,!sapply(BASE2,is.numeric)]
BASE_FIGURAS=BASE_FIGURAS[,-c(ncol(BASE_FIGURAS),ncol(BASE_FIGURAS)-1)]


fig_c=c()
for(i in 1:ncol(BASE_FIGURAS)){
  BASE_fig=BASE2 %>% group_by(BASE_FIGURAS[i]) %>% summarise(promedio= round(mean(UltimateIncurredClaimCost),0))
  BASE_fig=rename(BASE_fig,i = colnames (BASE_fig) [1])
fig_c[i]= plot_ly(BASE_fig,
             x =~i, y =~promedio, type = 'bar', name = names(BASE_FIGURAS)[i],
             marker = list(color =colores), text =~promedio, textposition = 'auto' ) %>%
  layout(title=names(BASE_FIGURAS)[i], yaxis = list(title = 'Conteo'),  barmode = 'group')
}

BASE_FIGURAS=BASE2[,!sapply(BASE2,is.numeric)]
BASE_FIGURAS=BASE_FIGURAS[,c(ncol(BASE_FIGURAS),ncol(BASE_FIGURAS)-1)]

for(i in 1:ncol(BASE_FIGURAS)){
    BASE_fig=BASE2 %>% group_by(BASE_FIGURAS[i]) %>% summarise(promedio= round(mean(UltimateIncurredClaimCost),0))
  BASE_fig=rename(BASE_fig,i = colnames (BASE_fig) [1])
fig_c[length(fig_c)+1]= plot_ly(BASE_fig,
             x =~i, y =~promedio, type = 'scatter', mode = 'lines',
             name = names(BASE_FIGURAS)[i],
             line = list(color = colores[length(fig_c)+6], width = 2), text =~promedio, textposition = 'auto' ) %>%
  layout(title=names(BASE_FIGURAS)[i])
}




subplot(fig_c, nrows = round(length(fig_c)/3,0))%>% 
  layout( title = list(text = "Costo medio de las reclamaciones"))
BASE_FIGURAS=BASE2[,!sapply(BASE2,is.numeric)]

fig_c=c()
for(i in 1:ncol(BASE_FIGURAS)){
  BASE_fig=BASE2 %>% group_by(BASE_FIGURAS[i]) %>% summarise(promedio= round(mean(WeeklyWages),0))
  BASE_fig=rename(BASE_fig,i = colnames (BASE_fig) [1])
fig_c[i]= plot_ly(BASE_fig,
             x =~i, y =~promedio, type = 'bar', name = names(BASE_FIGURAS)[i],
             marker = list(color =colores), text =~promedio, textposition = 'auto' ) %>%
  layout(title=names(BASE_FIGURAS)[i], yaxis = list(title = 'Conteo'),  barmode = 'group')
}


subplot(fig_c, nrows = round(length(fig_c)/3,0))%>% 
  layout( title = list(text = "Salario promedio semanal"))

BASES PARA ESTUDIO

BASE3=BASE2 %>% select("Age","Gender","MaritalStatus" ,"DependentChildren", "DependentsOther", "WeeklyWages","PartTimeFullTime","HoursWorkedPerWeek" , "DaysWorkedPerWeek","A.OCURRENCIA","M.OCURRENCIA","UltimateIncurredClaimCost", "days_Report" ) %>% filter( HoursWorkedPerWeek<=112, DependentChildren %in% c(0:6) ) %>%
  mutate(A.OCURRENCIA=as.numeric(as.character(A.OCURRENCIA)),
         M.OCURRENCIA=as.numeric(M.OCURRENCIA),
         DaysWorkedPerWeek=as.numeric(DaysWorkedPerWeek), 
         DependentChildren=as.numeric(DependentChildren), 
        DependentsOther=as.numeric(DependentsOther))


BASE3=dummy.data.frame(BASE3,names = c("Gender"), sep=".")
BASE3=dummy.data.frame(BASE3,names = c("MaritalStatus"), sep=".")
BASE3=dummy.data.frame(BASE3,names = c("PartTimeFullTime"), sep="."  )

BASE3=BASE3 %>% select(-c("PartTimeFullTime.P","MaritalStatus.U", "Gender.F"))


columnas.num=as.vector(which(names(BASE3) %in% names(BASE2[,sapply(BASE2,is.numeric)])))

kable(summary(BASE3[,columnas.num]))
Age WeeklyWages HoursWorkedPerWeek UltimateIncurredClaimCost days_Report
Min. :13.00 Min. : 1.0 Min. : 0.00 Min. : 121.9 Min. : 0.00
1st Qu.:23.00 1st Qu.: 200.0 1st Qu.:38.00 1st Qu.: 925.7 1st Qu.: 13.00
Median :32.00 Median : 393.3 Median :38.00 Median : 3372.9 Median : 22.00
Mean :33.79 Mean : 416.4 Mean :37.49 Mean : 10838.7 Mean : 38.63
3rd Qu.:43.00 3rd Qu.: 500.0 3rd Qu.:40.00 3rd Qu.: 8190.0 3rd Qu.: 40.00
Max. :79.00 Max. :7497.0 Max. :93.00 Max. :865770.6 Max. :1088.00

DESCRICIÓN BASE PARA EL ESTUDIO

columnas.num=as.vector(which(names(BASE3) %in% names(BASE2[,sapply(BASE2,is.numeric)])))

fig_c=c()
for(i in 1:length(columnas.num)){
  BASE_FIGURAS=as.data.frame(BASE3[,columnas.num])# %>% filter(HoursWorkedPerWeek<=100)
  nombre=names(BASE_FIGURAS[i])
  BASE_FIGURAS=rename(BASE_FIGURAS,i = colnames (BASE_FIGURAS) [i])
fig_c[i] <-BASE_FIGURAS %>%  plot_ly(
    y = ~i,
    type = 'violin',
    box = list(
      visible = T
    ),
    meanline = list(
      visible = T
    ),
    x0 = nombre
  ) %>%
  layout(
    yaxis = list(
      title = "",
      zeroline = F
    )
  )

}
fig_c=Filter(Negate(is.null), fig_c)
subplot(fig_c, nrows =2)%>% 
  layout( title = list(text = ""))
BASE_FIGURAS=BASE2 %>% select("Age","Gender","MaritalStatus" ,"DependentChildren", "DependentsOther", "WeeklyWages","PartTimeFullTime","HoursWorkedPerWeek" , "DaysWorkedPerWeek","A.OCURRENCIA","M.OCURRENCIA","UltimateIncurredClaimCost", "days_Report" ) %>% filter( HoursWorkedPerWeek<=105, DependentChildren %in% c(0:6) )
BASE_FIGURAS=BASE_FIGURAS[,!sapply(BASE_FIGURAS,is.numeric)]
BASE_FIGURAS=BASE_FIGURAS[,-c(ncol(BASE_FIGURAS),ncol(BASE_FIGURAS)-1)]
fig_c=c()
for(i in 1:ncol(BASE_FIGURAS)){
  BASE_fig=BASE3 %>% group_by(BASE_FIGURAS[i]) %>% summarise(promedio= round(mean(UltimateIncurredClaimCost),0))
  BASE_fig=rename(BASE_fig,i = colnames (BASE_fig) [1])
fig_c[i]= plot_ly(BASE_fig,
             x =~i, y =~promedio, type = 'bar', name = names(BASE_FIGURAS)[i],
             marker = list(color =colores), text =~promedio, textposition = 'auto' ) %>%
  layout(title=names(BASE_FIGURAS)[i], yaxis = list(title = 'Conteo'),  barmode = 'group')
}


BASE_FIGURAS=BASE3 %>% select("A.OCURRENCIA","M.OCURRENCIA")

for(i in 1:ncol(BASE_FIGURAS)){
    BASE_fig=BASE3 %>% group_by(BASE_FIGURAS[i]) %>% summarise(promedio= round(mean(UltimateIncurredClaimCost),0))
  BASE_fig=rename(BASE_fig,i = colnames (BASE_fig) [1])
fig_c[length(fig_c)+1]= plot_ly(BASE_fig,
             x =~i, y =~promedio, type = 'scatter', mode = 'lines',
             name = names(BASE_FIGURAS)[i],
             line = list(color = colores[length(fig_c)+6], width = 2), text =~promedio, textposition = 'auto' ) %>%
  layout(title=names(BASE_FIGURAS)[i])
}


names(BASE3)
##  [1] "Age"                       "Gender.M"                 
##  [3] "MaritalStatus.M"           "MaritalStatus.S"          
##  [5] "DependentChildren"         "DependentsOther"          
##  [7] "WeeklyWages"               "PartTimeFullTime.F"       
##  [9] "HoursWorkedPerWeek"        "DaysWorkedPerWeek"        
## [11] "A.OCURRENCIA"              "M.OCURRENCIA"             
## [13] "UltimateIncurredClaimCost" "days_Report"
subplot(fig_c, nrows = round(length(fig_c)/3,0))%>% 
  layout( title = list(text = "Costo medio de las reclamaciones"))

BASE NORMALIZADA

BASE.Z=BASE3 %>% mutate(WeeklyWages= as.vector(scale(WeeklyWages)),
                        HoursWorkedPerWeek=as.vector(scale(HoursWorkedPerWeek)),
                        DaysWorkedPerWeek=as.vector(scale(DaysWorkedPerWeek)),
                        UltimateIncurredClaimCost=as.vector(scale(UltimateIncurredClaimCost)),
                        days_Report=as.vector(scale(days_Report))
                        )
columnas.num=as.vector(which(names(BASE3) %in% names(BASE2[,sapply(BASE2,is.numeric)])))

kable(summary(BASE.Z[,columnas.num]))
Age WeeklyWages HoursWorkedPerWeek UltimateIncurredClaimCost days_Report
Min. :13.00 Min. :-1.70277 Min. :-5.36174 Min. :-0.37966 Min. :-0.64163
1st Qu.:23.00 1st Qu.:-0.88709 1st Qu.: 0.07275 1st Qu.:-0.35119 1st Qu.:-0.42570
Median :32.00 Median :-0.09477 Median : 0.07275 Median :-0.26449 Median :-0.27621
Mean :33.79 Mean : 0.00000 Mean : 0.00000 Mean : 0.00000 Mean : 0.00000
3rd Qu.:43.00 3rd Qu.: 0.34258 3rd Qu.: 0.35877 3rd Qu.:-0.09384 3rd Qu.: 0.02276
Max. :79.00 Max. :29.02254 Max. : 7.93845 Max. :30.28758 Max. :17.42971

BASE SIN OUTLIERS

OUT=quantile(BASE3$UltimateIncurredClaimCost, 0.75)+1.5*IQR(BASE3$UltimateIncurredClaimCost)
BASE_NOT_OUT=BASE3 %>% filter(UltimateIncurredClaimCost<=OUT)

columnas.num=as.vector(which(names(BASE3) %in% names(BASE2[,sapply(BASE2,is.numeric)])))

kable(summary(BASE_NOT_OUT[,columnas.num]))
Age WeeklyWages HoursWorkedPerWeek UltimateIncurredClaimCost days_Report
Min. :13.00 Min. : 1.0 Min. : 0.00 Min. : 121.9 Min. : 0.00
1st Qu.:23.00 1st Qu.: 200.0 1st Qu.:38.00 1st Qu.: 789.4 1st Qu.: 13.00
Median :31.00 Median : 376.0 Median :38.00 Median : 2424.9 Median : 22.00
Mean :33.16 Mean : 396.8 Mean :37.46 Mean : 4011.1 Mean : 37.43
3rd Qu.:42.00 3rd Qu.: 500.0 3rd Qu.:40.00 3rd Qu.: 6063.7 3rd Qu.: 40.00
Max. :79.00 Max. :7497.0 Max. :93.00 Max. :19082.4 Max. :1088.00

#PARTICIÓN DE BASES

CONJUNTO DE ENTRENAMIENTO Y VALIDACIÓN

set.seed(2021)
entrenamiento=createDataPartition(BASE3$UltimateIncurredClaimCost, p=0.7, list = F)
#Escenario 1 y 2 
BASE_ENTRENAMIENTO3=BASE3[entrenamiento,]
BASE_VAL_3=BASE3[-entrenamiento,]
#Escenario 3 
BASE_ENTRENAMIENTO.Z=BASE.Z[entrenamiento,]
BASE_VAL_Z=BASE.Z[-entrenamiento,]
#Escenario 4
set.seed(2021)
entrenamiento=createDataPartition(BASE_NOT_OUT$UltimateIncurredClaimCost, p=0.7, list = F)
BASE_ENTRENAMIENTO.NOT_OUT=BASE_NOT_OUT[entrenamiento,]
BASE_VAL_NOT_OUT=BASE_NOT_OUT[-entrenamiento,]

MODELOS DE REGRESIÓN

MODELO LINEAL

Escenario 1

mod= lm(UltimateIncurredClaimCost ~., data=BASE_ENTRENAMIENTO3 )

plot(mod)

summary(mod)
## 
## Call:
## lm(formula = UltimateIncurredClaimCost ~ ., data = BASE_ENTRENAMIENTO3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -103030   -9190   -4505     423  859834 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -1.557e+06  7.207e+04 -21.596  < 2e-16 ***
## Age                 1.883e+02  1.734e+01  10.864  < 2e-16 ***
## Gender.M           -1.679e+03  4.341e+02  -3.868  0.00011 ***
## MaritalStatus.M    -3.964e+03  6.338e+02  -6.254 4.06e-10 ***
## MaritalStatus.S    -4.400e+03  6.287e+02  -6.999 2.64e-12 ***
## DependentChildren   1.004e+03  3.531e+02   2.845  0.00444 ** 
## DependentsOther     1.069e+04  1.734e+03   6.163 7.25e-10 ***
## WeeklyWages         1.498e+01  8.266e-01  18.126  < 2e-16 ***
## PartTimeFullTime.F -2.580e+03  8.486e+02  -3.040  0.00236 ** 
## HoursWorkedPerWeek -2.780e+01  3.760e+01  -0.739  0.45966    
## DaysWorkedPerWeek  -1.071e+02  4.638e+02  -0.231  0.81744    
## A.OCURRENCIA        7.768e+02  3.608e+01  21.527  < 2e-16 ***
## M.OCURRENCIA        7.204e+01  5.110e+01   1.410  0.15859    
## days_Report         1.687e+01  2.937e+00   5.745 9.30e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 27670 on 25197 degrees of freedom
## Multiple R-squared:  0.06576,    Adjusted R-squared:  0.06528 
## F-statistic: 136.4 on 13 and 25197 DF,  p-value: < 2.2e-16

Error

pred= as.vector(predict(mod,BASE_VAL_3))
real=as.vector(BASE_VAL_3$UltimateIncurredClaimCost)

#MAE    
mean(abs(pred-real),na.rm = TRUE)
## [1] 11393.7
#R-Cuadrado
(1-(sum((real-pred)^2)/sum((real-mean(real))^2)))*100
## [1] 6.034909

Escenario 2

POCO_SIG=as.vector(which(names(BASE_ENTRENAMIENTO3) %in% c("Age","MaritalStatus.M","MaritalStatus.S","DependentChildren",
                  "DependentsOther", "WeeklyWages", "PartTimeFullTime.F",
                  "A.OCURRENCIA", "days_Report", "UltimateIncurredClaimCost")))

mod2= lm(UltimateIncurredClaimCost ~., data=BASE_ENTRENAMIENTO3[,POCO_SIG])

plot(mod2)

summary(mod2)
## 
## Call:
## lm(formula = UltimateIncurredClaimCost ~ ., data = BASE_ENTRENAMIENTO3[, 
##     POCO_SIG])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -100544   -9186   -4538     433  859561 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -1.565e+06  7.198e+04 -21.737  < 2e-16 ***
## Age                 1.924e+02  1.728e+01  11.135  < 2e-16 ***
## MaritalStatus.M    -3.943e+03  6.336e+02  -6.223 4.95e-10 ***
## MaritalStatus.S    -4.386e+03  6.284e+02  -6.980 3.03e-12 ***
## DependentChildren   9.962e+02  3.531e+02   2.822  0.00478 ** 
## DependentsOther     1.056e+04  1.734e+03   6.087 1.16e-09 ***
## WeeklyWages         1.455e+01  7.929e-01  18.347  < 2e-16 ***
## PartTimeFullTime.F -3.673e+03  6.248e+02  -5.878 4.20e-09 ***
## A.OCURRENCIA        7.803e+02  3.605e+01  21.642  < 2e-16 ***
## days_Report         1.726e+01  2.936e+00   5.877 4.23e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 27680 on 25201 degrees of freedom
## Multiple R-squared:  0.06506,    Adjusted R-squared:  0.06473 
## F-statistic: 194.9 on 9 and 25201 DF,  p-value: < 2.2e-16

Error

pred= as.vector(predict(mod2,BASE_VAL_3[,POCO_SIG]))
real=as.vector(BASE_VAL_3$UltimateIncurredClaimCost)

#MAE
mean(abs(pred-real),na.rm = TRUE)
## [1] 11401.28
#R-Cuadrado
(1-(sum((real-pred)^2)/sum((real-mean(real))^2)))*100
## [1] 5.992989

Escenario 3

mod= lm(UltimateIncurredClaimCost ~., data=BASE_ENTRENAMIENTO.Z )

plot(mod)

summary(mod)
## 
## Call:
## lm(formula = UltimateIncurredClaimCost ~ ., data = BASE_ENTRENAMIENTO.Z)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6500 -0.3256 -0.1596  0.0150 30.4612 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -5.534e+01  2.556e+00 -21.648  < 2e-16 ***
## Age                 6.672e-03  6.142e-04  10.864  < 2e-16 ***
## Gender.M           -5.949e-02  1.538e-02  -3.868  0.00011 ***
## MaritalStatus.M    -1.404e-01  2.245e-02  -6.254 4.06e-10 ***
## MaritalStatus.S    -1.559e-01  2.227e-02  -6.999 2.64e-12 ***
## DependentChildren   3.559e-02  1.251e-02   2.845  0.00444 ** 
## DependentsOther     3.786e-01  6.143e-02   6.163 7.25e-10 ***
## WeeklyWages         1.295e-01  7.145e-03  18.126  < 2e-16 ***
## PartTimeFullTime.F -9.141e-02  3.006e-02  -3.040  0.00236 ** 
## HoursWorkedPerWeek -6.886e-03  9.313e-03  -0.739  0.45966    
## DaysWorkedPerWeek  -2.074e-03  8.985e-03  -0.231  0.81744    
## A.OCURRENCIA        2.752e-02  1.278e-03  21.527  < 2e-16 ***
## M.OCURRENCIA        2.552e-03  1.810e-03   1.410  0.15859    
## days_Report         3.599e-02  6.264e-03   5.745 9.30e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9803 on 25197 degrees of freedom
## Multiple R-squared:  0.06576,    Adjusted R-squared:  0.06528 
## F-statistic: 136.4 on 13 and 25197 DF,  p-value: < 2.2e-16

Error

pred= as.vector(predict(mod,BASE_VAL_Z))
  
estand_pred=pred*sd(BASE3$UltimateIncurredClaimCost)+mean(BASE3$UltimateIncurredClaimCost)
estand_real=BASE_VAL_Z$UltimateIncurredClaimCost*sd(BASE3$UltimateIncurredClaimCost)+
  mean(BASE3$UltimateIncurredClaimCost)

#MAE
mean(abs(estand_pred-estand_real), na.rm = TRUE)
## [1] 11393.7
#R-Cuadrado

(1-(sum((estand_real-estand_pred)^2)/sum((estand_real-mean(estand_real))^2)))*100
## [1] 6.034909

Escenario 4

mod= lm(UltimateIncurredClaimCost ~., data=BASE_ENTRENAMIENTO.NOT_OUT)

plot(mod)

summary(mod)
## 
## Call:
## lm(formula = UltimateIncurredClaimCost ~ ., data = BASE_ENTRENAMIENTO.NOT_OUT)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -29015  -2705  -1229   2026  16480 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -1.982e+05  1.085e+04 -18.261  < 2e-16 ***
## Age                 3.527e+01  2.615e+00  13.489  < 2e-16 ***
## Gender.M           -7.310e+02  6.542e+01 -11.174  < 2e-16 ***
## MaritalStatus.M    -1.052e+03  9.783e+01 -10.752  < 2e-16 ***
## MaritalStatus.S    -1.338e+03  9.654e+01 -13.861  < 2e-16 ***
## DependentChildren   1.087e+01  5.378e+01   0.202  0.83986    
## DependentsOther     8.105e+02  2.668e+02   3.037  0.00239 ** 
## WeeklyWages         3.882e+00  1.238e-01  31.362  < 2e-16 ***
## PartTimeFullTime.F -6.519e+02  1.302e+02  -5.008 5.53e-07 ***
## HoursWorkedPerWeek -1.111e+01  5.820e+00  -1.908  0.05637 .  
## DaysWorkedPerWeek   7.490e+01  7.077e+01   1.058  0.28989    
## A.OCURRENCIA        1.007e+02  5.433e+00  18.530  < 2e-16 ***
## M.OCURRENCIA       -1.281e+00  7.638e+00  -0.168  0.86679    
## days_Report         1.575e+00  4.791e-01   3.289  0.00101 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3862 on 22042 degrees of freedom
## Multiple R-squared:  0.1141, Adjusted R-squared:  0.1136 
## F-statistic: 218.4 on 13 and 22042 DF,  p-value: < 2.2e-16

Error

pred= as.vector(predict(mod,BASE_VAL_NOT_OUT))
real=BASE_VAL_NOT_OUT$UltimateIncurredClaimCost

#MAE
mean(abs(pred-real), na.rm = TRUE)
## [1] 3000.209
#R-Cuadrado

(1-(sum((real-pred)^2)/sum((real-mean(real))^2)))*100
## [1] 11.91927

ARBOLES DE REGRESIÓN

Escenario 1

mod_tree=rpart(UltimateIncurredClaimCost ~., data=BASE_ENTRENAMIENTO3)

prp(mod_tree, type = 2,nn = T, fallen.leaves = T,
    faclen = 4, varlen = 8, shadow.col = "gray")

mod_tree
## n= 25211 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 25211 2.065161e+13 10910.540  
##   2) A.OCURRENCIA< 1999.5 16822 7.045954e+12  6975.721 *
##   3) A.OCURRENCIA>=1999.5 8389 1.282294e+13 18800.810  
##     6) WeeklyWages< 523.075 5386 5.077628e+12 13266.120 *
##     7) WeeklyWages>=523.075 3003 7.284409e+12 28727.490 *
mod_tree$cptable
##           CP nsplit rel error    xerror       xstd
## 1 0.03790115      0 1.0000000 1.0001181 0.07211989
## 2 0.02231791      1 0.9620988 0.9625563 0.07095114
## 3 0.01000000      2 0.9397809 0.9417381 0.07045817

Error

pred_tree=as.vector(predict(mod_tree,BASE_VAL_3))
real=BASE_VAL_3$UltimateIncurredClaimCost

#MAE
mean(abs(pred_tree-real),na.rm = TRUE)
## [1] 11264.64
#R-Cuadrado
(1-(sum((real-pred_tree)^2)/sum((real-mean(real))^2)))*100
## [1] 6.157961

Escenario 2

mod_tree=rpart(UltimateIncurredClaimCost ~., data=BASE_ENTRENAMIENTO3[,POCO_SIG])

prp(mod_tree, type = 2,nn = T, fallen.leaves = T,
    faclen = 4, varlen = 8, shadow.col = "gray")

mod_tree$cptable
##           CP nsplit rel error    xerror       xstd
## 1 0.03790115      0 1.0000000 1.0000951 0.07212042
## 2 0.02231791      1 0.9620988 0.9622218 0.07093867
## 3 0.01000000      2 0.9397809 0.9420644 0.07044865

Error

pred_tree=as.vector(predict(mod_tree,BASE_VAL_3[,POCO_SIG]))
real=BASE_VAL_3$UltimateIncurredClaimCost 

#MAE
mean(abs(pred_tree-real),na.rm = TRUE)
## [1] 11264.64
#R-Cuadrado

(1-(sum((real-pred_tree)^2)/sum((real-mean(real))^2)))*100
## [1] 6.157961

Escenario 3

mod_tree=rpart(UltimateIncurredClaimCost ~., data=BASE_ENTRENAMIENTO.Z)

prp(mod_tree, type = 2,nn = T, fallen.leaves = T,
    faclen = 4, varlen = 8, shadow.col = "gray")

mod_tree$cptable
##           CP nsplit rel error    xerror       xstd
## 1 0.03790115      0 1.0000000 1.0000794 0.07211553
## 2 0.02231791      1 0.9620988 0.9623013 0.07094538
## 3 0.01000000      2 0.9397809 0.9417319 0.07044848

Error

pred_tree=as.vector(predict(mod_tree,BASE_VAL_Z))

estand_pred=pred_tree*sd(BASE3$UltimateIncurredClaimCost)+mean(BASE3$UltimateIncurredClaimCost)
estand_real=BASE_VAL_Z$UltimateIncurredClaimCost*sd(BASE3$UltimateIncurredClaimCost)+mean(BASE3$UltimateIncurredClaimCost)
#MAE
mean(abs(estand_pred-estand_real), na.rm = TRUE)
## [1] 11264.64
#R-Cuadrado

(1-(sum((estand_real-estand_pred)^2)/sum((estand_real-mean(estand_real))^2)))*100
## [1] 6.157961

Escenario 4

mod_tree=rpart(UltimateIncurredClaimCost ~., data=BASE_ENTRENAMIENTO.NOT_OUT)

prp(mod_tree, type = 2,nn = T, fallen.leaves = T,
    faclen = 4, varlen = 8, shadow.col = "gray")

mod_tree$cptable
##           CP nsplit rel error    xerror       xstd
## 1 0.06507587      0 1.0000000 1.0000816 0.01284651
## 2 0.01666612      1 0.9349241 0.9363668 0.01220443
## 3 0.01565740      2 0.9182580 0.9255196 0.01212229
## 4 0.01103526      3 0.9026006 0.9041626 0.01192961
## 5 0.01000000      5 0.8805301 0.8864486 0.01169398

Error

pred_tree=as.vector(predict(mod_tree,BASE_VAL_NOT_OUT))
real=BASE_VAL_NOT_OUT$UltimateIncurredClaimCost
# MAE
mean(abs(pred_tree-real), na.rm = TRUE)
## [1] 3012.907
#R-Cuadrado

(1-(sum((real-pred_tree)^2)/sum((real-mean(real))^2)))*100
## [1] 10.82735

REDES NEURONALES

Escenario 1

set.seed(2021)

mod_Neuronal= nnet(UltimateIncurredClaimCost~., data=BASE_ENTRENAMIENTO.NEURONAL,
                   size=14, decay=0.1, maxit=10000, linout=T)
source_url("https://gist.githubusercontent.com/fawda123/7471137/raw/466c1474d0a505ff044412703516c34f1a4684a5/nnet_plot_update.r")
plot(mod_Neuronal, max.sp = T)

Para que el modelo de red neuronal converja necesita 1.040 ieraciones,

Error

maximo=max(BASE3$UltimateIncurredClaimCost)
pred_NEU= as.vector(predict(mod_Neuronal, BASE_VAL_MEURONAL))*maximo
real=BASE_VAL_MEURONAL$UltimateIncurredClaimCost*maximo

#MAE
mean(abs(pred_NEU-real) ,na.rm = TRUE)
## [1] 11298.16
#R-Cuadrado
(1-(sum((real-pred_NEU)^2)/sum((real-mean(real))^2)))*100
## [1] 5.495078

Escenario 2

set.seed(2021)
mod_Neuronal= nnet(UltimateIncurredClaimCost/maximo ~., data=BASE_ENTRENAMIENTO3[,POCO_SIG],
                   size=6, decay=0.1, maxit=10000, linout=T)
source_url("https://gist.githubusercontent.com/fawda123/7471137/raw/466c1474d0a505ff044412703516c34f1a4684a5/nnet_plot_update.r")
plot(mod_Neuronal, max.sp = T)

Error

pred_NEU= as.vector(predict(mod_Neuronal, BASE_VAL_3[,POCO_SIG]))*maximo
real=BASE_VAL_3$UltimateIncurredClaimCost

#MAE
mean(abs(pred_NEU-real),na.rm = TRUE)
## [1] 11364.42
#R-Cuadrado
(1-(sum((real-pred_NEU)^2)/sum((real-mean(real))^2)))*100
## [1] 5.29271

Escenario 3

set.seed(2021)
mod_Neuronal= nnet(UltimateIncurredClaimCost ~., data=BASE_ENTRENAMIENTO.Z,
                   size=6, decay=0.1, maxit=1000, linout=T)

Error

pred_NEU= as.vector(predict(mod_Neuronal, BASE_VAL_Z))

estand_pred=pred_NEU*sd(BASE3$UltimateIncurredClaimCost)+mean(BASE3$UltimateIncurredClaimCost)
estand_real=BASE_VAL_Z$UltimateIncurredClaimCost*sd(BASE3$UltimateIncurredClaimCost)+mean(BASE3$UltimateIncurredClaimCost)

#MAE
mean(abs(estand_pred-estand_real), na.rm = TRUE)
## [1] 11247.53
#R-Cuadrado

(1-(sum((estand_real-estand_pred)^2)/sum((estand_real-mean(estand_real))^2)))*100
## [1] 5.485392
plot(mod_Neuronal, max.sp = T)

Escenario 4

set.seed(2021)
mod_Neuronal= nnet(UltimateIncurredClaimCost/maximo ~., data=BASE_ENTRENAMIENTO.NOT_OUT,
                   size=6, decay=0.1, maxit=10000, linout=T)
plot(mod_Neuronal, max.sp = T)

Error

pred_NEU= as.vector(predict(mod_Neuronal, BASE_VAL_NOT_OUT))*maximo
real= BASE_VAL_NOT_OUT$UltimateIncurredClaimCost

#MAE
mean(abs(pred_NEU-real), na.rm = TRUE)
## [1] 2974.487
#R-Cuadrado

(1-(sum((real-pred_NEU)^2)/sum((real-mean(real))^2)))*100
## [1] 11.76277

BAGGING

Escenario 1

set.seed(2021)
baggig.fit= bagging(UltimateIncurredClaimCost ~., data=BASE_ENTRENAMIENTO3)


baggig.fit
## 
## Bagging regression trees with 25 bootstrap replications 
## 
## Call: bagging.data.frame(formula = UltimateIncurredClaimCost ~ ., data = BASE_ENTRENAMIENTO3)

Error

pred_bagging=as.vector(predict(baggig.fit,BASE_VAL_3))
real=BASE_VAL_3$UltimateIncurredClaimCost

#MAE
mean(abs(pred_bagging-real), na.rm = TRUE)
## [1] 11234.32
#R-Cuadrado
(1-(sum((real-pred_bagging)^2)/sum((real-mean(real))^2)))*100
## [1] 6.412791
#calculate variable importance
VI <- data.frame(var= row.names(varImp(baggig.fit)), imp=varImp(baggig.fit))

#sort variable importance descending
VI_plot <- VI[order(VI$Overall, decreasing=TRUE),]
kable(VI_plot)
var Overall
WeeklyWages WeeklyWages 0.0994027
A.OCURRENCIA A.OCURRENCIA 0.0495111
Age Age 0.0441905
days_Report days_Report 0.0230375
MaritalStatus.S MaritalStatus.S 0.0187173
DependentsOther DependentsOther 0.0081145
HoursWorkedPerWeek HoursWorkedPerWeek 0.0062675
M.OCURRENCIA M.OCURRENCIA 0.0036592
DependentChildren DependentChildren 0.0027026
MaritalStatus.M MaritalStatus.M 0.0022443
DaysWorkedPerWeek DaysWorkedPerWeek 0.0008063
Gender.M Gender.M 0.0003364
PartTimeFullTime.F PartTimeFullTime.F 0.0002007

Escenario 2

set.seed(2021)

baggig.fit= bagging(UltimateIncurredClaimCost ~., data=BASE_ENTRENAMIENTO3[,POCO_SIG])

baggig.fit
## 
## Bagging regression trees with 25 bootstrap replications 
## 
## Call: bagging.data.frame(formula = UltimateIncurredClaimCost ~ ., data = BASE_ENTRENAMIENTO3[, 
##     POCO_SIG])

Error

pred_bagging=as.vector(predict(baggig.fit,BASE_VAL_3[,POCO_SIG]))
real=BASE_VAL_3$UltimateIncurredClaimCost

#MAE
mean(abs(pred_bagging-real), na.rm = TRUE)
## [1] 11234.34
#R-Cuadrado
(1-(sum((real-pred_bagging)^2)/sum((real-mean(real))^2)))*100
## [1] 6.376642
#calculate variable importance
VI <- data.frame(var= row.names(varImp(baggig.fit)), imp=varImp(baggig.fit))

#sort variable importance descending
VI_plot <- VI[order(VI$Overall, decreasing=TRUE),]
kable(VI_plot)
var Overall
WeeklyWages WeeklyWages 0.1110390
A.OCURRENCIA A.OCURRENCIA 0.0497348
Age Age 0.0465381
days_Report days_Report 0.0222232
MaritalStatus.S MaritalStatus.S 0.0197420
DependentsOther DependentsOther 0.0086510
DependentChildren DependentChildren 0.0030683
MaritalStatus.M MaritalStatus.M 0.0012196
PartTimeFullTime.F PartTimeFullTime.F 0.0004726

Escenario 3

set.seed(2021)
baggig.fit= bagging(UltimateIncurredClaimCost ~., data=BASE_ENTRENAMIENTO.Z)

baggig.fit
## 
## Bagging regression trees with 25 bootstrap replications 
## 
## Call: bagging.data.frame(formula = UltimateIncurredClaimCost ~ ., data = BASE_ENTRENAMIENTO.Z)

Error

pred_bagging=as.vector(predict(baggig.fit,BASE_VAL_Z))
estand_pred=pred_bagging*sd(BASE3$UltimateIncurredClaimCost)+mean(BASE3$UltimateIncurredClaimCost)
estand_real=BASE_VAL_Z$UltimateIncurredClaimCost*sd(BASE3$UltimateIncurredClaimCost)+mean(BASE3$UltimateIncurredClaimCost)

#MAE
mean(abs(estand_pred-estand_real), na.rm = TRUE)
## [1] 11234.32
#R-Cuadrado

(1-(sum((estand_real-estand_pred)^2)/sum((estand_real-mean(estand_real))^2)))*100
## [1] 6.412791
#calculate variable importance
VI <- data.frame(var= row.names(varImp(baggig.fit)), imp=varImp(baggig.fit))

#sort variable importance descending
VI_plot <- VI[order(VI$Overall, decreasing=TRUE),]
kable(VI_plot)
var Overall
WeeklyWages WeeklyWages 0.0994027
A.OCURRENCIA A.OCURRENCIA 0.0495111
Age Age 0.0441905
days_Report days_Report 0.0230375
MaritalStatus.S MaritalStatus.S 0.0187173
DependentsOther DependentsOther 0.0081145
HoursWorkedPerWeek HoursWorkedPerWeek 0.0062675
M.OCURRENCIA M.OCURRENCIA 0.0036592
DependentChildren DependentChildren 0.0027026
MaritalStatus.M MaritalStatus.M 0.0022443
DaysWorkedPerWeek DaysWorkedPerWeek 0.0008063
Gender.M Gender.M 0.0003364
PartTimeFullTime.F PartTimeFullTime.F 0.0002007

Escenario 4

set.seed(2021)
baggig.fit= bagging(UltimateIncurredClaimCost ~., data=BASE_ENTRENAMIENTO.NOT_OUT)

baggig.fit
## 
## Bagging regression trees with 25 bootstrap replications 
## 
## Call: bagging.data.frame(formula = UltimateIncurredClaimCost ~ ., data = BASE_ENTRENAMIENTO.NOT_OUT)

Error

pred_bagging=as.vector(predict(baggig.fit,BASE_VAL_NOT_OUT))
real=BASE_VAL_NOT_OUT$UltimateIncurredClaimCost
#MAE

mean(abs(pred_bagging-real), na.rm = TRUE)
## [1] 3010.051
#R-Cuadrado
(1-(sum((real-pred_bagging)^2)/sum((real-mean(real))^2)))*100
## [1] 11.23597
#calculate variable importance
VI <- data.frame(var= row.names(varImp(baggig.fit)), imp=varImp(baggig.fit))

#sort variable importance descending
VI_plot <- VI[order(VI$Overall, decreasing=TRUE),]
kable(VI_plot)
var Overall
WeeklyWages WeeklyWages 0.1555472
A.OCURRENCIA A.OCURRENCIA 0.1410475
Age Age 0.0798281
MaritalStatus.S MaritalStatus.S 0.0494984
Gender.M Gender.M 0.0401547
HoursWorkedPerWeek HoursWorkedPerWeek 0.0202507
days_Report days_Report 0.0101590
PartTimeFullTime.F PartTimeFullTime.F 0.0044432
MaritalStatus.M MaritalStatus.M 0.0003162
DaysWorkedPerWeek DaysWorkedPerWeek 0.0000000
DependentChildren DependentChildren 0.0000000
DependentsOther DependentsOther 0.0000000
M.OCURRENCIA M.OCURRENCIA 0.0000000

BOOSTING : Gradient Boost

Escenario 1

set.seed(2021)
gbm.fit= gbm(UltimateIncurredClaimCost ~., data=BASE_ENTRENAMIENTO3)
## Distribution not specified, assuming gaussian ...
gbm.fit
## gbm(formula = UltimateIncurredClaimCost ~ ., data = BASE_ENTRENAMIENTO3)
## A gradient boosted model with gaussian loss function.
## 100 iterations were performed.
## There were 13 predictors of which 10 had non-zero influence.

Error

pred_gbm=as.vector(predict(gbm.fit,BASE_VAL_3))
real=BASE_VAL_3$UltimateIncurredClaimCost
#MAE
mean(abs(pred_gbm-real), na.rm = TRUE)
## [1] 11096.3
#R-Cuadrado
(1-(sum((real-pred_gbm)^2)/sum((real-mean(real))^2)))*100
## [1] 6.994316
summary(
  gbm.fit, 
  cBars = 13,
  method = relative.influence, # also can use permutation.test.gbm
  las = 2
  )

##                                   var    rel.inf
## A.OCURRENCIA             A.OCURRENCIA 38.3164846
## WeeklyWages               WeeklyWages 36.6781550
## Age                               Age 10.8373733
## days_Report               days_Report  6.7477085
## DependentsOther       DependentsOther  3.8643538
## Gender.M                     Gender.M  0.8902729
## DependentChildren   DependentChildren  0.7458376
## MaritalStatus.S       MaritalStatus.S  0.7328978
## PartTimeFullTime.F PartTimeFullTime.F  0.6358794
## HoursWorkedPerWeek HoursWorkedPerWeek  0.5510370
## MaritalStatus.M       MaritalStatus.M  0.0000000
## DaysWorkedPerWeek   DaysWorkedPerWeek  0.0000000
## M.OCURRENCIA             M.OCURRENCIA  0.0000000

Escenario 2

set.seed(2021)
gbm.fit= gbm(UltimateIncurredClaimCost ~., data=BASE_ENTRENAMIENTO3[,POCO_SIG])
## Distribution not specified, assuming gaussian ...
gbm.fit
## gbm(formula = UltimateIncurredClaimCost ~ ., data = BASE_ENTRENAMIENTO3[, 
##     POCO_SIG])
## A gradient boosted model with gaussian loss function.
## 100 iterations were performed.
## There were 9 predictors of which 8 had non-zero influence.

Error

pred_gbm=as.vector(predict(gbm.fit,BASE_VAL_3[,POCO_SIG]))
real=BASE_VAL_3$UltimateIncurredClaimCost


#MAE
mean(abs(pred_gbm-real), na.rm = TRUE)
## [1] 11102.62
#R_cuadrado

(1-(sum((real-pred_gbm)^2)/sum((real-mean(real))^2)))*100
## [1] 6.974192
summary(
  gbm.fit, 
  cBars = 13,
  method = relative.influence, # also can use permutation.test.gbm
  las = 2
  )

##                                   var    rel.inf
## A.OCURRENCIA             A.OCURRENCIA 38.7258381
## WeeklyWages               WeeklyWages 36.8715889
## Age                               Age 10.8700136
## days_Report               days_Report  6.8378804
## DependentsOther       DependentsOther  3.8749683
## PartTimeFullTime.F PartTimeFullTime.F  1.1490445
## MaritalStatus.S       MaritalStatus.S  0.9224584
## DependentChildren   DependentChildren  0.7482078
## MaritalStatus.M       MaritalStatus.M  0.0000000

Escenario 3

set.seed(2021)
gbm.fit= gbm(UltimateIncurredClaimCost ~., data=BASE_ENTRENAMIENTO.Z)
## Distribution not specified, assuming gaussian ...
gbm.fit
## gbm(formula = UltimateIncurredClaimCost ~ ., data = BASE_ENTRENAMIENTO.Z)
## A gradient boosted model with gaussian loss function.
## 100 iterations were performed.
## There were 13 predictors of which 10 had non-zero influence.

Error

pred_gbm=as.vector(predict(gbm.fit,BASE_VAL_Z))

estand_pred=pred_gbm*sd(BASE3$UltimateIncurredClaimCost)+mean(BASE3$UltimateIncurredClaimCost)
estand_real=BASE_VAL_Z$UltimateIncurredClaimCost*sd(BASE3$UltimateIncurredClaimCost)+mean(BASE3$UltimateIncurredClaimCost)

#MAE
mean(abs(estand_pred-estand_real), na.rm = TRUE)
## [1] 11096.3
# R-Cuadrado
(1-(sum((estand_real-estand_pred)^2)/sum((estand_real-mean(estand_real))^2)))*100
## [1] 6.994316
summary(
  gbm.fit, 
  cBars = 13,
  method = relative.influence, # also can use permutation.test.gbm
  las = 2
  )

##                                   var    rel.inf
## A.OCURRENCIA             A.OCURRENCIA 38.3164846
## WeeklyWages               WeeklyWages 36.6781550
## Age                               Age 10.8373733
## days_Report               days_Report  6.7477085
## DependentsOther       DependentsOther  3.8643538
## Gender.M                     Gender.M  0.8902729
## DependentChildren   DependentChildren  0.7458376
## MaritalStatus.S       MaritalStatus.S  0.7328978
## PartTimeFullTime.F PartTimeFullTime.F  0.6358794
## HoursWorkedPerWeek HoursWorkedPerWeek  0.5510370
## MaritalStatus.M       MaritalStatus.M  0.0000000
## DaysWorkedPerWeek   DaysWorkedPerWeek  0.0000000
## M.OCURRENCIA             M.OCURRENCIA  0.0000000

Escenario 4

set.seed(2021)
gbm.fit= gbm(UltimateIncurredClaimCost ~., data=BASE_ENTRENAMIENTO.NOT_OUT)
## Distribution not specified, assuming gaussian ...
gbm.fit
## gbm(formula = UltimateIncurredClaimCost ~ ., data = BASE_ENTRENAMIENTO.NOT_OUT)
## A gradient boosted model with gaussian loss function.
## 100 iterations were performed.
## There were 13 predictors of which 7 had non-zero influence.

Error

pred_gbm=as.vector(predict(gbm.fit,BASE_VAL_NOT_OUT))
real=BASE_VAL_NOT_OUT$UltimateIncurredClaimCost

#MAE
mean(abs(pred_gbm-real), na.rm = TRUE)
## [1] 2965.066
#R_cuadrado
(1-(sum((real-pred_gbm)^2)/sum((real-mean(real))^2)))*100
## [1] 13.61934
summary(
  gbm.fit, 
  cBars = 13,
  method = relative.influence, # also can use permutation.test.gbm
  las = 2
  )

##                                   var   rel.inf
## WeeklyWages               WeeklyWages 56.806250
## A.OCURRENCIA             A.OCURRENCIA 17.789311
## Age                               Age 11.157837
## Gender.M                     Gender.M  4.074334
## HoursWorkedPerWeek HoursWorkedPerWeek  3.999910
## days_Report               days_Report  3.506309
## MaritalStatus.S       MaritalStatus.S  2.666050
## MaritalStatus.M       MaritalStatus.M  0.000000
## DependentChildren   DependentChildren  0.000000
## DependentsOther       DependentsOther  0.000000
## PartTimeFullTime.F PartTimeFullTime.F  0.000000
## DaysWorkedPerWeek   DaysWorkedPerWeek  0.000000
## M.OCURRENCIA             M.OCURRENCIA  0.000000