Tweedie Distribution & the Generalized Linear Model

Tweedie distributions are a special case of exponential dispersion models and are particularly useful in generalized linear models, as in fitting claims data to statistical distributions.

We use exponential dispersion models (and particularly the Tweedie distribution) for pure premium approaches for actuarial estimations. There are particular cases where the Tweedie compound Poisson distribution is suitable and appropriate for a given regression.

See here for a useful overview on using a Tweedie GLM.

Fitting regression models to insurance loss data has always been problematic. The problem is particularly acute for data from individual insurance policies where most of the losses are zero, and for those policies with a positive loss, the losses are highly skewed. Most of the traditional regression models do not deal with a mixture of discrete losses of zero and continuous positive losses. One way of dealing with this problem is to fit separate models to the frequency and severity, and estimate the pure premium by multiplying the result of each model. One can take issue with assumption of “separate” models.

Loosely speaking, my current understanding of the Tweedie convergence theorem gives that any exponential dispersion model that exhibits a "variance-to-mean" power law can be modeled with the Tweedie distribution.

Sometimes, a regresson analysis on a Poisson-modeled parameter p is desired but only results in an overdispersed Poisson distribution, where positive probability loss severities are strictly clustered around the discrete claim counts {0, 1, 2, ...}.

However, with the Tweedie distribution for regression modeling, we proceed generally similarly as we would for a "log-linked" GLM, with two compound Poisson distributed parameters.

To do so, if the input data involves only losses from individual claims, then the corresponding output from the model is valid; however, an actuary must make sure to not make inappropriate assumptions (such as assuming losses involved in multiple claims will not distort the output of such a regression).

Provided by the CAS article, here is some code for the Tweedie models exhibited:

Models 1 & 2

library(statmod)
library(tweedie)
theta=20
alpha=.5
lambda=2
set.seed(12345)
#
# Smythe/Jorgensen parameterization of the tweedie
#
tau=alpha*theta
p=(alpha+2)/(alpha+1)
mu=lambda*tau
phi=lambda^(1-p)*tau^(2-p)/(2-p)
#
# simulate compound poisson distribution
#
n=10000
count=rpois(n,lambda)
loss=rep(0,n)
nzloss=count>0
for (i in (1:n)[nzloss]){
  loss[i]=sum(rgamma(count[i],shape=alpha,scale=theta))
  }
#                          
# get the density of the tweedie over the range of loss
#
xmax=5*alpha*theta*lambda
x=seq(0,xmax,by=xmax/100)
tweedie.den=dtweedie(x,p,mu,phi)
in.hist=loss<=xmax
#
# compare graphically the compound poisson and the tweedie
#
lam=lambda
denmax=max(tweedie.den)
hist(loss[in.hist],breaks=x,freq=F,xlab="Loss Amount",ylim=c(0,denmax),
     main="Figure 1 - Compound Poisson/Tweedie Comparison")
text(x=50,y=.08,labels=expression(paste(lambda == 2,", ",alpha == 0.5,", ",
      theta == 20,", ",p == 1.667)),cex=1.25)
par(new=T)
plot(x,tweedie.den,xlab="",ylab="",main="",type="l",lwd=3,col="blue",
     ylim=c(0,denmax))
#
#
theta=.2
alpha=49
lambda=2
set.seed(12345)
#
# Smythe/Jorgensen parameterization of the tweedie
#
tau=alpha*theta
p=(alpha+2)/(alpha+1)
mu=lambda*tau
phi=lambda^(1-p)*tau^(2-p)/(2-p)
#
# simulate compound poisson distribution
#
n=10000
count=rpois(n,lambda)
loss=rep(0,n)
nzloss=count>0
for (i in (1:n)[nzloss]){
  loss[i]=sum(rgamma(count[i],shape=alpha,scale=theta))
  }
#                          
# get the density of the tweedie over the range of loss
#
xmax=5*alpha*theta*lambda
x=seq(0,xmax,by=xmax/100)
tweedie.den=dtweedie(x,p,mu,phi)
in.hist=loss<=xmax
#
# compare graphically the compound poisson and the tweedie
#
lam=lambda
denmax=max(tweedie.den)
hist(loss[in.hist],breaks=x,freq=F,xlab="Loss Amount",ylim=c(0,denmax),
     main="Figure 2 - Compound Poisson/Tweedie Comparison")
text(x=50,y=.08,labels=expression(paste(lambda == 2,", ",alpha == 49,", ",
      theta == .2,", ",p == 1.02)),cex=1.25)
par(new=T)
plot(x,tweedie.den,xlab="",ylab="",main="",type="l",lwd=3,col="blue",
     ylim=c(0,denmax))
#
#

Models 3 & 4

library(statmod)
library(tweedie)
set.seed(12345)
n=50000
s=sample(1:n,500)
x1=log(runif(n,min=.5,max=1.5))
x2=log(runif(n,min=.5,max=1.5))

real.s0=log(10)
real.s1=1
real.s2=.25
theta=exp(real.s0+real.s1*x1+real.s2*x2)
alpha=.5
real.f0=log(.05)
real.f1=.25
real.f2=1
lambda=exp(real.f0+real.f1*x1+real.f2*x2)
#
# wurthrich parameterization of the tweedie
#
tau=alpha*theta      
p=(alpha+2)/(alpha+1)
mu=lambda*tau
phi=lambda^(1-p)*tau^(2-p)/(2-p)
#
# simulate compound poisson distribution
#
count=rpois(n,lambda)
loss=rep(0,n)
nzloss=count>0
for (i in (1:n)[nzloss]){
  loss[i]=sum(rgamma(count[i],shape=alpha,scale=theta[i]))
  }
table(count)
sum(count)
length(count)-table(count)[1] 
#
# determine p from a gamma glm on the exposures that had a loss
# to be perfoectly correct, use individual claims
# but since only 49 out of 2332 postiive losses has more than one claim
# I did the glm on the positive total losses
#
pos.loss=loss>0
glmsev=glm(loss[pos.loss]~x1[pos.loss]+x2[pos.loss],family=Gamma(link="log"))
summary(glmsev)
phi.sev=summary(glmsev)$dispersion
alpha1=1/phi.sev
p.est=(2+alpha1)/(1+alpha1)
p.est
glm.tw=glm(loss~x1+x2,family=tweedie(var.power=p.est,link.power=0))
start=c(glm.tw$coefficients/2,glm.tw$coefficients/2)
#
cpt.nllike=function(parm){
  f0=parm[1]
  f1=parm[2]
  f2=parm[3]
  s0=parm[4]
  s1=parm[5]
  s2=parm[6]
  lambda=exp(f0+f1*x1+f2*x2)
  theta=exp(s0+s1*x1+s2*x2)
  tau=alpha1*theta
  p=(alpha1+2)/(alpha1+1)
  mu=lambda*tau
  phi=lambda^(1-p)*tau^(2-p)/(2-p)
  ll=log(dtweedie(loss,p,mu,phi))
  return(-sum(ll))
  }
bfmle=optim(start,cpt.nllike)
bfmle
lambda1=exp(bfmle$par[1]+bfmle$par[2]*x1+bfmle$par[3]*x2)
theta1=exp(bfmle$par[4]+bfmle$par[5]*x1+bfmle$par[6]*x2)
tau1=alpha1*theta1
windows(record=T)
plot.range=range(lambda1*tau1,glm.tw$fitted.values)
plot(lambda1[s]*tau1[s],glm.tw$fitted.values[s],xlim=plot.range,ylim=plot.range,
     col="black",xlab="MLE Estimate",ylab="GLM Estimate",
     main="Figure 3 - Compare GLM and MLE Estimates",type="p",pch=20)
abline(0,1,lwd=3,col="skyblue")
#
xmax=ceiling(max(1*tau1))
x=seq(0,xmax,by=xmax/100)
#
i.samp=sample(1:n,1)
mu.glm=glm.tw$fitted.values[i.samp]
phi.glm=summary(glm.tw)$dispersion
tweedieglm.den=dtweedie(x[-1],p.est,mu.glm,phi.glm)
glm.pr0=dtweedie(x[1],p.est,mu.glm,phi.glm)
mu.mle=lambda1[i.samp]*tau1[i.samp]
phi.mle=lambda1[i.samp]^(1-p.est)*tau1[i.samp]^(2-p.est)/(2-p.est)
tweediemle.den=dtweedie(x[-1],p.est,mu.mle,phi.mle)
mle.pr0=dtweedie(x[1],p.est,mu.mle,phi.mle)
ymax=max(tweedieglm.den,tweediemle.den)
plot(x[-1],tweediemle.den,ylim=c(0,ymax),xlim=c(0,xmax),type="l",lwd=8,
     main="Figure 4 - Density for Positive Loss",sub=paste("x1=",round(x1[i.samp],3),
     " x2=",round(x2[i.samp],3)),ylab="Density",col="black",xlab="Loss")
par(new=T)
plot(x[-1],tweedieglm.den,ylim=c(0,ymax),xlim=c(0,xmax),type="l",lwd=3,
     col="skyblue",main="",sub="",ylab="",axes=F,xlab="")
legend("topright",legend=c("MLE","GLM"),col=c("black","skyblue"),lwd=c(5,3))
xt=xmax/2
yt=ymax/2
text(xt,yt,paste("Pr(Loss=0|MLE)=",round(mle.pr0,3)),cex=1.25)
text(xt,yt*.85,paste("Pr(Loss=0|GLM)=",round(glm.pr0,3)),cex=1.25)
                
mu.mle
mu.glm
lambda1[i.samp]
tau1[i.samp]
phi.mle
phi.glm                      
#
summary(glm.tw)
glm.tw$coefficients
#bfmle coefficients
log(1/summary(glmsev)$dispersion)+bfmle$par[1]+bfmle$par[4]
bfmle$par[2]+bfmle$par[5]
bfmle$par[3]+bfmle$par[6]
#true coefficients
log(alpha)+real.s0+real.f0
real.f1+real.s1
real.f2+real.s2

For more, see Glenn Meyers' powerpoint CAS presentation, "Predictive Modeling with the Tweedie Distribution." Further discussion of and experimentation with the Tweedie Distribution and GLMs to come soon.

Tweedie Distribution & the Generalized Linear Model

Models 1 & 2

Models 3 & 4

Educating Children in Actuarial Science

On Remote Sensing and Bathymetry