# Gibbs sampler for a simple UC model using simulation smoother
############################################################
UC.Gibbs.sampler = function(S, starting.values, priors){

  # Initialize the data
  aux      = starting.values
  T        = nrow(aux$Y)
  i_matrix <- diag(T)
  i        <- matrix(0, T, 1)
  i[1, 1]  <- 1

  # Posteriors list
  posteriors = list(
    tau   = matrix(NA,T,S),
    tau_0 = matrix(NA,1,S),
    sigma = matrix(NA,2,S)
  )
  HH = crossprod(priors$H)

  for (s in 1:S){

    # Sampling tau_0
    ###########################
    tau_0.v.inv = 1/priors$tau_0.v
    V.tau_0.bar = 1/((1/aux$sigma[1]) + tau_0.v.inv)
    tau_0.bar   = V.tau_0.bar %*% ( (1/aux$sigma[1])*aux$tau[1] + tau_0.v.inv*priors$tau_0 )
    tau_0.draw  = rnorm(1, as.vector(tau_0.bar), sqrt(V.tau_0.bar))
    aux$tau_0   = tau_0.draw

    # Sampling sigma
    ###########################
    # sigma of tau (sigma_eta)
    sigma.eta.s    = as.numeric(priors$sigma.s + crossprod(priors$H%*%aux$tau - i%*%priors$tau_0))
    sigma.eta.nu   = priors$sigma.nu + T
    sigma.eta.draw = sigma.eta.s/rchisq(1,sigma.eta.nu)
    # sigma of errors (sigma)
    sigma.e.s    = as.numeric(priors$sigma.s + crossprod(aux$Y - aux$tau))
    sigma.e.nu   = priors$sigma.nu + T
    sigma.e.draw = sigma.e.s/rchisq(1,sigma.e.nu)
    aux$sigma    = c(sigma.eta.draw, sigma.e.draw)

    # Sampling tau
    ###########################
    V.tau.inv   = (1/aux$sigma[2])*i_matrix + (1/aux$sigma[1])*HH
    V.tau.inv   = 0.5*(V.tau.inv + t(V.tau.inv))
    b.tau       = (1/aux$sigma[2])*aux$Y + (1/aux$sigma[1])*t(priors$H)%*%i*priors$tau_0
    precision.L = t(bandchol(V.tau.inv))
    epsilon     = rnorm(T)
    b.tau.tmp   = forwardsolve(precision.L, b.tau)
    tau.draw    = backsolve(t(precision.L), b.tau.tmp + epsilon)
    aux$tau     = tau.draw

    posteriors$tau[,s]   = aux$tau
    posteriors$tau_0[,s] = aux$tau_0
    posteriors$sigma[,s] = aux$sigma

    if (s%%1000==0){cat(" ",s)}
  }

  output = list(
    posterior = posteriors,
    last.draw = aux
  )
  return(output)
}
Bayesian Unobserved Component Models
Abstract. We present the basics of Bayesian estimation and inference for unobserved component models on the example of a local-level model. The range of topics includes the conjugate prior analysis using normal-inverted-gamma 2 distribution and its extensions focusing on hierarchical modelling, conditional heteroskedasticity, and Student-t error terms. We scrutinise Bayesian forecasting and sampling from the predictive density.
Keywords. Unobserved Component Models, Local-Level Model, State-Space Bayesian Inference, Forecasting, Heteroskedasticity, Hierarchical Modelling, Gibbs Sampler, Simulation Smoother, Precision Sampling
Unobserved component models
Unobserved Component (UC) models are a popular class of models in macroeconometrics that use the state-space representation for unit-root nonstationary time series. The simple formulation of the model equations decomposing the series into a non-stationary and stationary component facilitates economic interpretations and good forecasting performance.
A simple local-level model
The model is set for a univariate time series whose observation at time \(t\) is denoted by \(y_t\). It decomposes the variable into a stochastic trend component, \(\tau_t\), and a stationary error component, \(\epsilon_t\). The former follows a Gaussian random walk process with the conditional variance \(\sigma_\eta^2\), and the latter is zero-mean normally distributed with the variance \(\sigma^2\). These are expressed as the model equations:
\[ \begin{align} y_t &= \tau_t + \epsilon_t,\\ \tau_t &= \tau_{t-1} + \eta_t,\\ \epsilon_t &\sim\mathcal{N}\left(0, \sigma^2\right),\\ \eta_t &\sim\mathcal{N}\left(0, \sigma_\eta^2\right), \end{align} \] where the initial condition \(\tau_0\) is a parameter of the model.
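To fix ideas, here is a minimal R sketch that simulates data from this local-level model; the sample size and parameter values are illustrative assumptions, not estimates.

# simulate T observations from the local-level model (illustrative parameter values)
set.seed(123)
T          = 200
sigma2     = 1     # variance of epsilon_t
sigma2.eta = 0.1   # variance of eta_t
tau_0      = 10    # initial condition
tau = tau_0 + cumsum(rnorm(T, sd = sqrt(sigma2.eta)))   # random-walk trend
y   = tau + rnorm(T, sd = sqrt(sigma2))                 # observed series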
Matrix notation for the model
To simplify the notation and the derivations introduce matrix notation for the model. Let \(T\) be the available sample size for the variable \(y\). Define a \(T\)-vector of zeros, \(\mathbf{0}_T\), and of ones, \(\boldsymbol\imath_T\), the identity matrix of order \(T\), \(\mathbf{I}_T\), as well as \(T\times1\) vectors:
\[ \begin{align} \mathbf{y} = \begin{bmatrix} y_1\\ \vdots\\ y_T \end{bmatrix},\quad \boldsymbol\tau = \begin{bmatrix} \tau_1\\ \vdots\\ \tau_T \end{bmatrix},\quad \boldsymbol\epsilon = \begin{bmatrix} \epsilon_1\\ \vdots\\ \epsilon_T \end{bmatrix},\quad \boldsymbol\eta = \begin{bmatrix} \eta_1\\ \vdots\\ \eta_T \end{bmatrix},\qquad \mathbf{i} = \begin{bmatrix} 1\\0\\ \vdots\\ 0 \end{bmatrix}, \end{align} \] and a \(T\times T\) matrix \(\mathbf{H}\) with the elements:
\[ \begin{align} \mathbf{H} = \begin{bmatrix} 1 & 0 & \cdots & 0 & 0\\ -1 & 1 & \cdots & 0 & 0\\ 0 & -1 & \cdots & 0 & 0\\ \vdots & \vdots & \ddots & \vdots & \vdots\\ 0 & 0 & \cdots & 1 & 0\\ 0 & 0 & \cdots & -1 & 1 \end{bmatrix}. \end{align} \]
Then the model can be written in a concise notation as: \[ \begin{align} \mathbf{y} &= \mathbf{\tau} + \boldsymbol\epsilon,\\ \mathbf{H}\boldsymbol\tau &= \mathbf{i} \tau_0 + \boldsymbol\eta,\\ \boldsymbol\epsilon &\sim\mathcal{N}\left(\mathbf{0}_T, \sigma^2\mathbf{I}_T\right),\\ \boldsymbol\eta &\sim\mathcal{N}\left(\mathbf{0}_T, \sigma_\eta^2\mathbf{I}_T\right). \end{align} \]
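The sketch below shows one way to build the vector \(\mathbf{i}\) and the matrix \(\mathbf{H}\) in R; it assumes the mgcv package for the sdiag replacement function and an illustrative sample size.

# construct the selection vector i and the first-difference matrix H (T is illustrative)
library(mgcv)
T = 200
i = matrix(0, T, 1); i[1, 1] = 1     # i = (1, 0, ..., 0)'
H = diag(T)                          # ones on the main diagonal
sdiag(H, -1) = -1                    # -1 just below the main diagonal
# H %*% tau returns (tau_1, tau_2 - tau_1, ..., tau_T - tau_{T-1})'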
Likelihood function
The model equations imply the predictive density of the data vector \(\mathbf{y}\). To see this, consider the model equation as a linear transformation of a normal vector \(\boldsymbol\epsilon\). Therefore, the data vector follows a multivariate normal distribution given by: \[ \begin{align} \mathbf{y}\mid \boldsymbol\tau, \sigma^2 &\sim\mathcal{N}_T\left(\boldsymbol\tau, \sigma^2\mathbf{I}_T\right). \end{align} \]
This distribution determines the shape of the likelihood function that is defined as the sampling data density: \[ \begin{align} L(\boldsymbol\tau,\sigma^2|\mathbf{y})\equiv p\left(\mathbf{y}\mid\boldsymbol\tau, \sigma^2 \right). \end{align} \]
For the purpose of estimating the parameters, after plugging the data in place of \(\mathbf{y}\), the likelihood function is considered a function of the parameters \(\boldsymbol\tau\) and \(\sigma^2\) and is given by: \[ \begin{align} L(\boldsymbol\tau,\sigma^2|\mathbf{y}) = (2\pi)^{-\frac{T}{2}}\left(\sigma^2\right)^{-\frac{T}{2}}\exp\left\{-\frac{1}{2}\frac{1}{\sigma^2}(\mathbf{y} - \boldsymbol\tau)'(\mathbf{y} - \boldsymbol\tau)\right\}. \end{align} \]
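For reference, a small sketch evaluating this likelihood on the log scale; the function name and arguments are illustrative.

# log-likelihood of the local-level model at given tau and sigma2
loglik.uc = function(y, tau, sigma2){
  T = length(y)
  -T/2*log(2*pi) - T/2*log(sigma2) - 0.5*sum((y - tau)^2)/sigma2
}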
Prior distributions
The state equation for \(\boldsymbol\tau\) can be rewritten as follows: \[ \begin{gather} \boldsymbol\tau = \mathbf{H}^{-1} \mathbf{i} \tau_0+\mathbf{H}^{-1}\boldsymbol\eta \\ \boldsymbol\eta \sim \mathcal{N}(\mathbf{0}_T, \sigma_\eta^2 \mathbf{I}_T) \\ \mathbf{H}^{-1} \boldsymbol\eta \sim \mathcal{N}(\mathbf{0}_T, \sigma_\eta^2\left(\mathbf{H}^{\prime} \mathbf{H}\right)^{-1}) \end{gather} \] Using the state equation for \(\boldsymbol\tau\) above, we can derive the prior distribution of \(\boldsymbol\tau\) as: \[ \begin{align} \boldsymbol\tau | \tau_0, \sigma_\eta^2 &\sim \mathcal{N}_T(\mathbf{H}^{-1} \mathbf{i} \tau_0, \sigma_\eta^2(\mathbf{H}^{\prime} \mathbf{H})^{-1}) \\ &\propto \exp \left\{-\frac{1}{2} \frac{1}{\sigma_\eta^2}\left(\boldsymbol\tau-\mathbf{H}^{-1} \mathbf{i} \tau_0\right)^{\prime}\left(\mathbf{H}^{\prime} \mathbf{H}\right)\left(\boldsymbol\tau-\mathbf{H}^{-1} \mathbf{i} \tau_0\right)\right\} \end{align} \] Next, the prior distribution of \(\tau_0\) is defined as uni-variate normal: \[ \begin{align} \tau_0 &\sim \mathcal{N}(\underline{\tau_0}, \underline{V_{\tau_{0}}}) \\ &\propto \exp \left\{-\frac{1}{2}(\tau_0-\underline{\tau_0})^{\prime} \underline{V_{\tau_{0}}}^{-1}(\tau_0-\underline{\tau_0})\right\} \end{align} \] and the prior distribution of \(\sigma_\eta^2\) is inverted gamma 2: \[ \begin{align} \sigma_\eta^2 &\sim \mathcal{IG}2(\underline{s_\eta}, \underline{v_\eta}) \\ &\propto (\sigma_\eta^2)^{-\frac{\underline{v}+2}{2}} \exp \left\{-\frac{1}{2} \frac{s}{\sigma_\eta^2}\right\} \end{align} \]
The prior distribution of \(\sigma^2\) is the following: \[ \begin{align} \sigma^2 &\sim \mathcal{IG}2(\underline{s}, \underline{v}) \\ &\propto (\sigma^2)^{-\frac{\underline{v}+2}{2}} \exp \left\{-\frac{1}{2} \frac{s}{\sigma^2}\right\} \end{align} \]
The Joint prior distribution of the parameters of the model \(\boldsymbol\tau\), \(\tau_0\), \(\sigma^2_{\eta}\), and \(\sigma^2\) is then given by: \[ \begin{aligned} p(\boldsymbol\tau,\tau_{0},\sigma_{\eta}^{2},\sigma^{2}) = p(\boldsymbol\tau|\tau_{0},\sigma_{\eta}^{2}) \space p(\tau_{0}) \space p(\sigma_{\eta}^{2}) \space p(\sigma^{2}) \end{aligned} \] where the individual distributions on the RHS are as specified above.
Finally, given that \(\boldsymbol\epsilon\) follows a normal distribution, \[ \begin{align} \boldsymbol\epsilon\mid\sigma^2 &\sim \mathcal{N}(\mathbf{0}_T, \sigma^2 \mathbf{I}_T) \end{align} \] the prior distribution of \(\boldsymbol\epsilon\) given \(\sigma^2\) is proportional to: \[ \begin{align} \boldsymbol\epsilon | \sigma^2 &\propto \exp \left\{-\frac{1}{2} \frac{1}{\sigma^2} \boldsymbol\epsilon^{\prime}\boldsymbol\epsilon\right\} \end{align} \]
Bayesian estimation
Derivation of full conditional posterior distributions
The full conditional posterior distributions can be derived based on Bayes’ theorem and the priors shown above. The joint posterior distribution is proportional to the product of the likelihood function and the priors: \[ \begin{aligned} p(\tau,\tau_{0},\sigma^{2},\sigma_{\eta}^{2}|\mathbf{y}) & \propto p(\mathbf{y}|\tau,\tau_{0},\sigma^{2},\sigma_{\eta}^{2}) \space p(\tau,\tau_{0},\sigma^{2},\sigma_{\eta}^{2}) \end{aligned} \] The joint prior is as specified in the section above.
Two conditional likelihood functions used to derive the full conditional posterior distributions of \(\tau,\tau_{0},\sigma^{2},\sigma_{\eta}^{2}\) are as follows: \[ \begin{align} L(\boldsymbol\tau,\sigma^{2}|\mathbf{y},\tau_{0},\sigma_{\eta}^{2}) \propto \left( \sigma^{2} \right)^{-\frac{T}{2}} \times\exp\left\{ -\frac{1}{2}\sigma^{-2}\left( \mathbf{y}-\boldsymbol\tau \right)'\left( \mathbf{y}-\boldsymbol\tau \right) \right\} \end{align} \] and \[ \begin{align} L(\tau_{0},\sigma_{\eta}^{2}|\mathbf{y},\boldsymbol\tau,\sigma^{2}) \propto \left( \sigma_{\eta}^{2} \right)^{-\frac{T}{2}} \times\exp\left\{-\frac{1}{2}\sigma_{\eta}^{-2}\left(\mathbf{H}\boldsymbol\tau -\mathbf{i}\tau_{0}\right)'\left(\mathbf{H}\boldsymbol\tau-\mathbf{i}\tau_{0}\right)\right\} \end{align} \]
The full conditional posterior distribution of \(\boldsymbol\tau\) is derived as follows: \[ \begin{align} P(\boldsymbol\tau|\mathbf{y},\tau_{0},\sigma^{2},\sigma_{\eta}^{2})&\propto L(\boldsymbol\tau,\sigma^{2}|\mathbf{y},\tau_{0},\sigma_{\eta}^{2}) \space P(\boldsymbol\tau|\tau_{0},\sigma_{\eta}^{2})\\ \\&\propto \exp\left\{ -\frac{1}{2}\sigma^{-2}\left( \mathbf{y}-\boldsymbol\tau \right)'\left( \mathbf{y}-\boldsymbol\tau \right) \right\}\\ &\qquad\times \exp\left\{ -\frac{1}{2}\sigma_{\eta}^{-2}\left(\boldsymbol\tau-\mathbf{H}^{-1} \mathbf{i}\tau_{0}\right)'\mathbf{H}'\mathbf{H}\left( \boldsymbol\tau-\mathbf{H}^{-1} \mathbf{i}\tau_{0} \right) \right\} \end{align} \] that after completing the squares results in a multivariate normal full conditional posterior distribution: \[ \begin{align} P(\boldsymbol\tau &|\mathbf{y},\tau_{0},\sigma^{2},\sigma_{\eta}^{2}) \sim \mathcal{N}(\overline{\boldsymbol\tau},\overline{V_{\boldsymbol\tau}}) \\ \\ \overline{V_{\boldsymbol\tau}} &= \left[ \sigma^{-2}\mathbf{I}_T +\sigma_{\eta}^{-2}\mathbf{H}'\mathbf{H}\right]^{-1}\\ \overline{\boldsymbol\tau} &= \overline{V_{\tau}}\left[\sigma^{-2}\mathbf{y} + \sigma_{\eta}^{-2}\mathbf{H}'\mathbf{i}\tau_{0} \right] \end{align} \] The full conditional posterior distribution of \(\tau_{0}\) is the following: \[ \begin{align} P(\tau_{0}|\mathbf{y},\boldsymbol\tau,\sigma^{2},\sigma_{\eta}^{2}) &\propto L(\tau_{0}|\mathbf{y},\boldsymbol\tau,\sigma^{2},\sigma_{\eta}^{2}) \space P(\tau_{0}|\sigma_{\eta}^{2}) \\ \\ &\propto \exp\left\{ -\frac{1}{2}\sigma_{\eta}^{-2}\left(\mathbf{H}\boldsymbol\tau -\mathbf{i}\tau_{0}\right)'\left(\mathbf{H}\boldsymbol\tau-\mathbf{i}\tau_{0}\right)\right\}\\ &\times \exp\left\{ -\frac{1}{2}\left(\tau_{0}- \underline{\tau_0}\right)'\underline{V_{\tau_{0}}}^{-1}\left(\tau_{0}- \underline{\tau_0}\right) \right\} \end{align} \] and is given by a uni-variate normal distribution: \[ \begin{align} P(\tau_{0} &|\mathbf{y},\boldsymbol\tau,\sigma^{2},\sigma_{\eta}^{2}) \sim N(\overline{\tau_{0}},\overline{V_{\tau_{0}}}) \\ \\ \overline{V_{\tau_{0}}}&=\left[ \sigma_{\eta}^{-2}\mathbf{i}'\mathbf{i} +\underline{V_{\tau_{0}}}^{-1}\right]^{-1} =\left[ \sigma_{\eta}^{-2} +\underline{V_{\tau_{0}}}^{-1}\right]^{-1} \\ \overline{\tau_{0}}&=\overline{V_{\tau_{0}}}\left[ \sigma_{\eta}^{-2} \mathbf{i}'\mathbf{H}\boldsymbol\tau +\underline{V_{\tau_{0}}}^{-1}\underline{\tau_{0}}\right]\\ &= \overline{V_{\tau_{0}}}\left[ \sigma_{\eta}^{-2} \tau_1 +\underline{V_{\tau_{0}}}^{-1}\underline{\tau_{0}}\right] \end{align} \] The full conditional posterior distribution of \(\sigma_{\eta}^{2}\) is derived as: \[ \begin{align} P(\sigma_{\eta}^{2}|\mathbf{y},\boldsymbol\tau,\tau_{0},\sigma^{2}) &\propto L(\sigma_{\eta}^{2}|\mathbf{y},\boldsymbol\tau,\tau_{0},\sigma^{2}) \space P(\sigma_{\eta}^{2}) \\ \\ &\propto \left( \sigma_{\eta}^{2} \right)^{-\frac{T}{2}} \exp\left\{ -\frac{1}{2}\sigma_{\eta}^{-2}\left(\mathbf{H}\boldsymbol\tau -\mathbf{i}\tau_{0}\right)'\left(\mathbf{H}\boldsymbol\tau -\mathbf{i}\tau_{0}\right)\right\} \\ &\times \left( \sigma_{\eta}^{2} \right)^{-\frac{\underline{\upsilon}+2}{2}} \exp\left\{ -\frac{1}{2}\frac{\underline{s}}{\sigma_{\eta}^{2}}\right\} \end{align} \] where \[ \begin{align} P(\sigma_{\eta}^{2}&|\mathbf{y},\boldsymbol\tau,\tau_{0},\sigma^{2}) \sim \mathcal{IG}2(\overline{s_{\eta}},\overline{\upsilon_{\eta}}) \\ \\ \overline{s_{\eta}}&=\underline{s} + \left(\mathbf{H}\boldsymbol\tau -\mathbf{i}\tau_{0}\right)'\left(\mathbf{H}\boldsymbol\tau -\mathbf{i}\tau_{0}\right) \\
\overline{\upsilon_{\eta}}&= \underline{\upsilon}+T \end{align} \] Finally, the full conditional posterior distribution of \(\sigma^{2}\) is the following: \[ \begin{align} P(\sigma^{2}|\mathbf{y},\boldsymbol\tau,\tau_{0},\sigma_{\eta}^{2})&=L(\sigma^{2}|\mathbf{y},\boldsymbol\tau,\tau_{0},\sigma_{\eta}^{2}) \space P(\sigma^{2}) \\ \\ &\propto \left( \sigma^{2} \right)^{-\frac{T}{2}} \exp\left\{ -\frac{1}{2}\sigma^{-2}\left(\mathbf{y}-\boldsymbol\tau\right)'\left(\mathbf{y}-\boldsymbol\tau\right)\right\} \\ &\times \left( \sigma^{2} \right)^{-\frac{\underline{\upsilon}+2}{2}} \exp\left\{ -\frac{1}{2}\frac{\underline{s}}{\sigma^{2}}\right\} \end{align} \] where \[ \begin{align} P(\sigma^{2}|&\mathbf{y},\boldsymbol\tau,\tau_{0},\sigma_{\eta}^{2}) \sim \mathcal{IG}2(\overline{s},\overline{\upsilon}) \\ \\ \overline{s}&=\underline{s} + \left(\mathbf{y}-\boldsymbol\tau\right)'\left(\mathbf{y}-\boldsymbol\tau\right) \\ \overline{\upsilon}&= \underline{\upsilon}+T \end{align} \]
Gibbs sampler
A Gibbs Sampler can be applied using the following steps:
At each iteration \(s\) where \(s\) goes from 1 to \(S\),
Step 1. Draw \(\tau_0^{(s)}\) from the full conditional \(\mathcal{N}(\overline{\tau_{0}}, \overline{V_{\tau_{0}}})\) distribution and collect the draw.
Step 2. Draw \({\sigma^2_{\eta}}^{(s)}\) from the full conditional \(\mathcal{IG}2(\overline{s_\eta}, \overline{\upsilon_\eta})\) distribution and collect the draw.
Step 3. Draw \({\sigma^2}^{(s)}\) from the full conditional \(\mathcal{IG}2(\overline{s}, \overline{\upsilon})\) distribution and collect the draw.
Step 4. Draw \(\boldsymbol\tau^{(s)}\) from the full conditional \(\mathcal{N}_T(\overline{\boldsymbol\tau}, \overline{V_{\boldsymbol\tau}})\) distribution and collect the draw.
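All of the \(\mathcal{IG}2\) draws in the code that follows rely on the fact that if \(\sigma^2\sim\mathcal{IG}2(\overline{s},\overline{\upsilon})\) then \(\overline{s}/\sigma^2\sim\chi^2_{\overline{\upsilon}}\). A minimal helper illustrating this mapping (the function name is an assumption) is:

# draw n values from the IG2(s.bar, nu.bar) distribution via a chi-squared draw
rig2 = function(n, s.bar, nu.bar){
  s.bar / rchisq(n, df = nu.bar)
}
# e.g. sigma2.draw = rig2(1, s.bar, nu.bar) performs Step 3 for given posterior parameters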
Simulation smoother and precision sampler
The simulation smoother is used to draw the state variables of discrete-time state-space models from their conditional distribution. This tool greatly improves the Bayesian estimation process in conditionally linear Gaussian state-space models. The benefit of using this method comes from the fact that one does not rely on an iterative procedure of forward filtering and backward smoothing. Instead, the whole vector of states is drawn in one step, without a loop.
Consider sampling random draws from a multivariate normal distribution presented as:
\[ N_{T}(D^{-1}b,D^{-1}) \]
where \(D\) is a \(T \times T\) precision matrix corresponding to the covariance matrix \(\Sigma\), such that \(D = \Sigma^{-1}\), and \(b\) is a \(T\times1\) vector specifying the mean of the multivariate normal distribution through \(M = \Sigma b\).
This parameterisation is particularly useful as the matrices \(D\) and \(b\) can be computed easily. Additionally, in the simulation smoother for state-space models the matrix \(D\) is a band matrix, and often a tridiagonal matrix. The latter case arises for a one-lag autoregressive state-space process. Therefore, the computation time is much shorter if dedicated numerical algorithms are employed. For a tridiagonal matrix \(D\) in a state-space model, let \(L\) be the lower-triangular matrix obtained by the Cholesky decomposition, so that \(D = LL'\), and assume that systems in \(L\) can be solved efficiently.
Therefore, the mean of the normal distribution above can be presented as:
\[ M = D^{-1}b = (LL')^{-1}b = L'^{-1}L^{-1}b \]
Let a \(T \times 1\) vector \(X\) contain \(T\) independent random draws from a standard normal distribution, \(X\sim N_{T} ( 0_{T}, I_{T})\). Then a draw from the target distribution, \(Y\sim N_{T}(D^{-1}b,D^{-1})\), can be represented as:
\[ Y = D^{-1}b + \sqrt{D^{-1}} X = L'^{-1}L^{-1}b + L'^{-1} X \]
Therefore, a draw from the target distribution, \(N_{T}(D^{-1}b,D^{-1})\), is obtained as:
\[ L'^{-1}L^{-1}b + L'^{-1}x = L'^{-1}(L^{-1}b + x) \]
The following algorithm is based on solving triangular linear systems by forward and backward substitution rather than on computing inverse matrices:
- Compute \(L = chol(D)\) such that \(D = LL'\)
- Sample \(x\sim N_{T} ( 0_{T}, I_{T})\)
- Compute a draw from the distribution via the affine transformation: \(L '\smallsetminus (L \smallsetminus b + x)\)
Note: Let \(L \smallsetminus b\) denote the unique solution to the triangular system Lx = b obtained by forward (backward) substitution, that is, \(L \smallsetminus b = L^{−1}b\).
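The three steps can also be sketched directly in R for a general positive-definite precision matrix; the function name is illustrative and no band-matrix routines are used here.

# draw one sample from N(D^{-1} b, D^{-1}) using triangular solves only
precision.sampler = function(D, b){
  L = t(chol(D))                             # D = L L' with L lower triangular
  x = rnorm(nrow(D))                         # x ~ N(0, I)
  backsolve(t(L), forwardsolve(L, b) + x)    # L' \ (L \ b + x)
}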
Simulation smoother algorithm in R
Compare the computational speed of generating random numbers from a multivariate normal distribution using dedicated numerical algorithms in function rmvnorm.tridiag.precision with that of function solve used for matrix inversion, ignoring the tridiagonality of the precision matrix, in function rmvnorm.usual:
library(mgcv)

rmvnorm.tridiag.precision = function(n, D, b){
  N         = dim(D)[1]
  lead.diag = diag(D)
  sub.diag  = sdiag(D, -1)
  D.chol    = trichol(ld = lead.diag, sd = sub.diag)
  D.L       = diag(D.chol$ld)
  sdiag(D.L, -1) = D.chol$sd
  x    = matrix(rnorm(n*N), ncol=n)
  a    = forwardsolve(D.L, b)
  draw = backsolve(t(D.L), matrix(rep(a,n), ncol=n) + x)
  return(draw)
}
# Function of normal method
rmvnorm.usual = function(n, D, b){
  N             = dim(D)[1]
  D.chol        = t(chol(D))
  variance.chol = solve(D.chol)
  x    = matrix(rnorm(n*N), ncol=n)
  draw = t(variance.chol) %*%
    (matrix(rep(variance.chol%*%b,n), ncol=n) + x)
  return(draw)
}
# Comparison
library(microbenchmark)

set.seed(12345)
T  = 300
md = rgamma(T, shape=10, scale=10)
od = rgamma(T-1, shape=10, scale=1)
D  = 2*diag(md)
sdiag(D, 1)  = -od
sdiag(D, -1) = -od
b  = as.matrix(rnorm(T))

microbenchmark(
  trid  = rmvnorm.tridiag.precision(n=100, D=D, b=b),
  usual = rmvnorm.usual(n=100, D=D, b=b),
  check = "equal", setup=set.seed(123456)
)
Unit: milliseconds
expr min lq mean median uq max neval
trid 3.099969 3.207758 4.792219 3.252038 3.286868 53.63788 100
usual 22.485384 22.684173 23.063725 22.761212 22.950755 25.30709 100
Therefore, we can clearly see that the computation time drops substantially when the tridiagonal structure of the precision matrix is exploited by the simulation smoother.
Analytical solution for a joint posterior distribution
Consider a simplified model with a fixed signal-to-noise ratio \(c\) given by
\[\begin{align} \text{measurement equation} && y &= \tau + \epsilon\\ \text{state equation} && \mathbf{H}\tau &= \eta\\ \text{error term} && \epsilon\mid\tau &\sim \mathcal{N}_T(\mathbf{0}_T, \sigma^2\mathbf{I}_T)\\ \text{innovation} && \eta &\sim \mathcal{N}_T(\mathbf{0}_T, c\sigma^2\mathbf{I}_T) \end{align}\] where \(c\) is a constant, and \(\tau_0 = 0\).
In this model the state equation for \(\tau\) is given by a Gaussian random walk presented as:
\[\begin{pmatrix} 1 & 0 & \cdots & 0 \\ -1 & 1 & \cdots & 0 \\ \vdots & \ddots & \ddots & \vdots \\ 0 & 0 & -1 & 1 \end{pmatrix} \begin{pmatrix} \tau_1 \\ \tau_2 \\ \vdots \\ \tau_T \end{pmatrix} = \begin{pmatrix} \tau_1 \\ \tau_2-\tau_1 \\ \vdots \\ \tau_T - \tau_{T-1} \end{pmatrix} = \eta\]
The parameters to be estimated are: \(\mathbf{\color{purple}{\tau, \sigma^2}}\)
Prior distributions
Prior distribution for \(\tau\)
Rewrite the state equation as \[\tau = H^{-1}\eta\] Then the prior distribution of \(\tau\) is formed using \[\mathbf{\color{purple}{\eta \sim \mathcal{N}(0_T, c \sigma^2 I_T) \implies H^{-1}\eta \sim \mathcal{N}(0_T, c\sigma^2(H^T H)^{-1})}}\] since \(Var(H^{-1}\eta) = H^{-1}Var(\eta)(H^{-1})^T =c\sigma^2(H^TH)^{-1}\)
Then, the prior distribution of \(\tau|\sigma^2\) is \[\tau|\sigma^2 \sim \mathcal{N}(0_T, c\sigma^2(H^TH)^{-1}) \\ \propto (c\sigma^2)^{-\frac{T}{2}}exp\left(-\frac{\tau^TH^T H\tau}{2c\sigma^2}\right) \]
Prior assumptions for \(\sigma^2\)
\[\sigma^2 \sim \mathcal{IG2}(s_{prior}, \nu_{prior}) \propto (\sigma^2)^{-\frac{\nu_{prior}+2}{2}}exp\left(-\frac{s_{prior}}{2\sigma^2}\right)\]
The Joint Posterior Distribution
The likelihood function is constructed using the the measurement equation: \[y = \tau + \epsilon\] and the distributional assumption on its error term \[\epsilon \sim \mathcal{N}(0_T, \sigma^2I_T)\] which results in the likelihood function: \[y|\tau, \sigma^2 \sim \mathcal{N}(\tau, \sigma^2I_T)\propto (\sigma^2)^{-\frac{T}{2}}exp\left(-\frac{1}{2\sigma^2}(y-\tau)^T(y-\tau)\right)\]
The likelihood is combined with the joint prior distribution of \(\tau\) and \(\sigma^2\) to derive the joint posterior distribution of \(\tau\) and \(\sigma^2\). It is derived as follows:
\[ p(\tau, \sigma^2 |y) = \frac{p(\tau, \sigma^2, y)}{p(y)} \propto p(\tau, \sigma^2, y) = p(y | \tau, \sigma^2)p(\tau, \sigma^2) =p(y | \tau, \sigma^2)p(\tau|\sigma^2)p(\sigma^2) \]
This expression is proportional to \[\begin{align} &\propto (\sigma^2)^{-\frac{T}{2}}exp\left(-\frac{(y-\tau)^T(y-\tau)}{2\sigma^2}\right) \times (\sigma^2)^{-\frac{T}{2}}\exp\left(-\frac{\tau^TH^T H\tau}{2c\sigma^2}\right)\\ &\qquad\times(\sigma^2)^{-\frac{\nu_{prior}+2}{2}} \exp(-\frac{s_{prior}}{2\sigma^2})\\ &\propto \exp(-\frac{y^Ty - 2\tau^T y + \tau^T\tau + c^{-1}\tau^TH^T H\tau}{2\sigma^2})\\ &\qquad\times \exp(-\frac{s_{prior}}{2\sigma^2})\times(\sigma^2)^{-\frac{2T+\nu_{prior}+2}{2}}\\ &= \exp(-\frac{\tau^T(c^{-1}H^T H + I_T)\tau - 2\tau^Ty}{2\sigma^2})\\ &\qquad\times\exp(-\frac{y^Ty + s_{prior}}{2\sigma^2})\times(\sigma^2)^{-\frac{2T+\nu_{prior}+2}{2}} \end{align}\]
Let \(\bar{\Sigma} = (c^{-1}H^T H + I_T)^{-1}\), then \[\begin{align} p(\tau, \sigma^2 |y) &\propto \exp(-\frac{\tau^T\bar{\Sigma}^{-1}\tau - 2\tau^Ty + y^T\bar{\Sigma}y}{2\sigma^2})\exp(-\frac{y^Ty + s_{prior}-y^T\bar{\Sigma}y}{2\sigma^2})\\ &\qquad\times(\sigma^2)^{-\frac{2T+\nu_{prior}+2}{2}} \end{align}\]
In the expression above, we recognise the kernel of the \(T\)-variate normal inverted gamma 2 distribution. Therefore, the joint posterior of the parameters of the model is given by: \[\begin{align} \tau, \sigma^2 |y &\sim \mathcal{NIG2}(\bar{\tau}, \bar{\Sigma}, \bar{\nu}, \bar{s})\\[2ex] \bar{\Sigma} &= (c^{-1}H^T H + I_T)^{-1}\\ \bar{\tau} &= \bar{\Sigma}y\\ \bar{\nu} &= 2T+\nu_{prior}\\ \bar{s} &= s_{prior}+y^Ty-y^T\bar{\Sigma}y \end{align}\]
The function below implements the sampler from this joint distribution.
UC.local.tau.sigma.Gibbs.sampler = function(S, starting.values, priors){

  aux = starting.values
  T   = nrow(aux$Y)

  posteriors = list(
    tau   = matrix(NA,T,S),
    sigma = rep(NA,S)
  )

  for (s in 1:S){

    V.tau.bar.inv = priors$c^(-1)*t(aux$H)%*%aux$H + diag(T)
    V.tau.bar.inv = 0.5*(V.tau.bar.inv + t(V.tau.bar.inv))

    # Sampling sigma
    ###########################
    sigma.s    = as.numeric(priors$sigma.s + t(aux$Y)%*%aux$Y - t(aux$tau)%*%V.tau.bar.inv%*%aux$tau)
    sigma.nu   = priors$sigma.nu + 2*T
    sigma.draw = sigma.s/rchisq(1,sigma.nu)
    aux$sigma  = sigma.draw

    # Sampling tau
    ###########################
    b.tau       = aux$Y
    precision.L = t(bandchol(V.tau.bar.inv))
    epsilon     = rnorm(T)
    b.tau.tmp   = forwardsolve(precision.L, b.tau)
    tau.draw    = backsolve(t(precision.L), b.tau.tmp + epsilon)
    aux$tau     = tau.draw

    posteriors$tau[,s]  = aux$tau
    posteriors$sigma[s] = aux$sigma

  }
  output = list(
    posterior = posteriors,
    last.draw = aux
  )
  return(output)
}
Hierarchical modeling
Estimating gamma error term variance prior scale
To estimate the scale of the inverted gamma 2 prior for the error term variance, we first place a prior on the prior scale \(s\) and then derive its full conditional posterior distribution.
The hierarchical prior structure is:
\[\begin{align*} \sigma^2 \mid s &\sim \text{IG2}(s, \nu) \\ s &\sim \mathcal{G}(s_s, a) \end{align*}\]The full conditional posterior distribution of the prior scale \(s\) is based on the prior \(p(\sigma^2 \mid s)\): \[\begin{align} p(\sigma^2 \mid s) \propto s^{\frac{\nu}{2}}(\sigma^2)^{-\frac{\nu+2}{2}} \exp \left( -\frac{1}{2}\frac{s}{\sigma^2} \right) \end{align}\] and that for \(s\): \[\begin{align} s \sim \mathcal{G}(s_s, a) \propto s^{a-1} \exp(-\frac{s}{s_s}) \end{align}\] which results in the following full conditional posterior distribution: \[\begin{align} p(s \mid \mathbf{y}, \mathbf{\tau}, \sigma^2) &= \mathcal{G}(\bar{s}_s,\bar{a})\\ \bar{s}_s &= \left(s_s^{-1} + (2\sigma^2)^{-1}\right)^{-1}\\ \bar{a} &= a + \frac{\nu}{2} \end{align}\]
The code below implements the sampler from the full conditional posterior for \(s\):
sample_s = function(sigma2, s_s, a, nu) {
  shape_s = a + nu/2
  scale_s = 1/((1/s_s) + 1/(2*sigma2))
  s       = rgamma(1, shape = shape_s, scale = scale_s)
  return(s)
}
Estimating inverted-gamma 2 error term variance prior scale
In this section, we estimate the error term variance prior scale \(s\), which follows an inverted gamma 2 distribution with scale \(\underline{s}\) and shape \(\underline{\mu}\): \(s\sim\mathcal{IG2}(\underline{s},\underline{\mu})\). The probability density function is equal to:
\[ p\left(s\right) = \Gamma(\frac{\underline{\mu}}{2})^{-1} (\frac{\underline{s}}{2})^{\frac{\underline{\mu}}{2}} s^{-\frac{\underline{\mu}+2}{2}}exp\{ -\frac{1}{2}\frac{\underline{s}}{s}\} \]
As we assume \(\sigma^2|s \sim \mathcal{IG2} (s,\underline{\nu})\), its density is:
\[ p\left(\sigma^2|s\right) = \Gamma(\frac{\underline{\nu}}{2})^{-1} (\frac{s}{2})^{\frac{\underline{\nu}}{2}} \sigma^2{^{-\frac{\underline{\nu}+2}{2}}}exp\{-\frac{1}{2}\frac{s}{\sigma^2}\} \]
In order to find the full conditional posterior of \(s\), write out its kernel as:
\[ \begin{aligned} p(s|\tau, \sigma^2) &\propto L(\tau, \sigma^2|y) \cdot p(\epsilon|\sigma^2) \cdot p(\sigma^2|s) \cdot p(s) \\ &\propto p(\sigma^2|s) \cdot p(s) \\ &\propto (\frac{s}{2})^{\frac{\underline{\nu}}{2}} exp\{ -\frac{1}{2}\frac{s}{\sigma^2}\} \cdot s^{-\frac{\underline{\mu}+2}{2}}exp\{-\frac{1}{2}\frac{\underline{s}}{s}\} \\ &\propto (\frac{s}{2})^{\frac{\underline{\nu}}{2}} \cdot s^{-\frac{\underline{\mu}+2}{2}} \cdot exp \{-\frac{1}{2}(\frac{\underline{s}}{s} + \frac{s}{\sigma^2} )\} \\ &\propto (\frac{1}{2}^{\frac{\underline{\nu}}{2}}) \cdot s^{\frac{\underline{\nu}}{2}} \cdot s^{-\frac{\underline{\mu}+2}{2}} \cdot exp \{-\frac{1}{2}(\frac{\underline{s}}{s} + \frac{s}{\sigma^2} )\} \\ &\propto s^{\frac{\underline{\nu}-\underline{\mu}-2}{2}} exp\{ -\frac{1}{2}(\frac{\underline{s}}{s} + \frac{1}{\sigma^2}s)\} \end{aligned} \]
from which we recognize a Generalized inverse Gaussian distribution: \(\mathcal{GIG}(\overline{p}, \overline{a}, \overline{b})\) with parameters:
\[\begin{align} \overline{p} &= \frac{\underline{\nu} - \underline{\mu}}{2}\\ \overline{a} &= \frac{1}{\sigma^2 }\\ \overline{b} &= \underline{s } \end{align}\]
The following script illustrates sampling from the full conditional posterior distribution of \(s\) in the function s.sampler:
s.sampler = function(sigma2, prior){
  # sigma2 - the current draw
  # prior is a list containing:
  #   s.prior  - a positive scalar
  #   mu.prior - a scalar
  #   nu.prior - a scalar

  a.bar.s = 1/sigma2
  p.bar.s = (prior$nu.prior - prior$mu.prior)/2
  b.bar.s = prior$s.prior

  s = GIGrvg::rgig(n = 1, lambda = p.bar.s, chi = b.bar.s, psi = a.bar.s)

  return(s)
}
Estimating the initial condition prior scale
In the univariate local-level model, the initial state \(\tau_0\) and the variance \(s^2\) are key parameters determining the behavior of the stochastic process. For the initial state \(\tau_0\), a prior is specified as a univariate normal distribution, denoted by
\[\tau_0\mid s^2 \sim \text{N}(\underline\tau_0, s^2)\]
reflecting our belief about the distribution of \(\tau_0\) before observing the data. For the variance \(s^2\), which controls the fluctuation of the process, we use the inverted gamma 2 distribution, \[s^2 \sim \text{IG2}(\underline{s}, \underline{\nu})\]
The full posterior of \(s^2\) is derived below:
\[\begin{align} p(s^2|y,\sigma,\tau,\tau_0,\sigma_{\eta}, \underline{s}, \underline{\nu}) &\propto p(y|\tau,\sigma)p(\tau|\tau_0,\sigma_{\eta}^2)p(\tau_0|s^2)p(s^2) \\ &\propto p(\tau_0|s^2)p(s^2|\underline{s}, \underline{\nu}) \\ &= (s^2)^{-\frac{1}{2}}\exp\left\{-\frac{1}{2}(\tau_0-\underline{\tau_0})'(s^2)^{-1}(\tau_0-\underline{\tau_0})\right\}\\ &\qquad\times (s^2)^{-\frac{\underline{\nu}+2}{2}}\exp\left\{-\frac{1}{2}\frac{\underline{s}}{s^2}\right\} \\ &= (s^2)^{-\frac{1+\underline{\nu}+2}{2}} \exp \left\{-\frac{1}{2}\frac{(\tau_0-\underline{\tau_0})^2+\underline{s}}{s^2}\right\} \end{align}\]
Hence, the full conditional posterior of \(s^2\) takes the form of an inverted gamma 2 distribution, where
\[\begin{align} s^2\mid \mathbf{y}, \mathbf{\tau}, \tau_0 &\sim\mathcal{IG}2(\overline{s}, \overline{\nu})\\ \overline{\nu} &= 1+\underline{\nu} \\ \overline{s} &= (\tau_0-\underline{\tau_0})^2+\underline{s} \end{align}\]
The R code below implements a sampler from the full-conditional posterior distribution of \(s^2\):
s2_sampler <- function(tau_0, tau_0_prior, s_prior, nu_prior) {

  nu_posterior <- nu_prior + 1
  s_posterior  <- (tau_0 - tau_0_prior)^2 + s_prior

  s2 <- s_posterior / rchisq(1, nu_posterior)

  return(s2)
}
Student-t prior for the trend component
Estimating Student-t degrees of freedom parameter
The Student-t distribution is commonly used in statistical modeling to handle data with heavier tails than the normal distribution. An essential parameter of the Student-t distribution is the degrees of freedom \(\nu\), which controls the tail heaviness. In this note, we present the Bayesian estimation of the degrees of freedom parameter for a \(T\)-variate Student-t distribution represented as an inverted gamma 2 scale mixture of normals:
\[\begin{align} \mathbf{y} \mid \boldsymbol\tau, \boldsymbol\lambda, \sigma^2 &\sim \mathcal{N}_T(\boldsymbol\tau, \sigma^2diag(\boldsymbol\lambda))\\ \boldsymbol\lambda &= (\lambda_1, \dots, \lambda_T)'\\ \lambda_t \mid \nu &\sim \mathcal{IG2}(\nu - 2, \nu) \end{align}\]
Therefore the joint prior for vector \(\boldsymbol\lambda\) is proportional to: \[ \Gamma\left(\frac{\nu}{2}\right)^{-T}\left(\frac{\nu-2}{2}\right)^{\frac{T\nu}{2}}\left(\prod_{t=1}^T\lambda_t\right)^{-\frac{\nu+2}{2}}\exp\left\{-\frac{\nu-2}{2}\sum_{t=1}^T\lambda_t^{-1}\right\} \] Assume the following Gamma prior for \(\nu\): \[ \nu \sim \text{Gamma}(a, b), \qquad p(\nu) \propto \nu^{a-1}\exp\{-b\nu\} \]
Therefore the kernel of the full conditional posterior of \(\nu\) is defined by: \[ p(\nu\mid\mathbf{y},\boldsymbol\tau,\boldsymbol\lambda,\sigma^2) \propto p(\boldsymbol\lambda\mid\nu)p(\nu) \]
This expression does not lead to a full conditional posterior distribution of a known type, so we use the Metropolis-Hastings algorithm to sample from this posterior. We employ a random walk proposal with a truncated normal distribution centred at the current value of \(\nu\) and with a fixed standard deviation.
log_kernel_nu <- function(nu, lambda, a, b) {

  T      = length(lambda)
  log_c  = -T * lgamma(nu / 2) + (T * nu / 2) * log((nu - 2) / 2)
  log_k  = -((nu + 2) / 2) * sum(log(lambda)) - ((nu - 2) / 2) * sum(1 / lambda)
  log_nu = (a - 1) * log(nu) - b * nu
  return(log_c + log_k + log_nu)
}

sample_nu <- function(nu, lambda, a, b, proposal_sd) {

  T            = length(lambda)
  nu_proposal  = RcppTN::rtn(nu, proposal_sd, 0, Inf)
  kernel_ratio = exp(log_kernel_nu(nu_proposal, lambda, a, b) - log_kernel_nu(nu, lambda, a, b))
  # Hastings correction: density of the reverse move divided by density of the forward move
  candidate_ratio = RcppTN::dtn(nu, nu_proposal, proposal_sd, 0, Inf) / RcppTN::dtn(nu_proposal, nu, proposal_sd, 0, Inf)
  accept_ratio    = kernel_ratio * candidate_ratio

  if (runif(1) < accept_ratio) {
    nu <- nu_proposal
  }
  return(nu)
}
Laplace prior for the trend component
To apply the Laplace prior to the trend component, we let each \(\lambda_t\) follow an exponential distribution with mean \(\alpha\), \(\lambda_t \sim exp({\frac{1}{\alpha}})\), and set the prior of the trend component to \(\tau|\tau_0,\sigma_\eta^2,\lambda \sim \mathcal{N}(H^{-1}i\tau_0, \sigma_\eta^2 H^{-1}\Omega(H^{-1})')\). Then the implied marginal distribution of \(\tau\) is a Laplace distribution.
In this model \(\Omega=diag(\lambda_1,\lambda_2,...,\lambda_T)\), with each \(\lambda_t\) drawn independently from the exponential distribution.
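A minimal sketch of this prior, with illustrative values of \(\alpha\) and \(T\): the latent scales are drawn from the exponential distribution and collected in \(\Omega\).

# draw the latent scales lambda_t and form Omega (alpha and T are illustrative)
alpha  = 2
T      = 200
lambda = rexp(T, rate = 1/alpha)   # lambda_t ~ Exp with mean alpha
Omega     = diag(lambda)
Omega.inv = diag(1/lambda)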
Due to the changed prior distribution of the trend component, we now have different full conditional posterior distributions for the parameters; in this note we focus only on the derivations for \(\tau\) and \(\lambda_t\). The change of the prior distribution for \(\tau\) gives the following:
\[\tau =y - \epsilon\]
\[\tau = H^{-1}i\tau_0 + H^{-1} \eta \]
\[\tau|\eta,\lambda \sim \mathcal{N}(H^{-1}i\tau_0, \sigma_\eta^2 H^{-1}\Omega(H^{-1})') \]
\[\lambda_t \sim exp({\frac{1}{\alpha}})\]
Full-conditional posterior distribution for \(\tau|\tau_0,\sigma^2,\Omega\)
Conditional Likelihood: \[\tau =y - \epsilon\]
\[\epsilon \sim \mathcal{N}(0_T, \sigma^2I_T)\]
\[L(\tau|y,\sigma^2) = exp({-\frac{1}{2}\frac{1}{\sigma^2}(\mathbf{y} - \boldsymbol\tau)'(\mathbf{y} - \boldsymbol\tau)})\]
The prior distribution of \(\tau\) is:
\[p(\tau|\tau_0,\sigma_\eta^2,\Omega) \propto exp(-\frac{1}{2}\frac{1}{\sigma_\eta^2}(\tau - H^{-1}i\tau_0)'H'\Omega^{-1}H(\tau - H^{-1}i\tau_0))\]
Full-conditional posterior distribution of \({\tau}|y,\sigma_\eta^2,\Omega\):
\[\begin{align} P(\tau|y,\tau_0,\sigma^2,\Omega) &\propto L(\tau|y,\sigma^2)\times p(\tau|\tau_0,\sigma_\eta^2,\Omega)\\ &\propto \exp\left\{-\frac{1}{2}\sigma^{-2}\left(\tau'\tau -2y'\tau+y'y\right)\right\}\\ &\qquad\times \exp\left\{-\frac{1}{2}\sigma_\eta^{-2}\left(\tau'H'\Omega^{-1}H\tau - 2\tau'H'\Omega^{-1}i\tau_0 +(H^{-1}i\tau_0)'H'\Omega^{-1}H(H^{-1}i\tau_0)\right)\right\}\\ &\propto \exp\left\{-\frac{1}{2}\left[\tau'\left(\sigma^{-2}I_T+\sigma_\eta^{-2}H'\Omega^{-1}H\right)\tau-2\tau'\left(\sigma^{-2}y+\sigma_\eta^{-2}H'\Omega^{-1}i\tau_0\right)\right]\right\} \end{align}\]
which results in
\[\begin{align} \tau|y,\tau_0,\sigma^2,\Omega&\sim N(\bar{\tau},\bar{V_\tau})\\[1ex] \bar{V_\tau} &= (\sigma^{-2}I_T+\sigma_\eta^{-2}H'\Omega^{-1}H)^{-1}\\ \bar{\tau} &=\bar{V_\tau} (\sigma^{-2}y+\sigma_\eta^{-2}H'\Omega^{-1}i\tau_0) \end{align}\]
Full-conditional posterior distribution for \(\lambda_t|y,\tau,\sigma_\eta^2\)
Conditional Likelihood:
Define: \(H\tau - i\tau_0=\eta\)
\[\begin{align} L(\lambda_t|y,\tau_0,\tau,\sigma_\eta^2)&\propto \left(\prod^{T}_{t = 1} \lambda_t^{-\frac{1}{2}}\right) \exp\left\{-\frac{1}{2}\frac{1}{\sigma^2_\eta}(i\tau_0-H\tau)'\Omega^{-1}(i\tau_0-H\tau)\right\}\\ &= \prod^{T}_{t = 1} \lambda_t^{-\frac{1}{2}}\exp\left\{-\frac{1}{2}\frac{1}{\sigma^2_\eta}\frac{1}{\lambda_t}\eta_t'\eta_t\right\} \end{align}\]
The prior distribution of \(\lambda_t\) is:
\[ P(\lambda_t) =\frac{1}{\alpha}\exp\left\{-\frac{1}{\alpha}\lambda_t\right\} \]
The full-conditional posterior distribution of \(\lambda_t\) is:
\[\begin{align} P(\lambda_t|y,\tau,\sigma_\eta^2) &\propto \lambda_t^{-\frac{1}{2}+1-1}\exp\left\{-\frac{1}{2}\left(\frac{\eta_t'\sigma_\eta^{-2}\eta_t}{\lambda_t}+\frac{2}{\alpha}\lambda_t\right)\right\} \end{align}\]
The above expression can be rearranged in the form of a Generalized inverse Gaussian distribution kernel as follows:
\[\begin{align} \lambda_t|Y,A,\Sigma &\sim GIG(a,b,p) \\ \\ a &=\frac{2}{\alpha} \\ b &= \eta_t'\sigma_\eta^{-2}\eta_t \\ p &= -\frac{1}{2}+1 \end{align}\]
R Function for Gibbs Sampler
UC.AR.Gibbs.sampler = function(S, starting.values, priors){

  aux = starting.values
  p   = length(aux$alpha)
  T   = nrow(aux$Y)
  H   = priors$H
  i   <- matrix(0, T, 1)
  i[1, 1] <- 1

  posteriors = list(
    tau     = matrix(NA,T,S),
    epsilon = matrix(NA,T,S),
    lambda  = matrix(NA,T,S),
    tau_0   = matrix(NA,1,S),
    sigma   = matrix(NA,2,S)
  )

  alpha    <- 2
  lambda.0 <- rexp(T, rate = 1/alpha)
  lambda.priors          = list(alpha = 2)
  lambda.posterior.draws = array(NA,c(T,S+1))

  for (s in 1:S){
    if (s == 1) {
      lambda.s = lambda.0
    } else {
      lambda.s = lambda.posterior.draws[,s]
    }
    Omega     = diag(lambda.s)
    Omega.inv = diag(1/lambda.s)

    # Sampling tau
    ###########################
    V.tau.inv   = (1/aux$sigma[2])*diag(T) + (1/aux$sigma[1])*t(H) %*% Omega.inv %*% H
    V.tau.inv   = 0.5*(V.tau.inv + t(V.tau.inv))
    b.tau       = (1/aux$sigma[2])*aux$Y + (1/aux$sigma[1])*crossprod(H, Omega.inv%*%i%*%priors$tau_0)
    precision.L = t(bandchol(V.tau.inv))
    epsilon     = rnorm(T)
    b.tau.tmp   = forwardsolve(precision.L, b.tau)
    tau.draw    = backsolve(t(precision.L), b.tau.tmp + epsilon)
    aux$tau     = tau.draw

    # Sampling tau_0
    ###########################
    tau_0.v.inv = 1/priors$tau_0.v
    V.tau_0.bar = 1/((1/aux$sigma[1])*crossprod(i, Omega.inv%*%i) + tau_0.v.inv)
    tau_0.bar   = V.tau_0.bar %*% ( (1/aux$sigma[1])*t(i)%*%Omega.inv%*%H%*%aux$tau + tau_0.v.inv * priors$tau_0.m )
    tau_0.draw  = rnorm(1, as.numeric(tau_0.bar), sqrt(V.tau_0.bar))
    aux$tau_0   = as.vector(tau_0.draw)

    # Sampling sigma_epsilon
    ###########################
    sigma.eta.s    = as.numeric(priors$sigma.s + crossprod(H%*%aux$tau - i%*%aux$tau_0))
    sigma.eta.nu   = priors$sigma.nu + T
    sigma.eta.draw = sigma.eta.s/rchisq(1,sigma.eta.nu)
    sigma.eta.draw.inv = 1/(sigma.eta.draw)

    sigma.e.s    = as.numeric(priors$sigma.s + crossprod(aux$tau - aux$Y))
    sigma.e.nu   = priors$sigma.nu + T
    sigma.e.draw = sigma.e.s/rchisq(1,sigma.e.nu)
    aux$sigma    = c(sigma.eta.draw, sigma.e.draw)

    # Sampling lambda
    ###########################
    u.t = H%*%aux$tau - i%*%aux$tau_0
    # ---- loop lambda posterior ---- #
    c = -1/2 + 1
    a = 2 / lambda.priors$alpha
    for (x in 1:T){
      b = as.numeric(u.t[x,]^2 / aux$sigma[1])
      lambda.posterior.draws[x,s+1] = GIGrvg::rgig(1, lambda = c, chi = b, psi = a)
    }
    aux$lambda = lambda.posterior.draws[,s+1]

    posteriors$tau[,s]    = aux$tau
    posteriors$tau_0[,s]  = aux$tau_0
    posteriors$lambda[,s] = aux$lambda
    posteriors$sigma[,s]  = aux$sigma

  }
  output = list(
    posterior = posteriors,
    last.draw = aux
  )
  return(output)
}
Model extensions
Estimation of autoregressive parameters for cycle component
We consider an extension of the model by a zero-mean autoregressive equation for \(\epsilon_{t}\):
\[\begin{align} \epsilon_{t}=\alpha_{1}\epsilon_{t-1}+...+\alpha_{p}\epsilon_{t-p}+e_{t} \end{align}\]
Where:
\[\begin{align} e_{t}\sim\mathcal{N}\left(0,\sigma_e^{2}\right) \end{align}\]
Define the autoregressive parameter vector \(\alpha=\left(\alpha_{1},...,\alpha_{p}\right)'\). The prior distribution for \(\alpha\) is then:
\[\begin{align} \alpha\sim\mathcal{N}_{p}\left(\underline{\alpha},\sigma_{\alpha}^{2}I_{p}\right)\mathcal{I}\left(\alpha\in A\right)\propto\exp\left(-\frac{1}{2}\frac{1}{\sigma_{\alpha}^{2}}\left(\alpha-\underline{\alpha}\right)'\left(\alpha-\underline{\alpha}\right)\right)\mathcal{I}\left(\alpha\in A\right) \end{align}\]
Where \(\alpha\in A\) denotes the set of parameters for \(\alpha\) in which stationarity holds:
\[\begin{align} \mathcal{I}\left(\alpha\in A\right)=\begin{cases} 1 & \text{if }\alpha\in A\\ 0 & \text{otherwise} \end{cases} \end{align}\] This condition is not implemented in the code though.
Using the error component \(\boldsymbol\epsilon\) from the simple unobserved component model, write:
\[\begin{align} e=\epsilon-X_{\epsilon}\alpha \end{align}\] where \(\epsilon\) and \(X_{\epsilon}\) are \(T\times 1\) and \(T\times p\) matrices respectively collecting the terms from the autoregressive equation for all periods.
Then we have the likelihood: \[\begin{align} L\left(\alpha|\mathbf{y},\boldsymbol\epsilon,\sigma_{\alpha}^{2}\right)\propto\exp\left(-\frac{1}{2\sigma_e^{2}}\left(X_{\epsilon}\alpha-\epsilon\right)'\left(X_{\epsilon}\alpha-\epsilon\right)\right) \end{align}\]
So the full-conditional posterior distribution for \(\alpha\) is then given by: \[\begin{align} p\left(\alpha|y,\epsilon,\sigma_{\alpha}^{2}\right)&\propto L\left(\alpha|y,\epsilon,\sigma_{\alpha}^{2}\right)p\left(\alpha\right)\mathcal{I}\left(\alpha\in A\right)\\ &=\exp\left(-\frac{1}{2\sigma_e^{2}}\left(X_{\epsilon}\alpha-\epsilon\right)'\left(X_{\epsilon}\alpha-\epsilon\right)\right)\exp\left(-\frac{1}{2}\frac{1}{\sigma_{\alpha}^{2}}\left(\alpha-\underline{\alpha}\right)'\left(\alpha-\underline{\alpha}\right)\right)\mathcal{I}\left(\alpha\in A\right)\\ &=\exp\left(-\frac{1}{2}\left[\frac{1}{\sigma_{e}^{2}}\left(X_{\epsilon}\alpha-\epsilon\right)'\left(X_{\epsilon}\alpha-\epsilon\right)+\frac{1}{\sigma_{\alpha}^{2}}\left(\alpha-\underline{\alpha}\right)'\left(\alpha-\underline{\alpha}\right)\right]\right)\mathcal{I}\left(\alpha\in A\right)\\ &=\exp\left(-\frac{1}{2}\left(\alpha'\overline{V}_{\alpha}^{-1}\alpha-2\alpha'\overline{V}_{\alpha}^{-1}\overline{\alpha}+...\right)\right)\mathcal{I}\left(\alpha\in A\right) \end{align}\]
Hence we have the full-conditional posterior as:
\[\begin{align} p\left(\alpha|y,\epsilon,\sigma_{\alpha}^{2}\right) &=\mathcal{N}_{p}\left(\overline{\alpha},\overline{V}_{\alpha}\right)\mathcal{I}\left(\alpha\in A\right)\\ \overline{V}_{\alpha}&=\left[\sigma_{e}^{-2}X_{\epsilon}'X_{\epsilon}+\sigma_{\alpha}^{-2}\mathbf{I}_{p}\right]^{-1}\\ \overline{\alpha}&=\overline{V}_{\alpha}\left[\sigma_{e}^{-2}X_{\epsilon}'\epsilon+\sigma_{\alpha}^{-2}\underline{\alpha}\right] \end{align}\]
sample_alpha = function(epsilon, X_epsilon, priors, sigma2_e){

  p = ncol(X_epsilon)

  # full conditional posterior distribution of alpha
  V.alpha.bar = solve((1/priors[2]) * diag(p) + (1/sigma2_e)* t(X_epsilon)%*%X_epsilon)
  alpha.bar   = V.alpha.bar %*% ( priors[1]/priors[2] + (1/sigma2_e)*as.numeric(t(X_epsilon)%*%epsilon) )
  draw        = mvtnorm::rmvnorm(1, as.vector(alpha.bar), V.alpha.bar)

  return(draw)
}
Autoregressive cycle component
In this section, we will modify the error component \(\epsilon_{t}\) from the simple local-level model to take the form of an autoregressive (AR) expression as follows \[\begin{align} \epsilon_{t} &= \alpha_{1}\epsilon_{t-1} + \alpha_{2}\epsilon_{t-2} + \cdots + \alpha_{p}\epsilon_{t-p} + e_{t},\\ e_{t}&\sim \mathcal{N}(0,\sigma_{e}^{2}). \end{align}\]
Collect the autoregressive parameters in a \(p\)-vector \(\boldsymbol\alpha = (\alpha_{1},\alpha_{2}, \cdots, \alpha_{p})'\), where \(\boldsymbol\alpha \in A\), with \(A\) denoting the set of values for which the stationarity restriction holds.
By modelling the error component in an autoregressive form, we enable our model to capture the cyclical component, capturing its negative autocorrelations, and allowing the series to deviate from the long-term stochastic trend (\(\tau_{t}\)) in the short run.
The matrix notation can be represented as:
\[\begin{align} \mathbf{H}_{\alpha} \boldsymbol\epsilon &= \mathbf{e} \\ \boldsymbol\epsilon &= H_{\alpha}^{-1}\mathbf{e} \\ \mathbf{e} &\sim \mathcal{N}_{T}(\mathbf{0}_{T},\sigma_{e}^{2} \mathbf{I}_{T}) \end{align}\]Where
\[ \mathbf{H}_{\alpha} = \begin{bmatrix} 1 &0 &0 &0 &0 &\cdots &0 &0 &0 &0 \\ -\alpha_{1} &1 &0 &0 &0 &\cdots &0 &0 &0 &0 \\ -\alpha_{2} &-\alpha_{1} &1 &0 &0 &\cdots &\vdots &\vdots &\vdots &\vdots \\ -\alpha_{3} &-\alpha_{2} &-\alpha_{1} &1 &0 &\ddots &\vdots &\vdots &\vdots &\vdots \\ \vdots &\vdots &\ddots &\ddots & \ddots &\ddots &\vdots &\vdots &\vdots &\vdots \\ -\alpha_{p} &-\alpha_{p-1} &-\alpha_{p-2} &\ddots &\ddots &\ddots &0 &0 &0 &0 \\ \vdots &\vdots &\vdots &\vdots &\ddots &\ddots &\ddots &\vdots &\vdots &\vdots \\ \vdots &\vdots &\vdots &\vdots &\cdots &\ddots &-\alpha_{1} &1 &0 &0 \\ 0 &0 &0 &0 &\cdots &\cdots &-\alpha_{2} &-\alpha_{1} &1 &0 \\ 0 &0 & 0 &0 &\cdots &\cdots &-\alpha_{3} &-\alpha_{2} &-\alpha_{1} &1 \end{bmatrix}_{T\times T} \]
\[\begin{align} \boldsymbol\epsilon = \begin{bmatrix} \epsilon_{1} \\ \epsilon_{2}\\ \vdots\\ \epsilon_{T}\\ \end{bmatrix}_{T\times 1} \qquad and \qquad \mathbf{e} = \begin{bmatrix} e_{1} \\ e_{2}\\ \vdots\\ e_{T}\\ \end{bmatrix}_{T\times 1} \end{align}\]Typically, we set the initial values \(\boldsymbol\epsilon_{0}\) to zero, since this aligns with the stationarity assumption. In this note, however, the values \(\boldsymbol\epsilon_{0}\) are estimated from the autoregressive model, which is facilitated by assuming a normal prior distribution: \(\boldsymbol\epsilon_{0} \sim \mathcal{N}_{p}(\mathbf{0}_{p},\sigma_{0}^{2}\mathbf{I}_{p})\) and that \(\mathbf{e}_{0} \sim \mathcal{N}_{p}(\mathbf{0}_{p},\sigma_{e}^{2}\mathbf{I}_{p})\). Define:
\[\begin{align} \boldsymbol\epsilon_{0} = \begin{bmatrix} \epsilon_{-p+1} \\ \epsilon_{-p+2} \\ \vdots\\ \epsilon_{0} \end{bmatrix}_{p\times 1}, \qquad and \qquad \mathbf{e}_{0} = \begin{bmatrix} e_{-p+1} \\ e_{-p+2} \\ \vdots\\ e_{0} \end{bmatrix}_{p\times 1} \end{align}\]We then combine \(\boldsymbol\epsilon\) and \(\boldsymbol\epsilon_{0}\) together. This can be represented using the notations \(\mathbf{H}_{\alpha}^{*}\), \(\boldsymbol\epsilon^{*}\), and \(\mathbf{e}^{*}\).
\(\mathbf{H}_{\alpha}^{*}\) is constructed in the same way as \(\mathbf{H}_{\alpha}\), but the dimension increases to \((T+p)\times (T+p)\).
\[\begin{align} \boldsymbol\epsilon^{*} = \begin{bmatrix} \epsilon_{0} \\ \epsilon \end{bmatrix}_{(T+p)\times 1} = \begin{bmatrix} \epsilon_{-p+1} \\ \epsilon_{-p+2} \\ \vdots\\ \epsilon_{1} \\ \epsilon_{2} \\ \vdots\\ \epsilon_{T} \end{bmatrix}_{(T+p)\times 1} \qquad and \qquad \mathbf{e}^{*} = \begin{bmatrix} e_{0} \\ e \end{bmatrix}_{(T+p)\times 1} = \begin{bmatrix} e_{-p+1} \\ e_{-p+2} \\ \vdots\\ e_{1} \\ e_{2} \\ \vdots\\ e_{T} \end{bmatrix}_{(T+p)\times 1} \end{align}\]The matrix notation can be represented as:
\[\begin{align} \mathbf{H}_{\alpha}^{*} \boldsymbol\epsilon^{*} &=\mathbf{e}^{*} \\ \boldsymbol\epsilon^{*} &= \mathbf{H}^{*-1}_{\alpha}\mathbf{e}^{*} \\ \mathbf{e}^{*} &\sim \mathcal{N}_{T+p}(\mathbf{0}_{T+p},\sigma_{e*}^{2}\mathbf{I}_{T+p}) \end{align}\]Then
\[\begin{align} \boldsymbol\epsilon^{*}|\boldsymbol\alpha,\sigma_{e}^2 &\sim \mathcal{N}_{T+p}(\mathbf{0}_{T+p},\underbrace{\sigma_{e*}^2(\mathbf{H}_{\alpha}^{*'}\mathbf{H}_{\alpha}^{*})^{-1}}_{{\boldsymbol\Omega}}) \end{align}\] \[\begin{align} \boldsymbol\Omega = \begin{bmatrix} \boldsymbol\Omega_{11}& \boldsymbol\Omega_{12}\\ \boldsymbol\Omega_{21}& \boldsymbol\Omega_{22} \end{bmatrix} \text{ with dimensions } \begin{bmatrix} p\times p&p\times T\\ T\times p&T\times T \end{bmatrix}_{(T+p)\times(T+p)} \end{align}\]The prior distribution
Using the conditional distributions of the multivariate normal distribution, we can find the prior distributions of \(\boldsymbol\epsilon\) and \(\boldsymbol\epsilon_{0}\) as follows.
- The prior distribution of \(\boldsymbol\epsilon\): by the conditional distribution of a multivariate normal, \(\boldsymbol\epsilon \mid \boldsymbol\epsilon_{0},\boldsymbol\alpha,\sigma_{e}^2 \sim \mathcal{N}_{T}\left(\boldsymbol\Omega_{21}\boldsymbol\Omega_{11}^{-1}\boldsymbol\epsilon_{0},\ \boldsymbol\Omega_{22}-\boldsymbol\Omega_{21}\boldsymbol\Omega_{11}^{-1}\boldsymbol\Omega_{12}\right)\)
- The prior distribution of \(\boldsymbol\epsilon_{0}\): analogously, \(\boldsymbol\epsilon_{0} \mid \boldsymbol\epsilon,\boldsymbol\alpha,\sigma_{e}^2 \sim \mathcal{N}_{p}\left(\boldsymbol\Omega_{12}\boldsymbol\Omega_{22}^{-1}\boldsymbol\epsilon,\ \boldsymbol\Omega_{11}-\boldsymbol\Omega_{12}\boldsymbol\Omega_{22}^{-1}\boldsymbol\Omega_{21}\right)\)
Likelihood
From the simple local-level model we can write
\[\begin{align} \mathbf{y} &= \boldsymbol\tau + \boldsymbol\epsilon,\\ \mathbf{H}\boldsymbol\tau &= \mathbf{i} \tau_0 + \boldsymbol\eta,\\ \boldsymbol\tau &= \mathbf{H}^{-1} \mathbf{i} \tau_0 + \mathbf{H}^{-1} \boldsymbol\eta \end{align}\]Then,
\[\begin{align} \boldsymbol\epsilon &= \mathbf{y} - \boldsymbol\tau \\ \boldsymbol\epsilon &\sim N(\mathbf{y}-\mathbf{H}^{-1} \mathbf{i} \tau_0, \sigma_{\eta}^2 (\mathbf{H}'\mathbf{H})^{-1})\\ L(\boldsymbol\epsilon|\mathbf{y},\boldsymbol\tau,\tau_0, \sigma_{\eta}^2) &\propto exp\{-\frac{1}{2}\frac{1}{\sigma_{\eta}^2}(\boldsymbol\epsilon-(\mathbf{y}-\mathbf{H}^{-1} \mathbf{i} \tau_0))' \mathbf{H}'\mathbf{H} (\boldsymbol\epsilon-(\mathbf{y}-\mathbf{H}^{-1} \mathbf{i} \tau_0))\} \end{align}\]The full-conditional posterior distribution
- The full-conditional posterior distribution of \(\boldsymbol\epsilon\)
- The full-conditional posterior distribution of \(\boldsymbol\epsilon_{0}\)
The Sampler function
Given the priors and the other sampled parameters from another function, as well as some fixed matrices such as \(\mathbf{H}'\mathbf{H}\) and \(\mathbf{H^{*}}_{\alpha}'\mathbf{H^{*}}_{\alpha}\), we can sample each of \(\epsilon\) and \(\epsilon_{0}\) using the following function.
Sampling.epsilon.epsilonzero = function(starting.values, priors){
  # the objects sigma.e.star, sigma.zero, HH, and i are assumed to be
  # available from the calling function, as described above

  aux = starting.values
  p   = length(aux$alpha)
  T   = nrow(aux$Y)
  Ha  = diag(T + p)
  for (j in 1:p) {
    mgcv::sdiag(Ha, -j) <- - aux$alpha[j]
  }
  HaHa = crossprod(Ha)

  # Start calculating (and updating parameters)
  ###########################
  epsilon.star = rbind(aux$epsilon.zero, aux$epsilon)
  omega        = sigma.e.star*solve(HaHa)
  omega_11     = omega[1:p, 1:p]
  omega_12     = omega[1:p, (p+1):(p+T)]
  omega_21     = omega[(p+1):(p+T), 1:p]
  omega_22     = omega[(p+1):(p+T), (p+1):(p+T)]
  epsilon        = omega_21%*%solve(omega_11)%*%aux$epsilon.zero
  epsilon.zero   = omega_12%*%solve(omega_22)%*%aux$epsilon
  V.epsilon      = omega_22 - omega_21%*%solve(omega_11)%*%omega_12
  V.epsilon.zero = omega_11 - omega_12%*%solve(omega_22)%*%omega_21

  # Sampling epsilon
  ###########################
  V.epsilon.post.inv = solve(V.epsilon) + (1/aux$sigma.eta)*HH
  b.epsilon          = (1/aux$sigma.eta)*HH%*%(aux$Y - cumsum(i%*%priors$tau_0)) +
    solve(V.epsilon)%*%epsilon
  precision.L   = t(chol(V.epsilon.post.inv))
  epsilon.n     = rnorm(T)
  b.epsilon.tmp = solve(precision.L, b.epsilon)
  epsilon.draw  = solve(t(precision.L), b.epsilon.tmp + epsilon.n)
  aux$epsilon   = epsilon.draw

  # Sampling epsilon.zero
  ###########################
  V.epsilon.zero.post.inv = solve(V.epsilon.zero) + (1/sigma.zero)*diag(p)
  b.epsilon.zero          = solve(V.epsilon.zero)%*%epsilon.zero
  precision.L        = t(chol(V.epsilon.zero.post.inv))
  epsilon.n          = rnorm(p)
  b.epsilon.zero.tmp = solve(precision.L, b.epsilon.zero)
  epsilon.zero.draw  = solve(t(precision.L), b.epsilon.zero.tmp + epsilon.n)
  aux$epsilon.zero   = epsilon.zero.draw

  # The draw
  output = list(
    epsilon_draw      = epsilon.draw,      # Drawn epsilon values
    epsilon_zero_draw = epsilon.zero.draw  # Drawn epsilon.zero values
  )
  return(output)
}
Random walk with time-varying drift parameter
Consider a simple Unobserved Component model with time-varying drift parameter \[\begin{align} \tau_{t} &= \mu_{s_t} + \tau_{t-1} + \eta_t,\\ \eta_t &\sim \mathcal{N}(0, \sigma_{\eta}^2), \end{align}\] where \(\mu_{s_t}\) is a regime-dependent mean term, and \(s_t\) is a regime state variable. Assume there are two regimes, for example, before and after the exchange rate crisis in 1985. Define:
\[s_t = \left\{\begin{array}{ll}1 &\text{ for }t < Tb \\ 2 &\text{ for }t \geq Tb\end{array}\right.\]
To write a model in a matrix notation, define vector \(\boldsymbol\beta = \begin{bmatrix}\tau_0 &\mu_1 &\mu_2\end{bmatrix}'\) and matrix: \[\begin{align} \mathbf{X}_{\tau} = \begin{bmatrix} 1 & 1 & 0 \\ 0 & 1 & 0 \\ 0 & 1 & 0 \\ \vdots & \vdots & \vdots \\ 0 & 1 & 0 \\ 0 & 0 & 1 \\ 0 & 0 & 1 \\ \vdots & \vdots & \vdots \\ 0 & 0 & 1 \end{bmatrix} \end{align}\] Then the model is written as \[ \mathbf{H}\boldsymbol\tau = \mathbf{X}_{\tau}\boldsymbol\beta+\boldsymbol\eta\\ \] Prior distribution \[p(\boldsymbol\beta) = exp\left(-\frac{1}{2}(\boldsymbol\beta - \underline{\boldsymbol\beta})'\mathbf{\underline{V}}_\beta^{-1}(\boldsymbol\beta - \underline{\boldsymbol\beta})\right)\]
Full conditional posterior distribution \[\begin{align} p(\boldsymbol\beta|y,\boldsymbol\tau,\sigma_\eta^2) &= \mathcal{N}_3(\overline{\boldsymbol\beta},\mathbf{\overline{V}}_\beta)\\[1ex] \mathbf{\overline{V}}_{\beta} &= [\sigma_{\eta}^{-2}\mathbf{X}_\tau'\mathbf{X}_\tau+\mathbf{\underline{V}}_\beta^{-1}]^{-1}\\ \bar{\boldsymbol\beta} &= \mathbf{\overline{V}}_{\beta}[\sigma_{\eta}^{-2}\mathbf{X}_\tau'\mathbf{H}\boldsymbol\tau + \mathbf{\underline{V}}_\beta^{-1}\underline{\boldsymbol\beta}] \end{align}\]
# function to draw beta
draw_beta <- function(Tb, tau, sigma_eta_sq, beta_prior, V_prior) {

  T = length(tau)

  X_tau <- matrix(0, nrow = T, ncol = 3)
  X_tau[1, 1]        <- 1   # Column for tau_0
  X_tau[1:(Tb-1), 2] <- 1   # Column for mu1 for t < Tb
  X_tau[Tb:T, 3]     <- 1   # Column for mu2 for t >= Tb

  # Define H
  H <- diag(T)
  for (i in 2:T) {
    H[i, i-1] <- -1
  }

  V_prior_inv <- solve(V_prior)

  V_bar_inv <- (1/sigma_eta_sq) * t(X_tau) %*% X_tau + V_prior_inv
  V_bar     <- solve(V_bar_inv)
  beta_bar  <- V_bar %*% ((1/sigma_eta_sq) * t(X_tau) %*% (H %*% tau)
                          + V_prior_inv %*% beta_prior)

  # Draw a sample from the multivariate normal distribution
  beta_posterior <- MASS::mvrnorm(1, beta_bar, V_bar)

  return(beta_posterior)
}
Student-t error terms
T-distributed error terms enhance robustness against outliers in data. This section outlines a method to incorporate t-distributed error terms into unobserved component models where the error term structure is as follows:
\[ \boldsymbol\epsilon\sim N_{T}(0_{T},{\sigma^{2}}diag({\boldsymbol\lambda)}) \]
where vector \(\boldsymbol\lambda = (\lambda_1, \dots, \lambda_T)'\) collects the auxiliary latent variables. Each \(\lambda_{t}\) independently follows the prior: \[\lambda_t\sim\mathcal{IG}2(\nu,\nu)\] which makes \(\epsilon_{t}\) marginally Student-t-distributed.
The likelihood is given by:
\[ L(\mathbf{y}|\boldsymbol\tau,\sigma^{2},\boldsymbol\lambda) = (\sigma^2)^{-\frac{T}{2}}\left(\prod_{t=1}^{T}(\lambda_{t})^{-\frac{1}{2}}\right) \exp\left\{-\frac{1}{2}\frac{1}{\sigma^{2}}\sum_{t=1}^{T}\frac{1}{\lambda_{t}}(y_t-\tau_t)^2\right\}. \]
The prior density of \(\lambda_t\) is proportional to
\[ \lambda_{t}^{-\frac{\nu+2}{2}} \exp\left\{-\frac{1}{2}\frac{\nu}{\lambda_{t}}\right\}. \]
Combining the likelihood and the prior
\[\begin{align} p(\lambda_{t}|y,\tau,\sigma^{2})&\propto L(\lambda_{t},\tau,\sigma^{2}|y)p(\lambda_{t}|\nu)\\ &= (\lambda_{t})^{-\frac{1}{2}}\exp\left\{-\frac{1}{2}\frac{1}{\sigma^{2}}\frac{1}{\lambda_{t}}(y_t-\tau_t)^2\right\} \lambda_{t}^{-\frac{\nu+2}{2}} \exp\left\{-\frac{1}{2}\frac{\nu}{\lambda_{t}}\right\} \end{align}\]
yields the posterior kernel
\[ (\lambda_{t})^{-\frac{1+\nu+2}{2}}\exp\left\{-\frac{1}{2}\frac{1}{\lambda_{t}}\left[ \nu + \frac{\epsilon_t^2}{\sigma^{2}}\right]\right\} \]
where \(\epsilon_{t}=y_{t}-\tau_{t}\). Therefore, the full conditional posterior for each \(\lambda_t\) is:
\[\lambda_t \mid y_t, \tau_t, \sigma^{2}, \nu \sim \mathcal{IG}2\left(\nu+\frac{\epsilon_{t}^2}{\sigma^{2}},\ 1+\nu\right)\]
lambda_t <- function(y_t, tau_t, sigma2, nu){

  e_t    <- y_t - tau_t
  lambda <- MCMCpack::rinvgamma(1, (1 + nu)/2, (nu + (e_t^2) / sigma2)/2)

  return(lambda)
}
Conditional heteroskedasticity
Modelling conditional heteroskedasticity with Stochastic Volatility models provides more flexibility, enhances forecasting performance, and improves the in-sample fit of the model and the estimation precision.
Basic model
Given that the shocks \(\epsilon_t\) are heteroskedastic with conditional variances \(\sigma_t^2\), the distribution of the shocks becomes
\[ \boldsymbol \epsilon \mid \tau \sim \mathcal{N}_T \left( 0_T, \operatorname{diag}(\boldsymbol\sigma^2) \right) \] where \(\boldsymbol\sigma^2 = \left(\sigma^2_1 , \ldots, \sigma^2_T \right)\) which follows a Stochastic Volatility process.
The likelihood function is derived as:
\[ L(\boldsymbol\tau,\boldsymbol\sigma^2|\mathbf{y}) = (2\pi)^{-\frac{T}{2}}\left|\operatorname{diag}(\boldsymbol\sigma^2)\right|^{-\frac{1}{2}}\exp\left\{-\frac{1}{2}(\mathbf{y} - \boldsymbol\tau)'\operatorname{diag}(\boldsymbol\sigma^2)^{-1}(\mathbf{y} - \boldsymbol\tau)\right\}. \]
The prior distribution for \(\boldsymbol\tau\) is:
\[\begin{align} \boldsymbol\tau \mid \tau_{0}, \sigma_\eta^2 &\sim \mathcal{N}_T \left( \mathbf{H}^{-1} \mathbf{i}\tau_{0}, \sigma_\eta^2 (\mathbf{H'H})^{-1} \right)\\ &\propto \exp \left\{ -\frac{1}{2} \frac{1}{\sigma_\eta^2} \left( \tau - H^{-1} \mathbf{i}\tau_{0} \right)' H' H \left( \tau - H^{-1} \mathbf{i}\tau_{0}\right) \right\} \end{align}\]
Then the full conditional posterior for \(\boldsymbol\tau\) is derived as:
\[\begin{align} p(\boldsymbol\tau \mid \mathbf{y}, \tau_{0}, \sigma_\eta^2, \sigma^2) &\propto L(\boldsymbol\tau,\sigma^2|\mathbf{y})p(\tau \mid \tau_{0}, \sigma_\eta^2)\\ &\propto exp\left\{-\frac{1}{2}(\mathbf{y} - \boldsymbol\tau)'\operatorname{diag}(\boldsymbol\sigma^2)^{-1}(\mathbf{y} - \boldsymbol\tau)\right\} \\ &\qquad\exp \left\{ -\frac{1}{2} \frac{1}{\sigma_\eta^2} \left( \boldsymbol\tau - \mathbf{H}^{-1} \mathbf{i}\tau_{0} \right)' \mathbf{H' H} \left( \boldsymbol\tau - \mathbf{H}^{-1} \mathbf{i}\tau_{0}\right) \right\} \end{align}\]
The full conditional posterior of \(\boldsymbol\tau\) is a normal distribution:
\[\begin{align} \boldsymbol\tau \mid \mathbf{y}, \tau_{0}, \sigma_\eta^2, \boldsymbol\sigma^2&\sim\mathcal{N}_T \left( \bar{\boldsymbol\tau}, \bar{\mathbf{V}}_\tau \right)\\ \bar{\mathbf{V}}_\tau &= \left[ \operatorname{diag}(\boldsymbol\sigma^2)^{-1} + \sigma_\eta^{-2} \mathbf{H' H} \right]^{-1}\\ \bar{\boldsymbol\tau} &= \bar{\mathbf{V}}_\tau \left[ \operatorname{diag}(\boldsymbol\sigma^2)^{-1} \mathbf{y} + \sigma_\eta^{-2} \mathbf{H}'\mathbf{i}\tau_{0} \right] \end{align}\]
Algorithm
The sampler for posterior parameters for \(\boldsymbol\tau\) is:
tau_hetero.sampler = function(y, aux){
  # aux includes
  #   sigma2     = a diagonal matrix of sigma2 from the Stochastic Volatility model
  #   tau0       = a scalar initial value of tau
  #   sigma_eta2 = a scalar sigma_eta^2
  # y = data

  T1 = nrow(y)

  # i_m = Tx1 matrix with 1 in the first row and 0 in all other rows
  i_m     = matrix(0, nrow = T1, ncol = 1)
  i_m[1,] = 1

  # H = TxT matrix with 1 on the main diagonal and -1 just below the diagonal
  H <- diag(T1)
  mgcv::sdiag(H,-1) <- -1

  sigma2         = aux$sigma2
  sigma2.inv     = diag(1/diag(sigma2))   # get inverse
  tau0           = aux$tau0
  sigma_eta2     = aux$sigma_eta2
  sigma_eta2.inv = 1/sigma_eta2           # get inverse

  # compute the posterior parameters
  V_post   = solve(sigma2.inv + sigma_eta2.inv*t(H)%*%H)
  tau_post = V_post%*%(sigma2.inv%*%y + sigma_eta2.inv * t(H)%*%(i_m*tau0))

  # sample tau from the multivariate normal and return the draw
  tau_draw <- mvtnorm::rmvnorm(1, as.vector(tau_post), V_post)

  return(tau_draw)
}
Bayesian forecasting
Bayesian forecasting is a coherent way of predicting the future probabilistically. The forecast is summarised by the predictive density, which is the probability distribution of future data conditional only on the observed data.
Predictive density
The predictive density is derived by writing down the joint distribution of future data and parameters conditional on the observed data, and then integrating out the parameters. By integrating the parameters out using their posterior distribution, the predictive density fully accounts for estimation uncertainty.
The h-period ahead predictive density is given by:
\[ \begin{aligned} p(y_{T+h}, \dots, y_{T+1} | \mathbf y) = \int\int &p(y_{T+h} | \tau_{T+h}, \sigma^{2}) p(\tau_{T+h} | \tau_{T+h-1}, \sigma^{2}_\eta)\\ &\dots\\ &\times p(y_{T+1} | \tau_{T+1}, \sigma^{2}) p(\tau_{T+1} | \tau_{T}, \sigma^{2}_\eta)\\ & \times p(\tau_{T}, \sigma^{2}, \sigma^{2}_\eta | \mathbf y) \ d\left(\tau_{T+h},\dots,\tau_{T+1}\right) d\left(\tau_{T}, \sigma^2,\sigma^2_\eta\right) \\ \end{aligned} \]
Sampling from the predictive density
The form of the predictive density above suggests the following general algorithm for an \(h\)-period-ahead forecast.
For each posterior draw \(\left\{ \tau_T^{(s)}, \sigma^{2(s)}, \sigma^{2(s)}_\eta \right\}\):
For \(i=1,\cdots,h:\)
Sample \(\tau_{T+i}^{(s)} \sim \mathcal N(\tau_{T+i-1}^{(s)}, \sigma^{2(s)}_\eta)\)
Sample \(y_{T+i}^{(s)} \sim \mathcal N(\tau_{T+i}^{(s)}, \sigma^{2(s)})\)
Return \(\left\{ y_{T+1}^{(s)},\cdots,y_{T+h}^{(s)} \right\}_{s=1}^S\)
Here is an implementation in R:
#' @title h-step ahead Bayesian forecasting of the local-level model
#'
#' @param h Number of periods ahead
#' @param posterior.tauT Sx1 vector of trend component at time T
#' @param posterior.sigma2 Sx1 vector of observation equation error variance
#' @param posterior.sigma2eta Sx1 vector of state equation error variance
#'
#' @return hxS matrix of forecast values
forecast = function(h,
                    posterior.tauT,
                    posterior.sigma2,
                    posterior.sigma2eta) {
  S = length(posterior.tauT)
  y = matrix(NA, nrow = h, ncol = S)

  for (s in 1:S) {
    tauT      = posterior.tauT[s]
    sigma2    = posterior.sigma2[s]
    sigma2eta = posterior.sigma2eta[s]

    for (t in 1:h) {
      tauT1   = rnorm(1, tauT, sqrt(sigma2eta))
      y[t, s] = rnorm(1, tauT1, sqrt(sigma2))
      tauT    = tauT1    # propagate the trend to the next horizon
    }
  }

  return(y)
}
Citation
@online{woźniak2024,
author = {Woźniak, Tomasz and Berke, Stephan and Buan, Carl and Chen,
Gezhi and Gussen, Ben and Kang, Inhye and Li, Zheyuan and Liu, Rui
and Pang, Qingqing and Pham, Nhu and Suparattanapinan, Pun and Yang,
Yansong and Yang, Yuqiao and Wang, Adam and Wu, Yufei},
title = {Bayesian {Unobserved} {Component} {Models}},
date = {2024-05-01},
url = {https://donotdespair.github.io/Bayesian-Unobserved-Component-Models/},
doi = {10.26188/25814617},
langid = {en}
}