# Self-concordant empirical likelihood for a vector mean,
# as described in:
#
# @article{owen:2013,
# title={Self-concordance for empirical likelihood},
# author={Owen, A. B.},
# journal={Canadian Journal of Statistics},
# volume={41},
# number={3},
# pages={387--397},
# year={2013}
# }
# Copyright 2014-2015 Board of Trustees, Leland Stanford Junior University
#
# Released under BSD-3-Clause
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
# AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# This file has two callable functions:
# emplik does one EL calculation,
# tracelr calls emplik on a trajectory from mu0 to mu1 in N+1 steps
# emplik returns:
# logelr log empirical likelihood ratio
# lam Lagrange multiplier (vector of length d)
# wts observation weights/probabilities (vector of length n)
# converged TRUE if algorithm converged
# FALSE usually means that mu is not in the convex hull of the data
# you then get a very small likelihood (instead of zero)
# iter number of iterations taken
# ndec Newton decrement (see Boyd & Vandenberghe)
# gradnorm norm of gradient of log empirical likelihood
# tracelr returns: a matrix with one row at each mean from mu0 to mu1
# and with a column for each EL return value (except wts)
# There are also two internal functions:
# mllog gives a family of alternatives to -log() and derivative thereof
# in order to attain self-concordance.
# svdlm does least squares regression via the SVD
# These lengthen the emplik function, but making them internal keeps them
# out of the users' name spaces
# Art B. Owen, Feb 2014
emplik = function( z, # matrix with one data vector per row, a column vector is ok when d=1
mu, # hypothesized mean, default (0 ... 0) in R^d
lam, # starting lambda, default (0 ... 0)
eps, # lower cutoff for -log( ), default 1/nrow(z)
M, # upper cutoff for -log( ), default Inf
thresh=1e-30, # convergence threshold for log likelihood (default is aggressive)
itermax=100, # upper bound on number of Newton steps (seems ample)
verbose=FALSE) # controls printed output
{
# empirical likelihood test for whether
# mean of (rows of) z is mu
# Internal function mllog, modified -log( ) with derivatives
mllog = function( x, eps, M, der=0 ){
# minus log and its first der derivatives, on eps < x < M
# 4th order Taylor approx to left of eps and right of M
# der = 0 or 1 or 2
# 4th order is lowest that gives self concordance
if( missing(M) )
M = Inf
if( eps>M )
stop("Thresholds out of order")
lo = x < eps
hi = x > M
md = (!lo) & (!hi)
# Coefficients for 4th order Taylor approx below eps
coefs = rep(0,5)
coefs[1] = -log(eps)
coefs[2:5] = (-eps)^-(1:4)/(1:4)
# Coefficients for 4th order Taylor approx above M
Coefs = rep(0,5)
Coefs[1] = -log(M)
Coefs[2:5] = (-M)^-(1:4)/(1:4)
# degree 4 polynomial approx to log
h = function(y,cvals){ # cvals are coefs at eps, Coefs at M
# sum c[t+1] y^t
tee = 1:4
ans = y*0
ans = ans + cvals[1]
for( j in tee )
ans = ans + y^j*cvals[j+1]
ans
}
# first derivative of h at y, from approx at pt
hp = function(y,pt){
tee = 0:3
ans = y*0
for( j in tee )
ans = ans + (-y/pt)^j
ans = ans * (-pt)^-1
ans
}
# second derivative of h at y, from approx at pt
hpp = function(y,pt){
tee = 0:2
ans = y*0
for( j in tee )
ans = ans + (j+1) * (-y/pt)^j
ans = ans *(-pt)^-2
ans
}
# function value
f = x*0
f[lo] = h( x[lo]-eps, coefs )
f[hi] = h( x[hi]-M, Coefs )
f[md] = -log(x[md])
if( der<1 )return(cbind(f))
# first derivative
fp = x*0
fp[lo] = hp( x[lo]-eps, eps )
fp[hi] = hp( x[hi]-M, M )
fp[md] = -1/x[md]
if( der<2 )return(cbind(f,fp))
# second derivative
fpp = x*0
fpp[lo] = hpp( x[lo]-eps, eps )
fpp[hi] = hpp( x[hi]-M, M )
fpp[md] = 1/x[md]^2
return( cbind(f,fp,fpp) )
# End of mllog()
}
# Internal function to do a linear model via SVD
# Empirical likelihood's Newton steps are of
# least squares type.
svdlm = function(X,y){
# Linear model regression coefficient via SVD
# Tolerances for generalized inverse via SVD
RELTOL = 1e-9
ABSTOL = 1e-100
# Get Xplus = generalized inverse of X
# If svd algorithm failures are encountered
# it sometimes helps to try svd(t(X)) and
# translate back. First check to ensure that
# X does not contain NaN or Inf or -Inf.
svdX = svd(X)
d = svdX$d
lo = d < (RELTOL * max(d) + ABSTOL)
dinv = 1/d
dinv[lo] = 0
Xplus = svdX$v %*% diag(dinv,nrow=length(dinv)) %*% t(svdX$u)
# taking care with diag when dinv is 1x1
# to avoid getting the identity matrix of
# size floor(dinv)
# Return X^+ y
Xplus %*% matrix(y,ncol=1)
}
# end of svdlm
# Backtracking line search parameters [Tweak only with extreme caution.]
# See Boyd and Vandenberghe, pp464-466.
ALPHA = 0.3 # seems better than 0.01 on some 2d test data (sometimes fewer iters)
BETA = 0.8
# We need 0 < ALPHA < 1/2 and 0 < BETA < 1
# Backtrack threshold: you can miss by this much.
BACKEPS = 0
# Consider replacing 0 by 1e-10 if backtracking seems to be
# failing due to round off.
if( is.vector(z) )
z = matrix(z,ncol=1)
n = nrow(z)
d = ncol(z)
if( missing(mu) )
mu = rep(0,d)
z = t( t(z)-mu ) # subtract mu from each z[i,]
if( missing(eps) )eps = 1/n
if( missing(M) )M = Inf
#
# Use lam = 0 or initial lam, whichever is best
#
init0 = mllog( rep(1,n), eps=eps, M=M, der=2 ) # i.e. lam = 0
if( missing(lam) ){
init = init0
lam = rep(0,d)
}else{
init = mllog( 1+z%*%lam, eps=eps, M=M, der=2 )
if( sum(init0[,1]) < sum(init[,1]) ){
lam = rep(0,d)
init = init0
}
}
# Initial f, g
fold = sum(init[,1])
gold = apply( z * init[,2],2,sum )
converged = FALSE
iter = 0
oldvals = init
while( !converged ){
iter = iter + 1
# Get Newton Step
rootllpp = sqrt(oldvals[,3]) # sqrt 2nd deriv of -llog lik
zt = z
for( j in 1:d )
zt[,j] = zt[,j] * rootllpp
yt = oldvals[,2] / rootllpp
step = -svdlm(zt,yt) # more reliable than step = -lm( yt~zt-1 )$coef
backtrack = FALSE
s = 1 # usually called t, but R uses t for transpose
while( !backtrack ){
newvals = mllog( 1+z%*%(lam+s*step),eps=eps,M=M,der=2 )
fnew = sum(newvals[,1])
targ = fold + ALPHA * s * sum( gold*step ) + BACKEPS # (BACKEPS for roundoff, should not be needed)
if( fnew <= targ ){
# backtracking has converged
backtrack = TRUE
oldvals = newvals
fold = fnew
gold = apply( z * oldvals[,2],2,sum )
# take the step
lam = lam + s*step
}else{
s = s * BETA
}
}
# Newton decrement and gradient norm
ndec = sqrt( sum( (step*gold)^2 ) )
gradnorm = sqrt( sum(gold^2))
if(verbose)print(c(fold,gradnorm,ndec,lam))
converged = ( ndec^2 <= thresh)
if( iter > itermax )break
}
wts = (1/n)/(1+z%*%lam)
logelr = sum( mllog(1+z%*%lam,eps=eps,M=M,der=0) )
list(logelr=logelr,lam=lam, wts=wts,
converged=converged,iter=iter,ndec=ndec,gradnorm=gradnorm)
}
tracelr = function( z, mu0, mu1, N, verbose=FALSE, ... ){
# compute elr from mu0 to mu1 in N+1 steps
# ... arguments are passed to emplik
#
if( is.vector(z) )z = matrix(z,ncol=1)
d = ncol(z)
ans = matrix(0,N+1,d+1+d+4)
colnames(ans)=rep("",2*d+5)
for( j in 1:d )
colnames(ans)[j] = paste("z",j,sep=".")
colnames(ans)[d+1] = "logelr"
for( j in 1:d )
colnames(ans)[d+1+j] = paste("lambda",j,sep=".")
colnames(ans)[d+1+d+1] = "conv"
colnames(ans)[d+1+d+2] = "iter"
colnames(ans)[d+1+d+3] = "decr"
colnames(ans)[d+1+d+4] = "gnorm"
lam = rep(0,d)
for( i in 0:N ){
mui = (i*mu1+(N-i)*mu0)/N
ansi = emplik(z,mui,lam,...)
ans[i+1,] = c(mui,ansi$logelr,ansi$lam,ansi$converged,ansi$iter,ansi$ndec,ansi$gradnorm)
if( verbose )print(c(i,ansi$iter))
}
ans
}