ENH: Add hypothesis tests for CCA (#214)
 ---------

Co-authored-by: Alex Arslan <[email protected]>
kshedden and ararslan committed Jun 5, 2024
1 parent c3067e2 commit 1e2c359
Showing 4 changed files with 154 additions and 12 deletions.
1 change: 1 addition & 0 deletions Project.toml
@@ -8,6 +8,7 @@ version = "0.10.2"

[deps]
Arpack = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
11 changes: 8 additions & 3 deletions src/MultivariateStats.jl
@@ -3,13 +3,14 @@ module MultivariateStats
using LinearAlgebra
using SparseArrays
using Statistics: middle
using StatsAPI: RegressionModel
using Distributions: ccdf, FDist
using StatsAPI: RegressionModel, HypothesisTest
using StatsBase: SimpleCovariance, CovarianceEstimator, AbstractDataTransform,
ConvergenceException, pairwise, pairwise!, CoefTable

import Statistics: mean, var, cov, covm, cor
import Base: length, size, show
import StatsAPI: fit, predict, coef, weights, dof, r2
import StatsAPI: fit, predict, coef, weights, dof, r2, pvalue
import LinearAlgebra: eigvals, eigvecs

export
@@ -28,6 +29,7 @@ module MultivariateStats
eigvecs, # eigenvectors of the transformation
loadings, # model loadings
var, # model variance
pvalue, # p-values for hypothesis tests

# lreg
llsq, # Linear Least Square regression
@@ -65,7 +67,10 @@ module MultivariateStats
KernelPCA, # Type: the Kernel PCA model

## cca
CCA, # Type: Canonical Correlation Analysis model
CCA, # Type: Canonical Correlation Analysis model
WilksLambdaTest, # Wilks lambda statistics and tests
PillaiTraceTest, # Pillai trace statistics and tests
LawleyHotellingTest, # Lawley-Hotelling statistics and tests

ccacov, # CCA based on covariances
ccasvd, # CCA based on singular value decomposition of input data
125 changes: 119 additions & 6 deletions src/cca.jl
@@ -10,12 +10,16 @@ struct CCA{T<:Real} <: RegressionModel
xproj::Matrix{T} # projection matrix for X, of size (dx, p)
yproj::Matrix{T} # projection matrix for Y, of size (dy, p)
corrs::Vector{T} # correlations, of length p
eigs::Vector{T} # eigenvalues
nobs::Int64 # number of observations

function CCA(xm::Vector{T},
ym::Vector{T},
xp::Matrix{T},
yp::Matrix{T},
crs::Vector{T}) where T<:Real
crs::Vector{T},
eigs::Vector{T},
nobs::Int) where T<:Real

dx, px = size(xp)
dy, py = size(yp)
@@ -32,7 +36,7 @@ struct CCA{T<:Real} <: RegressionModel
length(crs) == px ||
throw(DimensionMismatch("Incorrect length of corrs."))

new{T}(xm, ym, xp, yp, crs)
new{T}(xm, ym, xp, yp, crs, eigs, nobs)
end
end

@@ -177,7 +181,7 @@ function _ccacov(Cxx, Cyy, Cxy, xmean, ymean, p::Int)
G = cholesky(Cyy) \ Cxy'
Ex = eigen(Symmetric(Cxy * G), Symmetric(Cxx))
ord = sortperm(Ex.values; rev=true)
vx, Px = extract_kv(Ex, ord, p)
eigs, Px = extract_kv(Ex, ord, p)
Py = qnormalize!(G * Px, Cyy)
else
# solve Py: (Cyx * inv(Cxx) * Cxy) Py = λ Cyy Py
@@ -186,7 +190,7 @@ function _ccacov(Cxx, Cyy, Cxy, xmean, ymean, p::Int)
H = cholesky(Cxx) \ Cxy
Ey = eigen(Symmetric(Cxy'H), Symmetric(Cyy))
ord = sortperm(Ey.values; rev=true)
vy, Py = extract_kv(Ey, ord, p)
eigs, Py = extract_kv(Ey, ord, p)
Px = qnormalize!(H * Py, Cxx)
end

@@ -196,7 +200,7 @@ function _ccacov(Cxx, Cyy, Cxy, xmean, ymean, p::Int)
crs = coldot(Px, Cxy * Py)

# construct CCA model
CCA(xmean, ymean, Px, Py, crs)
CCA(xmean, ymean, Px, Py, crs, sqrt.(eigs), -1)
end

"""
@@ -275,7 +279,7 @@ function _ccasvd(Zx::DenseMatrix{T}, Zy::DenseMatrix{T}, xmean::Vector{T}, ymean
crs = rmul!(coldot(Zx'Px, Zy'Py), one(T)/(n-1))

# construct CCA model
CCA(xmean, ymean, Px, Py, crs)
CCA(xmean, ymean, Px, Py, crs, S.S[si], n)
end

## interface functions
@@ -336,3 +340,112 @@ function fit(::Type{CCA}, X::AbstractMatrix{T}, Y::AbstractMatrix{T};

return M::CCA
end

abstract type MultivariateTest <: HypothesisTest end

struct WilksLambdaTest <: MultivariateTest
stat::Float64
fstat::Float64
df1::Float64
df2::Float64
end

struct LawleyHotellingTest <: MultivariateTest
stat::Float64
fstat::Float64
df1::Float64
df2::Float64
end

struct PillaiTraceTest <: MultivariateTest
stat::Float64
fstat::Float64
df1::Float64
df2::Float64
end

function pvalue(ct::MultivariateTest)
return ccdf(FDist(ct.df1, ct.df2), ct.fstat)
end

function dof(ct::MultivariateTest)
return (ct.df1, ct.df2)
end

function _testprep(cca::CCA, n, k)

r = cca.eigs[k:end]
dx = length(cca.xmean)
dy = length(cca.ymean)
if n == -1 && cca.nobs == -1
throw(ArgumentError("If the CCA was fit with method=:cov, the sample size n must be provided to the test"))
end
if n != -1 && cca.nobs != -1 && cca.nobs != n
throw(ArgumentError("Provided n differs from the number of observations used to fit the CCA"))
end
n = n == -1 ? cca.nobs : n

p = dx - k + 1
q = dy - k + 1
n = n - k + 1

m = (abs(p - q) - 1) / 2
N = (n - p - q - 2) / 2
s = min(p, q)

return r, s, m, N, n, dx, dy, p, q
end

"""
    WilksLambdaTest(cca; n=-1, k=1)

Use Wilks' lambda to test the dimension of a CCA. The null hypothesis of
the test is that canonical correlations k, k+1, ... are zero. If the CCA
was fit from covariance matrices (method=:cov), the sample size n must be provided.
"""
function WilksLambdaTest(cca::CCA; n=-1, k=1)

# Reference: Rencher and Christensen (2012)

r, s, m, N, n, dx, dy, p, q = _testprep(cca, n, k)
stat = prod(1 .- r.^2)
w = n - (p + q + 3) / 2
t = p*q == 2 ? 1.0 : sqrt((p^2*q^2 - 4) / (p^2 + q^2 - 5))
df1 = p*q
df2 = w*t - p*q/2 + 1
fstat = ((1 - stat^(1/t)) / stat^(1/t)) * (df2 / df1)
return WilksLambdaTest(stat, fstat, df1, df2)
end
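
For reference, a minimal usage sketch with hypothetical data, assuming the fit(CCA, ...) API and the WilksLambdaTest constructor added in this commit:

using MultivariateStats, Random
rng = MersenneTwister(1)
X = randn(rng, 6, 500)                       # dx = 6 variables, n = 500 observations (columns)
Y = 0.5 .* X[1:4, :] .+ randn(rng, 4, 500)   # dy = 4 variables, correlated with X
M = fit(CCA, X, Y; method=:svd, outdim=3)    # :svd stores the sample size, so n need not be passed
t = WilksLambdaTest(M)                       # H0: all canonical correlations (k = 1 onward) are zero
pvalue(t), dof(t)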

"""
    PillaiTraceTest(cca; n=-1, k=1)

Use Pillai's trace to test the dimension of a CCA. The null hypothesis of
the test is that canonical correlations k, k+1, ... are zero. If the CCA
was fit from covariance matrices (method=:cov), the sample size n must be provided.
"""
function PillaiTraceTest(cca::CCA; n=-1, k=1)
r, s, m, N, n, dx, dy, p, q = _testprep(cca, n, k)
stat = sum(abs2, r)
fstat = (2*N + s + 1)*stat / ((2*m + s + 1) * (s - stat))
df1 = s*(2*m + s + 1)
df2 = s*(2*N + s + 1)
return PillaiTraceTest(stat, fstat, df1, df2)
end
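
Continuing the hypothetical sketch above (reusing M), the k keyword lets the dimension be tested sequentially: k = 1 tests all canonical correlations, k = 2 drops the largest, and so on.

for k in 1:3
    t = PillaiTraceTest(M; k=k)              # M from the fit(CCA, ...) sketch above
    println("k = $k: stat = ", t.stat, ", p-value = ", pvalue(t))
end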

"""
    LawleyHotellingTest(cca; n=-1, k=1)

Use the Lawley-Hotelling trace statistic to test the dimension of a CCA.
The null hypothesis of the test is that canonical correlations k, k+1, ...
are zero. If the CCA was fit from covariance matrices (method=:cov), the
sample size n must be provided.
"""
function LawleyHotellingTest(cca::CCA; n=-1, k=1)
r, s, m, N, n, dx, dy, p, q = _testprep(cca, n, k)
stat = sum(r.^2 ./ (1 .- r.^2))
fstat = 2*(s*N + 1) * stat / (s^2 * (2*m + s + 1))
df1 = s*(2*m + s + 1)
df2 = 2*(s*N + 1)
return LawleyHotellingTest(stat, fstat, df1, df2)
end
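
A sketch of the :cov path, where the model does not store the sample size and n must be passed explicitly (X and Y as in the hypothetical sketch above):

Mc = fit(CCA, X, Y; method=:cov, outdim=3)
t = LawleyHotellingTest(Mc; n=size(X, 2))    # n = number of observations (columns of X)
pvalue(t), dof(t)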
29 changes: 26 additions & 3 deletions test/cca.jl
@@ -20,7 +20,7 @@ using Statistics: mean, cov, cor
Px = qr(randn(rng, dx, dx)).Q[:, 1:p]
Py = qr(randn(rng, dy, dy)).Q[:, 1:p]

M = CCA(Float64[], Float64[], Px, Py, [0.8, 0.6, 0.4])
M = CCA(Float64[], Float64[], Px, Py, [0.8, 0.6, 0.4], zeros(3), -1)

@test size(M)[1] == dx
@test size(M)[2] == dy
@@ -40,7 +40,7 @@ using Statistics: mean, cov, cor
ux = randn(rng, dx)
uy = randn(rng, dy)

M = CCA(ux, uy, Px, Py, [0.8, 0.6, 0.4])
M = CCA(ux, uy, Px, Py, [0.8, 0.6, 0.4], zeros(3), -1)

@test size(M)[1] == dx
@test size(M)[2] == dy
@@ -145,12 +145,35 @@ using Statistics: mean, cov, cor
predict(M, YY, :y)
predict(MM, XX, :x)
predict(MM, YY, :y)

# type stability
for func in (M->mean(M, :x), M->mean(M, :y),
M->projection(M, :x),
M->projection(M, :y), cor)
@test eltype(func(M)) == Float64
@test eltype(func(MM)) == Float32
end

M1 = fit(CCA, X, Y; method=:svd, outdim=5)
M2 = fit(CCA, X, Y; method=:cov, outdim=5)

# Compare hypothesis tests with Stata
stats = (WilksLambdaTest=0.000384245, PillaiTraceTest=2.81275, LawleyHotellingTest=55.1432)
df1 = (WilksLambdaTest=30, PillaiTraceTest=30, LawleyHotellingTest=30)
df2 = (WilksLambdaTest=3958, PillaiTraceTest=4965, LawleyHotellingTest=4937)
fstats = (WilksLambdaTest=810.3954, PillaiTraceTest=212.8296, LawleyHotellingTest=1814.9480)

for f in [WilksLambdaTest, PillaiTraceTest, LawleyHotellingTest]

ct1 = f(M1)
ct2 = f(M2; n=size(X, 2))
s = Symbol(f)

for ct in [ct1, ct2]
@test isapprox(ct.stat, stats[s], atol=1e-5, rtol=1e-5)
@test isapprox(ct.fstat, fstats[s], atol=1e-5, rtol=1e-5)
@test isapprox(ct.df1, df1[s], atol=1e-5, rtol=1e-5)
@test isapprox(ct.df2, df2[s], atol=1e-5, rtol=1e-5)
end
end
end
