
Fast kernel regression in Julia with the Nyström method

In kernel regression, once the number of data points grows and the kernel matrix becomes huge, computing its inverse becomes the bottleneck. One known remedy is to build a preconditioner from a random feature map and solve the linear system with an iterative method (see Algorithm 1 of this reference). Here I instead approximate the kernel matrix with a low-rank Nyström approximation and use the Sherman-Morrison-Woodbury formula to invert it cheaply. The references I followed are:

・Eqs. (36)-(38) of the following reference: proceedings.mlr.press

・Eq. (5) of the following reference: ojs.aaai.org
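
Written out: with the Nyström approximation $K \approx C W^{-1} C^{\top}$, where $C$ collects $p$ sampled columns of $K$ and $W$ is the corresponding $p \times p$ block (as in the nystrom function below), the Woodbury identity gives

$$(K + \lambda I)^{-1} \approx \frac{1}{\lambda}\left(I - C\,(\lambda I + W^{-1}C^{\top}C)^{-1}\,W^{-1}C^{\top}\right),$$

which is exactly what the alpha line in Kreg below computes, with invWCt standing for $W^{-1}C^{\top}$.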

First, to have kernel matrices available, put the following file, named kernels.jl, in the same directory.

function kernel_function(kernel)
    if kernel == "rbf"                 # Gaussian (RBF) kernel
        return (x,y,c,deg) -> exp(-c * norm(x .- y)^2)
    elseif kernel == "laplace"         # Laplacian kernel
        return (x,y,c,deg) -> exp(-c * sum(abs.(x .- y)))
    elseif kernel == "poly"            # polynomial kernel
        return (x,y,c,deg) -> (c + x' * y)^deg
    elseif kernel == "linear"          # linear kernel
        return (x,y,c,deg) -> x' * y
    elseif kernel == "periodic"        # periodic (exponential-cosine) kernel
        return (x,y,c,deg) -> exp(deg * cos(c * norm(x .- y)))
    else
        error("Kernel $kernel is not defined")
    end
end

function Kernel(X::Matrix, Y::Matrix; c=1.0, deg=3.0, kernel="rbf")
    # Cross-kernel matrix: K[i,j] = k(Y[:,i], X[:,j]), size (#columns of Y) × (#columns of X).
    kf = kernel_function(kernel)
    n = size(X, 2)
    m = size(Y, 2)
    K = Matrix{Float64}(undef, m, n)
    for j = 1:n, i = 1:m
        K[i,j] = kf(Y[:,i], X[:,j], c, deg)
    end
    return K
end

function Kernel(X::Matrix; c=1.0, deg=3.0, kernel="rbf")
    # Gram matrix K[i,j] = k(X[:,i], X[:,j]); only the upper triangle is filled,
    # then wrapped in Symmetric.
    kf = kernel_function(kernel)
    n = size(X, 2)
    K = Matrix{Float64}(undef, n, n)
    for j = 1:n, i = 1:j
        K[i,j] = kf(X[:,i], X[:,j], c, deg)
    end
    return Symmetric(K)
end
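
As a quick sanity check of kernels.jl, something like the following should work (a minimal sketch with made-up data; not part of the regression code below):

using LinearAlgebra
include("kernels.jl")

X = randn(3, 5)                      # 5 points in 3 dimensions (columns are samples)
K = Kernel(X, c=1.0, kernel="rbf")   # 5×5 symmetric RBF Gram matrix
k = Kernel(X, randn(3, 2))           # 2×5 cross-kernel between 2 new points and X
@show size(K), size(k)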

Next, the main script and the Nyström method.

using LinearAlgebra
using StatsBase        # provides sample(), used in nystrom below
include("kernels.jl")

"""
low-rank approximation based on random sampling.
See the following reference,
[Petros Drineas et al.](https://www.jmlr.org/papers/v6/drineas05a.html)
# Aruguments
- 'K' : input positive definite matrix.
- 'p' : target rank

example:
    A = randn(100,100)
    K = A*A'
    for p in [1,5,10,20,30,50,100]
        Kr = nystrom(K, p)
        @show p, norm(K-Kr)
    end
"""
function nystrom(K, p)
    n = size(K, 1)
    idxs = sample(1:n, p, replace=false, ordered=false)  # p column indices, sampled uniformly
    C = K[:, idxs]          # n×p sampled columns
    W = K[idxs, idxs]       # p×p intersection block
    invWCt = W \ C'         # W⁻¹Cᵀ, so that K ≈ C * invWCt
    return C, invWCt
end

function Kreg(X_train, Y_train, X_test; lambda=1, c=1, deg=3.0, kernel="rbf", sampling=false, p=10)
    K_train_train = Kernel(X_train, c=c, deg=deg, kernel=kernel)
    k_train_test = Kernel(X_train, X_test, c=c, deg=deg, kernel=kernel)

    # number of test data
    M = size(X_test, 2)
    # number of target variables
    N = size(Y_train, 1)

    Y_pred = zeros(N, M)
    for n = 1:N
        if sampling
            # Nyström + Woodbury: approximate (K + λI)⁻¹ * y without the full n×n inverse
            C, invWCt = nystrom(K_train_train, p)
            alpha = 1.0 / lambda .* (I - C * ((lambda * I + invWCt * C) \ invWCt)) * Y_train[n, :]
        else
            # exact kernel ridge regression solve
            alpha = (K_train_train + lambda * I) \ Y_train[n, :]
        end
        Y_pred[n, :] .= k_train_test * alpha
    end
    return Y_pred
end



Let's try learning a simple three-dimensional vector field. To start, use 13,000 data points.

using Distributions
using BenchmarkTools

function make_vector_field(L)
    # Draw L inputs uniformly from [-2, 2]^3 and evaluate a toy vector field at each.
    f(x, y, z) = [x + y + z, x * cos(y) - y, -y + x + z]
    dist = Uniform(-2, 2)
    x = rand(dist, (3, L))
    y = Matrix{Float64}(undef, 3, L)
    for i = 1:L
        y[:, i] = f(x[:, i]...)
    end
    return x, y
end

# Mean squared error over all components (a function named eval would clash with Main.eval).
function mse(y_test, y_pred)
    L = length(y_test)
    return norm(y_test[:] .- y_pred[:])^2 / L
end

function main()
    x_train, y_train = make_vector_field(13000)
    x_test, y_test = make_vector_field(100)

    # exact kernel ridge regression
    y_pred_0 = Kreg(x_train, y_train, x_test, kernel="rbf", lambda=1, c=1, sampling=false)
    @show norm(y_pred_0 - y_test)
    @btime Kreg($x_train, $y_train, $x_test, kernel="rbf", lambda=1, c=1, sampling=false)
    println("")

    # Nyström-approximated regression for several target ranks p
    ps = [5, 10, 20, 30, 40, 50, 100]
    for p in ps
        y_pred = Kreg(x_train, y_train, x_test, kernel="rbf", lambda=1, c=1, sampling=true, p=p)
        @show p, norm(y_pred - y_test)
        @btime Kreg($x_train, $y_train, $x_test, kernel="rbf", lambda=1, c=1, sampling=true, p=$p)
        println("")
    end
end

main()

Without the Nyström method it takes 56 seconds; with it, as long as p is small enough, about 37 seconds. Not that big a difference, and the error is large too.

norm(y_pred_0 - y_test) = 0.6526581055689884
  56.314 s (429032544 allocations: 38.25 GiB)

(p, norm(y_pred - y_test)) = (5, 12984.632037633266)
  37.790 s (429032592 allocations: 42.01 GiB)

(p, norm(y_pred - y_test)) = (10, 9540.59736297309)
  37.849 s (429032592 allocations: 42.02 GiB)

(p, norm(y_pred - y_test)) = (20, 4353.575356488931)
  37.820 s (429032601 allocations: 42.03 GiB)

(p, norm(y_pred - y_test)) = (30, 2527.110361708636)
  37.764 s (429032601 allocations: 42.04 GiB)

(p, norm(y_pred - y_test)) = (40, 1576.9397341759382)
  37.998 s (429032601 allocations: 42.04 GiB)

(p, norm(y_pred - y_test)) = (50, 1096.2351214109729)
  37.746 s (429032616 allocations: 42.05 GiB)

(p, norm(y_pred - y_test)) = (100, 270.4954692388678)
  38.273 s (429032616 allocations: 42.10 GiB)

As a test, let's reduce the data to 130 points and sweep p up to 130.

norm(y_pred_0 - y_test) = 12.090595437011599
  5.766 ms (107610 allocations: 8.76 MiB)

(p, norm(y_pred - y_test)) = (5, 100.97716320731487)
  5.538 ms (107652 allocations: 9.00 MiB)

(p, norm(y_pred - y_test)) = (10, 55.38732928606096)
  5.636 ms (107643 allocations: 9.06 MiB)

(p, norm(y_pred - y_test)) = (20, 35.78078677983344)
  5.855 ms (107652 allocations: 9.18 MiB)

(p, norm(y_pred - y_test)) = (30, 17.860651613527118)
  6.004 ms (107652 allocations: 9.33 MiB)

(p, norm(y_pred - y_test)) = (40, 12.800274289390913)
  6.218 ms (107652 allocations: 9.50 MiB)

(p, norm(y_pred - y_test)) = (50, 10.539272651439447)
  6.371 ms (107667 allocations: 9.69 MiB)

(p, norm(y_pred - y_test)) = (100, 11.428565046455178)
  7.411 ms (107667 allocations: 11.00 MiB)

(p, norm(y_pred - y_test)) = (130, 12.090595437011602)
  8.118 ms (107667 allocations: 12.06 MiB)

Hmm, still underwhelming. Unless the target rank p is quite small, the speedup barely shows. The bottleneck seems to be the line alpha = 1.0 / lambda .* ( I - C* ((lambda*I + invWCt*C) \ invWCt) ) * Y_train[n,:], which materializes a dense n×n matrix before applying it to the vector. And when p is small, the approximation is too inaccurate.
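
One thing that might help (an untested sketch, not reflected in the timings above): in Kreg's sampling branch, apply the Woodbury factors to the vector from right to left so no n×n matrix is ever formed, and hoist the nystrom call out of the loop over output dimensions.

# Sketch of a cheaper sampling branch for Kreg (same variable names as in Kreg above).
C, invWCt = nystrom(K_train_train, p)        # sample once, reuse for every output dimension
F = lu(lambda * I + invWCt * C)              # p×p factorization
for n = 1:N
    y = Y_train[n, :]
    alpha = (y .- C * (F \ (invWCt * y))) ./ lambda   # only n×p and p-sized products
    Y_pred[n, :] .= k_train_test * alpha
end

With this, the per-output work after forming C and invWCt is O(np) rather than O(n²). Building K_train_train itself still costs O(n²) kernel evaluations, though, and the nearly identical allocation counts above suggest that is where most of the time goes in both variants anyway.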

Right now the Nyström method uses uniform random sampling, but more effective sampling schemes exist (leverage-score sampling, k-means landmarks, and so on), so it may be worth implementing one of those.
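
For example, a rough sketch of a k-means landmark variant, assuming Clustering.jl is available and reusing the Kernel functions from kernels.jl (nystrom_kmeans is a made-up name, and Kreg would need a small change to call it, since it takes the raw data X rather than the precomputed kernel matrix):

using Clustering   # assumption: Clustering.jl is installed

# Landmark-based Nyström: k-means centroids instead of uniformly sampled columns.
# Returns the same (C, invWCt) pair that Kreg expects.
function nystrom_kmeans(X, p; c=1.0, deg=3.0, kernel="rbf")
    Z = kmeans(X, p).centers                        # d×p landmark matrix
    C = Kernel(Z, X, c=c, deg=deg, kernel=kernel)   # n×p cross-kernel K(x_i, z_j)
    W = Kernel(Z, c=c, deg=deg, kernel=kernel)      # p×p landmark Gram matrix
    invWCt = (W + 1e-8 * I) \ C'                    # small jitter for numerical stability
    return C, invWCt
end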