How to reduce memory allocations and improve the speed of my Julia code
I am trying to run a multivariate regression for every possible combination of selected variables. My code runs, but it takes a long time and the memory allocation seems huge. Is there a way to optimize this:
using DataFrames
using Statistics
using StatsBase       # for wsample
using LinearAlgebra   # for Diagonal
using GLM
using Combinatorics
using Distributions
### read the data
reg_dat = DataFrame(
    sample = wsample([1,2],[0.8,0.2],1000000),
    y_var = rand(1000000),
    ind = rand(1000000),
    brak = rand(1000000),
    times = rand(1000000),
    tiny = rand(1000000),
    regr = rand(1000000),
    breaker = 1 .- rand(1000000),
    x_var = 10*(1 .- rand(1000000)),
    kink = rand(1000000),
    h_var = rand(1000000),
    ind_x = rand(1000000),
    brak_x = 1 .- rand(1000000),
    times_x = 1 .- rand(1000000),
    tiny_x = 1 .- rand(1000000),
    regr_x = rand(1000000),
    breaker_x = rand(1000000),
    t_var = rand(1000000),
    x_var_x = rand(1000000),
    kink_x = rand(1000000),
    units = rand((1,2,3,4,6,7,8,9,15,78,76,10,23,45,54,87,98),1000000),
    trent = 1:1000000,
    rz = rand(string.(1:25),1000000)
)
# find all possible combinations of the independent variables
function combinat(names::Array{String,1},indices::Array{Int64,1},N::Int64)::Vector{Array{String,1}}
cols = Vector{String}()
coms = Vector{Array{String}}()
for i in indices
push!(cols,names[i]::String)
end
for j in 1:N
append!(coms,collect(combinations(cols,j))::Array{Array{String,1},1})
end
return coms
end
combs = combinat(names(reg_dat),[3:20...],length([3:20...]))
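# For scale: this enumerates every non-empty subset of the 18 candidate columns,
# i.e. 2^18 - 1 = 262143 formulas, and with the 17 unit values above a full sweep
# is roughly 4.5 million (unit, formula) fits, so per-fit overhead matters.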
### Find all unique units
units = unique(reg_dat.units)
## Convert combinations to regression formulas
function convert_to_formula(vals::Array{Array{String,1},1})::Array{FormulaTerm{Term,R} where R,1}
forms = Array{FormulaTerm{Term,R} where R,1}()
for i in vals
new_i = [i;["rz","trent"]]
symbs = Array{Symbol,1}()
for j in new_i
push!(symbs,Symbol(j))
end
push!(forms,Term(:y_var) ~ sum(term.(symbs)))
end
return forms
end
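# e.g. convert_to_formula([["ind"]])[1] gives the formula y_var ~ ind + rz + trent,
# since rz and trent are appended to every combination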
# Run regressions for each combination of units and combs
function logit_run(data::DataFrame,units::Array{Int64,1},ind_vars::Array{Array{String,1},1})::Array{Tuple{Int64,String,Float64},1}
forms = convert_to_formula(ind_vars)
out = Array{Tuple{Int64,String,Float64},1}()
for i in units
data_train = data[(data.sample .== 1) .& (data.units .== i),:]
data_test = data[(data.sample .== 2) .& (data.units .== i),:]
for j in forms
try
logit = glm(j,data_train,Binomial(),LogitLink(),contrasts = Dict(:rz => DummyCoding()))
devs = data_test.y_var - predict(logit,data_test)
push!(out,(i,string(j),sqrt(mean(devs .* devs))))
catch
push!(out,(i,string(j),NaN))
end
end
end
return out
end
Now, when I run:
@time logit_run(reg_dat,units,combs[1:10])
> 18.413586 seconds (26.24 M allocations: 10.644 GiB,10.50% gc time)
170-element Array{Tuple{Int64,String,Float64},1}:
(1,"y_var ~ ind + rz + trent",0.2880341157225821)
(1,"y_var ~ brak + rz + trent",0.2880411138604235)
(1,"y_var ~ times + rz + trent",0.2880380466963764)
(1,"y_var ~ tiny + rz + trent",0.2880396984065766)
(1,"y_var ~ regr + rz + trent",0.2880343939542883)
(1,"y_var ~ breaker + rz + trent",0.2880393689495619)
(1,"y_var ~ x_var + rz + trent",0.288038968246708)
(1,"y_var ~ kink + rz + trent",0.288043185030096)
(1,"y_var ~ h_var + rz + trent",0.28804607747341865)
(1,"y_var ~ ind_x + rz + trent",0.2880387457490556)
(98,0.29044425999770246)
(98,0.2904354957341227)
(98,0.2904391507891304)
The memory allocation is high and it is not as fast as I would like. I will be running millions of iterations of this, so I would like these 170 elements to be produced much faster. Any help is appreciated.
Update:
Using the low-level API did not work out for me, so I found a Python implementation of logistic regression and rewrote it in Julia:
### Adding an intercept of ones
reg_dat[!,:Intercept] = ones(nrow(reg_dat))
### Creating dummies for the rz variable
for i in unique(reg_dat.rz)[2:end]
reg_dat[!,Symbol(i)] = ifelse.(reg_dat[!,:rz] .== i,1,0)
end
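# column layout after this step: the original frame has 23 columns, so Intercept is
# column 24 and the 24 rz dummies are columns 25:48 (ncol(reg_dat) == 48);
# the index vectors used below rely on this layout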
# find all possible combinations
function combinat(indices::Array{Int64,1},N::Int64)::Vector{Array{Int64,1}}
coms = Vector{Array{Int64}}()
for j in 1:N
append!(coms,collect(combinations(indices,j)))
end
return coms
end
combins = combinat([3:20...],length([3:20...]))
unit_s = unique(reg_dat.units)
### Now the logistic regression function from scratch
function log_reod(unit_s::Array{Int64,1},combos::Array{Array{Int64,1},1},cols::Array{String,1},mydata::DataFrame,iterations::Int64)
out = Array{Tuple{Int64,String,Float64},1}()
Threads.@threads for j in combos::Array{Array{Int64,1},1}
Threads.@threads for i in unit_s::Array{Int64,1}
try
train = mydata[(mydata.sample .== 1) .& (mydata.units .== i),:]
test = mydata[(mydata.sample .== 2) .& (mydata.units .== i),:]
X = Matrix(train[!,[24; j;[22,25:48...]]])[:,:]
y = train[!,:y_var]
X_test = Matrix(test[!,[24; j;[22,25:48...]]])[:,:] # same column selection as X so X_test*w conforms
y_test = test[!,:y_var]
w = zeros(size(X)[2])
y_bar = mean(y)
w_init = log(y_bar/(1-y_bar))
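# note: w_init is computed but not used below; w stays at the zero initialization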
converged = false
nll_sequence = Array{Float64,1}(undef,iterations)
ab = Array{Tuple{Float64,Bool},1}(undef,iterations)
for i in 1:iterations # sequential: each IRLS step uses the w from the previous step
h = X*w
p = 1 ./ (1 .+ exp.(-h))
p_adj = p
p_adj[p_adj .== 1.0] .= 0.99999999
nll = -((1 .- y)'log.(1 .- p_adj) + y'log.(p_adj)) # Bernoulli negative log-likelihood
nll_sequence[i] = nll
if i > 1
if !converged && (abs(nll_sequence[i]-nll_sequence[i-1]) < 0.000001)
converged = true
else
converged = false
end
else
converged = false
end
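# IRLS update: with s = p .* (1 .- p) and working response z = X*w + (y .- p) ./ s,
# the new weights are w = inv(X'*S*X)*X'*S*z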
s = p .* (1 .- p)
S = Diagonal(s)
arb_small = ones(length(s)) .* 0.000001
arb_small[s .!= 0] = ((y.-p)./s)[s .!= 0]
z = h + arb_small
w = ((inv(X'*S*X)*X')*S)*z
pred = exp.(X_test*w) ./ (1 .+ exp.(X_test*w))
rmse = sqrt(mean((y_test - pred) .* (y_test - pred)))
ab[i] = (rmse,converged)
end
global ab_fin = ab[findfirst(getfield.(ab,2))-1][1]
catch
global ab_fin = NaN
end
push!(out,(i,join(cols[j],","),ab_fin))
end
end
return out
end
This seems to work. However, the memory allocation is even worse, and it is not as fast as the implementation that uses the (high-level) GLM API. Why is that?
@time log_reod(unit_s,combins[1:10],names(reg_dat),reg_dat,15)
171.060764 seconds (5.88 M allocations: 91.736 GiB,12.45% gc time)
170-element Array{Tuple{Int64,String,Float64},1}:
(4,"ind",0.29000435564005556)
(15,0.28887795028663027)
(8,0.286764063267709)
(98,0.28941319680023303)
(76,0.2898017575615879)
(3,0.2910086150956134)
(45,0.288856696122843)
(9,0.287618950665158)
(1,0.2884813713274216)
(2,0.28729283992016885)
(7,0.28700362633959264)
(6,0.2873866890697135)
(10,0.28889513223197577)
⋮
(3,"ind_x",0.29102070956343595)
(45,0.28884184457957934)
(9,0.2876167225404162)
(1,0.2884805471334929)
(2,0.2873338122136366)
(7,0.2870078816051725)
(6,0.28739698518229273)
(10,0.28891515579773225)
(23,0.2898286130508298)
(78,0.2890331733804241)
(54,0.28855221319296853)
(87,0.2875034656519413)
Solution
Actually, I misled you earlier - I am very sorry. GLM.jl does not support views. You can still reduce the time if you work with matrices directly, but unfortunately not by much.
Data preparation:
df = DataFrame(rand(10^6,10), :auto)
df.y = sum.(eachrow(df)) .+ 10 .* randn(10^6) .< 5
function some_models(df)
[glm(@formula(y~x1+x2+x3+x4+x5+x6+x7+x8+x9+x10),df,Binomial(),LogitLink()),
 glm(@formula(y~x1+x2+x3+x4+x5+x6+x7+x8+x9),df,Binomial(),LogitLink()),
 glm(@formula(y~x1+x2+x3+x4+x5+x6+x7+x8+x10),df,Binomial(),LogitLink()),
 glm(@formula(y~x1+x2+x3+x4+x5+x6+x7+x9+x10),df,Binomial(),LogitLink()),
 glm(@formula(y~x1+x2+x3+x4+x5+x6+x8+x9+x10),df,Binomial(),LogitLink()),
 glm(@formula(y~x1+x2+x3+x4+x5+x7+x8+x9+x10),df,Binomial(),LogitLink()),
 glm(@formula(y~x1+x2+x3+x4+x6+x7+x8+x9+x10),df,Binomial(),LogitLink()),
 glm(@formula(y~x1+x2+x3+x5+x6+x7+x8+x9+x10),df,Binomial(),LogitLink()),
 glm(@formula(y~x1+x2+x4+x5+x6+x7+x8+x9+x10),df,Binomial(),LogitLink()),
 glm(@formula(y~x1+x3+x4+x5+x6+x7+x8+x9+x10),df,Binomial(),LogitLink()),
 glm(@formula(y~x2+x3+x4+x5+x6+x7+x8+x9+x10),df,Binomial(),LogitLink())]
end
function some_models2(df)
X = [ones(nrow(df)) Matrix(df[!,1:end-1])]
y = df.y
[fit(GeneralizedLinearModel,X,y,Binomial(),LogitLink()),
 fit(GeneralizedLinearModel,X[:,Not(11)],y,Binomial(),LogitLink()),
 fit(GeneralizedLinearModel,X[:,Not(10)],y,Binomial(),LogitLink()),
 fit(GeneralizedLinearModel,X[:,Not(9)],y,Binomial(),LogitLink()),
 fit(GeneralizedLinearModel,X[:,Not(8)],y,Binomial(),LogitLink()),
 fit(GeneralizedLinearModel,X[:,Not(7)],y,Binomial(),LogitLink()),
 fit(GeneralizedLinearModel,X[:,Not(6)],y,Binomial(),LogitLink()),
 fit(GeneralizedLinearModel,X[:,Not(5)],y,Binomial(),LogitLink()),
 fit(GeneralizedLinearModel,X[:,Not(4)],y,Binomial(),LogitLink()),
 fit(GeneralizedLinearModel,X[:,Not(3)],y,Binomial(),LogitLink()),
 fit(GeneralizedLinearModel,X[:,Not(2)],y,Binomial(),LogitLink()),
 fit(GeneralizedLinearModel,X[:,Not(1)],y,Binomial(),LogitLink())]
end
Benchmarking:
julia> some_models(df); # first run
julia> @time some_models(df); # second run
8.955404 seconds (10.72 k allocations: 4.713 GiB,5.36% gc time)
julia> some_models2(df); # first run
julia> @time some_models2(df); # second run
8.598572 seconds (1.20 k allocations: 3.412 GiB,8.65% gc time)
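If you want to carry the matrix-based approach back to the per-unit / per-combination setting of the question, one possible pattern is to materialize the train and test matrices once per unit and then fit each predictor subset by selecting columns. The sketch below is only an illustration and has not been benchmarked; the helper name logit_run_matrix is made up here, and it assumes the column layout built in the update (Intercept in column 24, trent in column 22, rz dummies in 25:48, candidate predictors in 3:20).
# sketch, not benchmarked; uses DataFrames, GLM and Statistics as loaded above
function logit_run_matrix(data, units, combos, cols)
    out = Tuple{Int,String,Float64}[]
    allcols = [24; 22; 25:48; 3:20]       # fixed block first, the 18 candidate columns last
    for u in units
        train = data[(data.sample .== 1) .& (data.units .== u), :]
        test = data[(data.sample .== 2) .& (data.units .== u), :]
        Xtr, ytr = Matrix(train[!,allcols]), train.y_var   # materialized once per unit
        Xte, yte = Matrix(test[!,allcols]), test.y_var
        for combo in combos
            sel = [1:26; combo .+ 24]     # fixed block (positions 1:26) plus the chosen predictors
            m = fit(GeneralizedLinearModel, Xtr[:,sel], ytr, Binomial(), LogitLink())
            devs = yte .- predict(m, Xte[:,sel])
            push!(out, (u, join(cols[combo],","), sqrt(mean(abs2, devs))))
        end
    end
    return out
end
# e.g. logit_run_matrix(reg_dat, unit_s, combins[1:10], names(reg_dat))
Whether the per-combination column copies (Xtr[:,sel]) end up cheaper than refitting from the DataFrame is something you would have to measure.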