如何解决执行许多模拟以避免循环
我正在尝试模拟拟合值以更改数据的不同子组的模型,这些值再次基于对原始数据帧的另一个子集的随机采样(我为这个问题组成的最小示例忽略了随机采样等,则所有模拟的拟合值都相同,但这无关紧要)。我编写了一个dplyr代码,用于存储每个组的模型,生成用于预测拟合值的新x值,预测它们等,等等。它生成一列拟合值,正是我想要的。但是,我想完成整个过程1000倍。我当然可以使用for循环(如下面的示例中所示)执行此操作,但是是否有可能在dplyr-pipe行中执行此操作?也许可以加快整个过程的速度(我的原始数据集很大,for循环要花很多时间)?
# making up data
dat <- data.frame("species" = seq(1:20),"col_A" = runif(20,min=1000,max=2500),"col_B" = runif(20,min = 0,max = 1500),"maximum" = rep(2500,20),"minimum" = rep(1000,groups = rep(LETTERS[1:5],each = 4))
# functions to use with purrr
linear_mod <- function(dat) {
lm(col_A ~ col_B,data = dat)
}
# define parametres and an empty data frame to use in the for loop
runs <- 10
fitted_sim <- data.frame(matrix(data=NA,nrow=20,ncol=runs+1,byrow=FALSE)) #empty dataframe to contain fitted values for each alt
names(fitted_sim) <- as.factor(seq(1:runs+1))
# for-loop around my dplyr-code
for (j in 1:runs){
simul <- dat %>%
group_by(groups) %>%
nest(data = c(col_A,col_B,species)) %>%
mutate(model = map(data,linear_mod),# add model for every group
sim_data = list(seq(minimum,maximum,by = 10))) %>% # define new x-values for later predictions
unnest(sim_data) %>%
nest(sim_data = sim_data) %>%
mutate(fitted = map2(model,sim_data,~predict(.x,col_B = sim_data,type = "response")),# predict values
unnest(fitted) # unnest predicted values to save in data frame
# save newly fitted values in fitted_sim data frame
fitted_sim[,1] <- simul$groups
fitted_sim[,1+j] <- simul$fitted
}
感谢每一个提示!
编辑: 这是一个扩展的示例代码,包括上述随机采样,但在我的第一个示例中省略了:
# for-loop around my dplyr-code
for (j in 1:runs) {
simul <- dat %>%
group_by(groups) %>%
rowwise() %>%
mutate(vector_column = case_when(abs(col_A) == (maximum-minimum) ~ list(col_B),sign(col_A) == 1 ~ list(dat$col_B[dat$col_B <= maximum - col_A]),# using list function to store vectors in a data.frame
sign(col_A) != 1 ~ list(dat$col_B[dat$col_B >= minimum + col_A])),helper = !is_empty(vector_column),# in case some of the vectors are empty so it is not possible to use sample
col_B_new = ifelse(helper,sample(vector_column,1),NA),helper = NULL,sim_data = list(seq(minimum,by = 10))) %>% # define new x-values for later predictions
ungroup() %>% # get rid of rowwise()
group_by(groups) %>%
unnest(sim_data) %>%
nest(sim_data = sim_data) %>%
nest(data = c(col_A,col_B_new,fitted = map2(model,col_B_new = sim_data,type = "response"))) %>% # predict values
unnest(fitted) # unnest predicted values to save in data frame
# save newly fitted values in fitted_sim data frame
fitted_sim[,1] <- simul$groups
fitted_sim[,1+j] <- simul$fitted
}
解决方法
不确定从simul
数据框架中到底需要什么,但是各个循环元素似乎彼此独立,因此您可以使用parallel:mclappy
在其中运行它们平行。在我的示例中,我只使用lapply
,但这已经快了很多-如果可以以任何方式帮助您...
library(tidyverse)
library(data.table)
library(microbenchmark)
# making up data
dat <- data.frame("species" = seq(1:20),"col_A" = runif(20,min=1000,max=2500),"col_B" = runif(20,min = 0,max = 1500),"maximum" = rep(2500,20),"minimum" = rep(1000,groups = rep(LETTERS[1:5],each = 4))
# functions to use with purrr
linear_mod <- function(dat) {
lm(col_A ~ col_B,data = dat)
}
# define parametres and an empty data frame to use in the for loop
runs <- 10
fitted_sim <- data.frame(matrix(data=NA,nrow=20,ncol=runs+1,byrow=FALSE)) #empty dataframe to contain fitted values for each alt
names(fitted_sim) <- as.factor(seq(1:runs+1))
# I just wrapped your code in a function for benchmarking
run.simul <- function(){
for (j in 1:runs){
simul <- dat %>%
group_by(groups) %>%
nest(data = c(col_A,col_B,species)) %>%
mutate(model = map(data,linear_mod),# add model for every group
sim_data = list(seq(minimum,maximum,by = 10))) %>% # define new x-values for later predictions
unnest(sim_data) %>%
nest(sim_data = sim_data) %>%
mutate(fitted = map2(model,sim_data,~predict(.x,col_B = sim_data,type = "response"))) %>% # predict values
unnest(fitted) # unnest predicted values to save in data frame
# save newly fitted values in fitted_sim data frame
fitted_sim[,1] <- simul$groups
fitted_sim[,1+j] <- simul$fitted
}
fitted_sim
}
# my version (sorry for using `data.table`...)
Dat <- data.table(dat,key="groups")
getFits <- function(x){
x[,fitted := predict(lm(col_A ~ col_B),.(list(seq(minimum[1],maximum[1],by = 10)))),by = groups]
x[,.(species,groups,fitted)]
}
# get the results into a data.frame; use mclapply instead of lapply if you like
dcast(rbindlist(lapply(seq_len(runs),function(z) getFits(data.table(dat,key="groups"))),idcol = "run"),... ~ run,value.var = "fitted")
#> species groups 1 2 3 4 5 6
#> 1: 1 A 1767.621 1767.621 1767.621 1767.621 1767.621 1767.621
#> 2: 2 A 1376.679 1376.679 1376.679 1376.679 1376.679 1376.679
#> 3: 3 A 1523.100 1523.100 1523.100 1523.100 1523.100 1523.100
#> 4: 4 A 1794.593 1794.593 1794.593 1794.593 1794.593 1794.593
#> 5: 5 B 1272.408 1272.408 1272.408 1272.408 1272.408 1272.408
#> 6: 6 B 1967.709 1967.709 1967.709 1967.709 1967.709 1967.709
#> 7: 7 B 1792.934 1792.934 1792.934 1792.934 1792.934 1792.934
#> 8: 8 B 2318.699 2318.699 2318.699 2318.699 2318.699 2318.699
#> 9: 9 C 2017.187 2017.187 2017.187 2017.187 2017.187 2017.187
#> 10: 10 C 1899.827 1899.827 1899.827 1899.827 1899.827 1899.827
#> 11: 11 C 1734.953 1734.953 1734.953 1734.953 1734.953 1734.953
#> 12: 12 C 1834.046 1834.046 1834.046 1834.046 1834.046 1834.046
#> 13: 13 D 1797.615 1797.615 1797.615 1797.615 1797.615 1797.615
#> 14: 14 D 1915.832 1915.832 1915.832 1915.832 1915.832 1915.832
#> 15: 15 D 1841.489 1841.489 1841.489 1841.489 1841.489 1841.489
#> 16: 16 D 1798.442 1798.442 1798.442 1798.442 1798.442 1798.442
#> 17: 17 E 1641.359 1641.359 1641.359 1641.359 1641.359 1641.359
#> 18: 18 E 1631.253 1631.253 1631.253 1631.253 1631.253 1631.253
#> 19: 19 E 1634.197 1634.197 1634.197 1634.197 1634.197 1634.197
#> 20: 20 E 1634.991 1634.991 1634.991 1634.991 1634.991 1634.991
#> 7 8 9 10
#> 1: 1767.621 1767.621 1767.621 1767.621
#> 2: 1376.679 1376.679 1376.679 1376.679
#> 3: 1523.100 1523.100 1523.100 1523.100
#> 4: 1794.593 1794.593 1794.593 1794.593
#> 5: 1272.408 1272.408 1272.408 1272.408
#> 6: 1967.709 1967.709 1967.709 1967.709
#> 7: 1792.934 1792.934 1792.934 1792.934
#> 8: 2318.699 2318.699 2318.699 2318.699
#> 9: 2017.187 2017.187 2017.187 2017.187
#> 10: 1899.827 1899.827 1899.827 1899.827
#> 11: 1734.953 1734.953 1734.953 1734.953
#> 12: 1834.046 1834.046 1834.046 1834.046
#> 13: 1797.615 1797.615 1797.615 1797.615
#> 14: 1915.832 1915.832 1915.832 1915.832
#> 15: 1841.489 1841.489 1841.489 1841.489
#> 16: 1798.442 1798.442 1798.442 1798.442
#> 17: 1641.359 1641.359 1641.359 1641.359
#> 18: 1631.253 1631.253 1631.253 1631.253
#> 19: 1634.197 1634.197 1634.197 1634.197
#> 20: 1634.991 1634.991 1634.991 1634.991
run.simul()
#> 1 2 3 4 5 6 7 8 9
#> 1 A 1767.621 1767.621 1767.621 1767.621 1767.621 1767.621 1767.621 1767.621
#> 2 A 1376.679 1376.679 1376.679 1376.679 1376.679 1376.679 1376.679 1376.679
#> 3 A 1523.100 1523.100 1523.100 1523.100 1523.100 1523.100 1523.100 1523.100
#> 4 A 1794.593 1794.593 1794.593 1794.593 1794.593 1794.593 1794.593 1794.593
#> 5 B 1272.408 1272.408 1272.408 1272.408 1272.408 1272.408 1272.408 1272.408
#> 6 B 1967.709 1967.709 1967.709 1967.709 1967.709 1967.709 1967.709 1967.709
#> 7 B 1792.934 1792.934 1792.934 1792.934 1792.934 1792.934 1792.934 1792.934
#> 8 B 2318.699 2318.699 2318.699 2318.699 2318.699 2318.699 2318.699 2318.699
#> 9 C 2017.187 2017.187 2017.187 2017.187 2017.187 2017.187 2017.187 2017.187
#> 10 C 1899.827 1899.827 1899.827 1899.827 1899.827 1899.827 1899.827 1899.827
#> 11 C 1734.953 1734.953 1734.953 1734.953 1734.953 1734.953 1734.953 1734.953
#> 12 C 1834.046 1834.046 1834.046 1834.046 1834.046 1834.046 1834.046 1834.046
#> 13 D 1797.615 1797.615 1797.615 1797.615 1797.615 1797.615 1797.615 1797.615
#> 14 D 1915.832 1915.832 1915.832 1915.832 1915.832 1915.832 1915.832 1915.832
#> 15 D 1841.489 1841.489 1841.489 1841.489 1841.489 1841.489 1841.489 1841.489
#> 16 D 1798.442 1798.442 1798.442 1798.442 1798.442 1798.442 1798.442 1798.442
#> 17 E 1641.359 1641.359 1641.359 1641.359 1641.359 1641.359 1641.359 1641.359
#> 18 E 1631.253 1631.253 1631.253 1631.253 1631.253 1631.253 1631.253 1631.253
#> 19 E 1634.197 1634.197 1634.197 1634.197 1634.197 1634.197 1634.197 1634.197
#> 20 E 1634.991 1634.991 1634.991 1634.991 1634.991 1634.991 1634.991 1634.991
#> 10 NA
#> 1 1767.621 1767.621
#> 2 1376.679 1376.679
#> 3 1523.100 1523.100
#> 4 1794.593 1794.593
#> 5 1272.408 1272.408
#> 6 1967.709 1967.709
#> 7 1792.934 1792.934
#> 8 2318.699 2318.699
#> 9 2017.187 2017.187
#> 10 1899.827 1899.827
#> 11 1734.953 1734.953
#> 12 1834.046 1834.046
#> 13 1797.615 1797.615
#> 14 1915.832 1915.832
#> 15 1841.489 1841.489
#> 16 1798.442 1798.442
#> 17 1641.359 1641.359
#> 18 1631.253 1631.253
#> 19 1634.197 1634.197
#> 20 1634.991 1634.991
# benchmarking
microbenchmark(
a=dcast(rbindlist(lapply(seq_len(runs),value.var = "fitted"),b=run.simul(),times= 10L
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval cld
#> a 46.46851 48.65027 51.09618 49.88775 54.7177 56.2888 10 a
#> b 389.76893 410.57588 418.16993 415.74493 424.2042 448.4604 10 b
由reprex package(v0.3.0)于2020-08-13创建
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。