执行许多模拟以避免循环

如何解决执行许多模拟以避免循环

我正在尝试模拟拟合值以更改数据的不同子组的模型，这些值再次基于对原始数据帧的另一个子集的随机采样（我为这个问题组成的最小示例忽略了随机采样等，则所有模拟的拟合值都相同，但这无关紧要）。我编写了一个dplyr代码，用于存储每个组的模型，生成用于预测拟合值的新x值，预测它们等，等等。它生成一列拟合值，正是我想要的。但是，我想完成整个过程1000倍。我当然可以使用for循环（如下面的示例中所示）执行此操作，但是是否有可能在dplyr-pipe行中执行此操作？也许可以加快整个过程的速度（我的原始数据集很大，for循环要花很多时间）？

# making up data
dat <- data.frame("species" =  seq(1:20),"col_A" = runif(20,min=1000,max=2500),"col_B" = runif(20,min = 0,max = 1500),"maximum" = rep(2500,20),"minimum" = rep(1000,groups = rep(LETTERS[1:5],each = 4))

# functions to use with purrr
linear_mod <- function(dat) {
  lm(col_A ~ col_B,data = dat)
}

# define parametres and an empty data frame to use in the for loop
runs <- 10
fitted_sim <- data.frame(matrix(data=NA,nrow=20,ncol=runs+1,byrow=FALSE)) #empty dataframe to contain fitted values for each alt
names(fitted_sim) <- as.factor(seq(1:runs+1))

# for-loop around my dplyr-code
for (j in 1:runs){
  simul <- dat %>%
    group_by(groups) %>%
    nest(data = c(col_A,col_B,species)) %>%
    mutate(model = map(data,linear_mod),# add model for every group
           sim_data = list(seq(minimum,maximum,by = 10))) %>% # define new x-values for later predictions
    unnest(sim_data) %>%
    nest(sim_data = sim_data) %>%
    mutate(fitted = map2(model,sim_data,~predict(.x,col_B = sim_data,type = "response")),# predict values
    unnest(fitted) # unnest predicted values to save in data frame
  
  # save newly fitted values in fitted_sim data frame
  fitted_sim[,1] <- simul$groups
  fitted_sim[,1+j] <- simul$fitted
}

感谢每一个提示！

编辑：这是一个扩展的示例代码，包括上述随机采样，但在我的第一个示例中省略了：

# for-loop around my dplyr-code
for (j in 1:runs) {
  simul <- dat %>%
    group_by(groups) %>%
    rowwise() %>%
    mutate(vector_column = case_when(abs(col_A) == (maximum-minimum) ~ list(col_B),sign(col_A) == 1 ~ list(dat$col_B[dat$col_B <= maximum - col_A]),# using list function to store vectors in a data.frame
                                     sign(col_A) != 1 ~ list(dat$col_B[dat$col_B >= minimum + col_A])),helper = !is_empty(vector_column),# in case some of the vectors are empty so it is not possible to use sample
           col_B_new = ifelse(helper,sample(vector_column,1),NA),helper = NULL,sim_data = list(seq(minimum,by = 10))) %>% # define new x-values for later predictions
    ungroup() %>% # get rid of rowwise()
    group_by(groups) %>%
    unnest(sim_data) %>%
    nest(sim_data = sim_data) %>%
    nest(data = c(col_A,col_B_new,fitted = map2(model,col_B_new = sim_data,type = "response"))) %>% # predict values
    unnest(fitted) # unnest predicted values to save in data frame
           
           # save newly fitted values in fitted_sim data frame
           fitted_sim[,1] <- simul$groups
           fitted_sim[,1+j] <- simul$fitted
}

解决方法

不确定从simul数据框架中到底需要什么，但是各个循环元素似乎彼此独立，因此您可以使用parallel:mclappy在其中运行它们平行。在我的示例中，我只使用lapply，但这已经快了很多-如果可以以任何方式帮助您...

library(tidyverse)
library(data.table)
library(microbenchmark)

# making up data
dat <- data.frame("species" =  seq(1:20),"col_A" = runif(20,min=1000,max=2500),"col_B" = runif(20,min = 0,max = 1500),"maximum" = rep(2500,20),"minimum" = rep(1000,groups = rep(LETTERS[1:5],each = 4))

# functions to use with purrr
linear_mod <- function(dat) {
    lm(col_A ~ col_B,data = dat)
}

# define parametres and an empty data frame to use in the for loop
runs <- 10
fitted_sim <- data.frame(matrix(data=NA,nrow=20,ncol=runs+1,byrow=FALSE)) #empty dataframe to contain fitted values for each alt
names(fitted_sim) <- as.factor(seq(1:runs+1))

# I just wrapped your code in a function for benchmarking
run.simul <- function(){
    for (j in 1:runs){
        simul <- dat %>%
            group_by(groups) %>%
            nest(data = c(col_A,col_B,species)) %>%
            mutate(model = map(data,linear_mod),# add model for every group
                   sim_data = list(seq(minimum,maximum,by = 10))) %>% # define new x-values for later predictions
            unnest(sim_data) %>%
            nest(sim_data = sim_data) %>%
            mutate(fitted = map2(model,sim_data,~predict(.x,col_B = sim_data,type = "response"))) %>% # predict values
            unnest(fitted) # unnest predicted values to save in data frame
        # save newly fitted values in fitted_sim data frame
        fitted_sim[,1] <- simul$groups
        fitted_sim[,1+j] <- simul$fitted
    }
    fitted_sim
}


# my version (sorry for using `data.table`...)
Dat <- data.table(dat,key="groups")
getFits <- function(x){
    x[,fitted := predict(lm(col_A ~ col_B),.(list(seq(minimum[1],maximum[1],by = 10)))),by = groups]
    x[,.(species,groups,fitted)]
}

# get the results into a data.frame; use mclapply instead of lapply if you like
dcast(rbindlist(lapply(seq_len(runs),function(z) getFits(data.table(dat,key="groups"))),idcol = "run"),... ~ run,value.var = "fitted")
#>     species groups        1        2        3        4        5        6
#>  1:       1      A 1767.621 1767.621 1767.621 1767.621 1767.621 1767.621
#>  2:       2      A 1376.679 1376.679 1376.679 1376.679 1376.679 1376.679
#>  3:       3      A 1523.100 1523.100 1523.100 1523.100 1523.100 1523.100
#>  4:       4      A 1794.593 1794.593 1794.593 1794.593 1794.593 1794.593
#>  5:       5      B 1272.408 1272.408 1272.408 1272.408 1272.408 1272.408
#>  6:       6      B 1967.709 1967.709 1967.709 1967.709 1967.709 1967.709
#>  7:       7      B 1792.934 1792.934 1792.934 1792.934 1792.934 1792.934
#>  8:       8      B 2318.699 2318.699 2318.699 2318.699 2318.699 2318.699
#>  9:       9      C 2017.187 2017.187 2017.187 2017.187 2017.187 2017.187
#> 10:      10      C 1899.827 1899.827 1899.827 1899.827 1899.827 1899.827
#> 11:      11      C 1734.953 1734.953 1734.953 1734.953 1734.953 1734.953
#> 12:      12      C 1834.046 1834.046 1834.046 1834.046 1834.046 1834.046
#> 13:      13      D 1797.615 1797.615 1797.615 1797.615 1797.615 1797.615
#> 14:      14      D 1915.832 1915.832 1915.832 1915.832 1915.832 1915.832
#> 15:      15      D 1841.489 1841.489 1841.489 1841.489 1841.489 1841.489
#> 16:      16      D 1798.442 1798.442 1798.442 1798.442 1798.442 1798.442
#> 17:      17      E 1641.359 1641.359 1641.359 1641.359 1641.359 1641.359
#> 18:      18      E 1631.253 1631.253 1631.253 1631.253 1631.253 1631.253
#> 19:      19      E 1634.197 1634.197 1634.197 1634.197 1634.197 1634.197
#> 20:      20      E 1634.991 1634.991 1634.991 1634.991 1634.991 1634.991
#>            7        8        9       10
#>  1: 1767.621 1767.621 1767.621 1767.621
#>  2: 1376.679 1376.679 1376.679 1376.679
#>  3: 1523.100 1523.100 1523.100 1523.100
#>  4: 1794.593 1794.593 1794.593 1794.593
#>  5: 1272.408 1272.408 1272.408 1272.408
#>  6: 1967.709 1967.709 1967.709 1967.709
#>  7: 1792.934 1792.934 1792.934 1792.934
#>  8: 2318.699 2318.699 2318.699 2318.699
#>  9: 2017.187 2017.187 2017.187 2017.187
#> 10: 1899.827 1899.827 1899.827 1899.827
#> 11: 1734.953 1734.953 1734.953 1734.953
#> 12: 1834.046 1834.046 1834.046 1834.046
#> 13: 1797.615 1797.615 1797.615 1797.615
#> 14: 1915.832 1915.832 1915.832 1915.832
#> 15: 1841.489 1841.489 1841.489 1841.489
#> 16: 1798.442 1798.442 1798.442 1798.442
#> 17: 1641.359 1641.359 1641.359 1641.359
#> 18: 1631.253 1631.253 1631.253 1631.253
#> 19: 1634.197 1634.197 1634.197 1634.197
#> 20: 1634.991 1634.991 1634.991 1634.991

run.simul()
#>    1        2        3        4        5        6        7        8        9
#> 1  A 1767.621 1767.621 1767.621 1767.621 1767.621 1767.621 1767.621 1767.621
#> 2  A 1376.679 1376.679 1376.679 1376.679 1376.679 1376.679 1376.679 1376.679
#> 3  A 1523.100 1523.100 1523.100 1523.100 1523.100 1523.100 1523.100 1523.100
#> 4  A 1794.593 1794.593 1794.593 1794.593 1794.593 1794.593 1794.593 1794.593
#> 5  B 1272.408 1272.408 1272.408 1272.408 1272.408 1272.408 1272.408 1272.408
#> 6  B 1967.709 1967.709 1967.709 1967.709 1967.709 1967.709 1967.709 1967.709
#> 7  B 1792.934 1792.934 1792.934 1792.934 1792.934 1792.934 1792.934 1792.934
#> 8  B 2318.699 2318.699 2318.699 2318.699 2318.699 2318.699 2318.699 2318.699
#> 9  C 2017.187 2017.187 2017.187 2017.187 2017.187 2017.187 2017.187 2017.187
#> 10 C 1899.827 1899.827 1899.827 1899.827 1899.827 1899.827 1899.827 1899.827
#> 11 C 1734.953 1734.953 1734.953 1734.953 1734.953 1734.953 1734.953 1734.953
#> 12 C 1834.046 1834.046 1834.046 1834.046 1834.046 1834.046 1834.046 1834.046
#> 13 D 1797.615 1797.615 1797.615 1797.615 1797.615 1797.615 1797.615 1797.615
#> 14 D 1915.832 1915.832 1915.832 1915.832 1915.832 1915.832 1915.832 1915.832
#> 15 D 1841.489 1841.489 1841.489 1841.489 1841.489 1841.489 1841.489 1841.489
#> 16 D 1798.442 1798.442 1798.442 1798.442 1798.442 1798.442 1798.442 1798.442
#> 17 E 1641.359 1641.359 1641.359 1641.359 1641.359 1641.359 1641.359 1641.359
#> 18 E 1631.253 1631.253 1631.253 1631.253 1631.253 1631.253 1631.253 1631.253
#> 19 E 1634.197 1634.197 1634.197 1634.197 1634.197 1634.197 1634.197 1634.197
#> 20 E 1634.991 1634.991 1634.991 1634.991 1634.991 1634.991 1634.991 1634.991
#>          10       NA
#> 1  1767.621 1767.621
#> 2  1376.679 1376.679
#> 3  1523.100 1523.100
#> 4  1794.593 1794.593
#> 5  1272.408 1272.408
#> 6  1967.709 1967.709
#> 7  1792.934 1792.934
#> 8  2318.699 2318.699
#> 9  2017.187 2017.187
#> 10 1899.827 1899.827
#> 11 1734.953 1734.953
#> 12 1834.046 1834.046
#> 13 1797.615 1797.615
#> 14 1915.832 1915.832
#> 15 1841.489 1841.489
#> 16 1798.442 1798.442
#> 17 1641.359 1641.359
#> 18 1631.253 1631.253
#> 19 1634.197 1634.197
#> 20 1634.991 1634.991

# benchmarking
microbenchmark(
    a=dcast(rbindlist(lapply(seq_len(runs),value.var = "fitted"),b=run.simul(),times= 10L
    )
#> Unit: milliseconds
#>  expr       min        lq      mean    median       uq      max neval cld
#>     a  46.46851  48.65027  51.09618  49.88775  54.7177  56.2888    10  a 
#>     b 389.76893 410.57588 418.16993 415.74493 424.2042 448.4604    10   b

^{由reprex package（v0.3.0）于2020-08-13创建}

执行许多模拟以避免循环

如何解决执行许多模拟以避免循环

解决方法

相关推荐