如何解决R中的相关矩阵:基于阈值快速删除最少数量的特征
假设我有一个相关矩阵:
library(data.table)
set.seed(1)
a <- matrix(rnorm(1000),nrow = 20,ncol = 50)
colnames(a) <- paste0('var',1:50)
a[1:5,1:5]
# var1 var2 var3 var4 var5
# [1,] -0.62645381 0.91897737 -0.1645236 2.401617761 -0.5686687
# [2,] 0.18364332 0.78213630 -0.2533617 -0.039240003 -0.1351786
# [3,] -0.83562861 0.07456498 0.6969634 0.689739362 1.1780870
# [4,] 1.59528080 -1.98935170 0.5566632 0.028002159 -1.5235668
# [5,] 0.32950777 0.61982575 -0.6887557 -0.743273209 0.5939462
cor_mat <- cor(a)
cor_mat[1:5,1:5]
# var1 var2 var3 var4 var5
# var1 1.0000000 -0.21752487 0.2976402 -0.1523604 -0.37085773
# var2 -0.2175249 1.00000000 -0.2839989 0.1778480 -0.06401162
# var3 0.2976402 -0.28399885 1.0000000 0.2180834 0.13805728
# var4 -0.1523604 0.17784796 0.2180834 1.0000000 -0.27922504
# var5 -0.3708577 -0.06401162 0.1380573 -0.2792250 1.00000000
我想使彼此之间具有绝对相关性值小于阈值的要素的数量尽可能保持最大。这是我所做的:
diag(cor_mat) <- 0
feature_removed <- c()
i = 1
temp = 1
threshold = 0.3
while (max(temp) > 0) {
temp <- apply(cor_mat,1,function(x) {
c(sum(abs(x) > threshold),sum(abs(x)))
})
temp <- as.data.table(t(temp))
temp[,features := colnames(cor_mat)]
setorder(temp,-V1,-V2)
head(temp)
# V1 V2 features
# 1: 18 12.19406 var15
# 2: 15 11.16353 var49
# 3: 15 10.82047 var8
# 4: 14 10.99392 var39
# 5: 13 10.69978 var31
# 6: 13 10.52861 var3
feature_removed[i] <- temp[1,features]
indx <- which(colnames(cor_mat) %in% feature_removed[i])
cor_mat <- cor_mat[-indx,-indx]; gc()
temp <- temp[-1,V1]
i = i+1
}
feature_removed
# [1] "var15" "var49" "var39" "var8" "var38" "var31" "var40" "var41" "var14" "var27" "var22" "var45" "var18" "var11" "var23" "var4" "var42" "var13"
# [19] "var43" "var1" "var34" "var2" "var46" "var3" "var35" "var7" "var5" "var21" "var50" "var12" "var36" "var16" "var44" "var48" "var10" "var47"
# [37] "var25" "var33"
cor_mat <- cor(a)
indx <- which(! colnames(cor_mat) %in% feature_removed)
cor_mat[indx,indx]
# var6 var9 var17 var19 var20 var24 var26 var28 var29 var30 var32 var37
# var6 1.00000000 -0.02263859 -0.13155794 0.07141167 -0.26713987 0.203598429 0.08252259 -0.14443949 0.1837023711 0.2563062065 -0.088011524 -0.19424194
# var9 -0.02263859 1.00000000 -0.11915503 0.01410899 0.15464526 -0.134541757 -0.03644167 -0.02363666 0.0989460178 0.1114800840 0.030616311 0.13378112
# var17 -0.13155794 -0.11915503 1.00000000 -0.10029683 0.17083857 0.227839929 0.06118896 0.05873093 0.0998719164 -0.2568822937 -0.094905918 -0.13075200
# var19 0.07141167 0.01410899 -0.10029683 1.00000000 0.09032814 -0.012338821 0.22697915 -0.07484026 -0.0148609726 0.1932314580 -0.103153852 -0.10459271
# var20 -0.26713987 0.15464526 0.17083857 0.09032814 1.00000000 -0.144317569 -0.03582327 0.14659201 -0.0626693480 -0.1783827493 -0.202125723 -0.03297363
# var24 0.20359843 -0.13454176 0.22783993 -0.01233882 -0.14431757 1.000000000 0.17266009 0.12499469 -0.0762715869 0.0407680053 -0.006395668 0.21197313
# var26 0.08252259 -0.03644167 0.06118896 0.22697915 -0.03582327 0.172660086 1.00000000 -0.12126656 -0.0170290134 0.0443483077 -0.004262910 0.06677876
# var28 -0.14443949 -0.02363666 0.05873093 -0.07484026 0.14659201 0.124994690 -0.12126656 1.00000000 0.1570735783 -0.1143660589 -0.045272571 -0.09274355
# var29 0.18370237 0.09894602 0.09987192 -0.01486097 -0.06266935 -0.076271587 -0.01702901 0.15707358 1.0000000000 -0.0009512571 0.052742960 -0.06363072
# var30 0.25630621 0.11148008 -0.25688229 0.19323146 -0.17838275 0.040768005 0.04434831 -0.11436606 -0.0009512571 1.0000000000 0.195731397 0.04830179
# var32 -0.08801152 0.03061631 -0.09490592 -0.10315385 -0.20212572 -0.006395668 -0.00426291 -0.04527257 0.0527429602 0.1957313969 1.000000000 0.10423393
# var37 -0.19424194 0.13378112 -0.13075200 -0.10459271 -0.03297363 0.211973131 0.06677876 -0.09274355 -0.0636307237 0.0483017908 0.104233926 1.00000000
我必须处理10000 x 10000相关矩阵,并且上述方法非常低。我想知道是否有更快的方法。
非常感谢您!
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。