Re: [问题] data.table取出符合条件的资料

楼主: celestialgod (天)   2014-04-06 04:31:51
我有兴趣知道size较大时,哪个速度比较快
写了一个小程式测试:
library(data.table)
library(dplyr)
library(fastmatch)
library(Rcpp)
library(microbenchmark)
library(rbenchmark)
#' Benchmark several ways of selecting the rows where Sepal.Width == 3.5.
#'
#' Stacks `N` copies of `iris` into one data.frame, derives a keyed data.table
#' and a numeric matrix from it, and times each subsetting approach with
#' `benchmark()` (100 replications per approach).
#'
#' @param N Number of copies of `iris` to stack; a single number >= 1.
#' @return The data frame returned by `benchmark()`, limited to the columns
#'   "test", "replications", "elapsed" and "relative".
perf_test <- function(N) {
  stopifnot(is.numeric(N), length(N) == 1L, !is.na(N), N >= 1)

  # Create all copies at once instead of growing a list inside a loop.
  m <- do.call(rbind, rep(list(iris), N))

  # Keyed data.table: the key is what m2[J(3.5)] / m2[list(3.5)] look up on.
  m2 <- data.table(m)
  setkey(m2, "Sepal.Width")

  # Numeric-only matrix; column 2 is Sepal.Width.
  m3 <- as.matrix(m[, 1:4])

  # fmatch(x, 3.5, nomatch = 0L) gives 1 for matching rows and 0 otherwise.
  # Used directly as a row index it would return row 1 repeated once per
  # match, so it is turned into a logical mask with `> 0L`.
  benchmark(
    replications = 100,
    m[m$Sepal.Width == 3.5, ],
    subset(m, Sepal.Width == 3.5),
    m2[J(3.5)],
    filter(m, Sepal.Width == 3.5),
    filter(m2, Sepal.Width == 3.5),
    m2[list(3.5)],
    m[fmatch(m$Sepal.Width, 3.5, nomatch = 0L) > 0L, ],
    m2[fmatch(m2$Sepal.Width, 3.5, nomatch = 0L) > 0L, ],
    m3[m3[, 2] == 3.5, ],
    columns = c("test", "replications", "elapsed", "relative")
  )
}
# iris的大小
object.size(iris)
# 7088 bytes
# 200倍的资料量
perf_test(200)
test replications elapsed relative
4 filter(m, Sepal.Width == 3.5) 100 0.05 1.0
5 filter(m2, Sepal.Width == 3.5) 100 0.14 2.8
7 m[fmatch(m$Sepal.Width, 3.5, nomatch = 0), ] 100 0.25 5.0
1 m[m$Sepal.Width == 3.5, ] 100 0.44 8.8
8 m2[fmatch(m2$Sepal.Width, 3.5, nomatch = 0)] 100 0.33 6.6
3 m2[J(3.5)] 100 0.17 3.4
6 m2[list(3.5)] 100 0.14 2.8
9 m3[m3[, 2] == 3.5, ] 100 0.22 4.4
2 subset(m, Sepal.Width == 3.5) 100 0.55 11.0
# 500倍的资料量
perf_test(500)
test replications elapsed relative
4 filter(m, Sepal.Width == 3.5) 100 0.15 1.000
5 filter(m2, Sepal.Width == 3.5) 100 0.16 1.067
7 m[fmatch(m$Sepal.Width, 3.5, nomatch = 0), ] 100 0.71 4.733
1 m[m$Sepal.Width == 3.5, ] 100 1.13 7.533
8 m2[fmatch(m2$Sepal.Width, 3.5, nomatch = 0)] 100 0.75 5.000
3 m2[J(3.5)] 100 0.19 1.267
6 m2[list(3.5)] 100 0.16 1.067
9 m3[m3[, 2] == 3.5, ] 100 0.50 3.333
2 subset(m, Sepal.Width == 3.5) 100 1.26 8.400
# 1000倍的资料量
perf_test(1000)
test replications elapsed relative
4 filter(m, Sepal.Width == 3.5) 100 0.27 1.929
5 filter(m2, Sepal.Width == 3.5) 100 0.21 1.500
7 m[fmatch(m$Sepal.Width, 3.5, nomatch = 0), ] 100 1.09 7.786
1 m[m$Sepal.Width == 3.5, ] 100 1.92 13.714
8 m2[fmatch(m2$Sepal.Width, 3.5, nomatch = 0)] 100 0.97 6.929
3 m2[J(3.5)] 100 0.15 1.071
6 m2[list(3.5)] 100 0.14 1.000
9 m3[m3[, 2] == 3.5, ] 100 0.83 5.929
2 subset(m, Sepal.Width == 3.5) 100 2.31 16.500
# 1500倍的资料量
perf_test(1500)
test replications elapsed relative
4 filter(m, Sepal.Width == 3.5) 100 0.45 2.25
5 filter(m2, Sepal.Width == 3.5) 100 0.31 1.55
7 m[fmatch(m$Sepal.Width, 3.5, nomatch = 0), ] 100 1.76 8.80
1 m[m$Sepal.Width == 3.5, ] 100 3.11 15.55
8 m2[fmatch(m2$Sepal.Width, 3.5, nomatch = 0)] 100 1.81 9.05
3 m2[J(3.5)] 100 0.20 1.00
6 m2[list(3.5)] 100 0.21 1.05
9 m3[m3[, 2] == 3.5, ] 100 2.06 10.30
2 subset(m, Sepal.Width == 3.5) 100 3.60 18.00
# 3000倍的资料量
perf_test(3000)
test replications elapsed relative
4 filter(m, Sepal.Width == 3.5) 100 0.82 4.10
5 filter(m2, Sepal.Width == 3.5) 100 0.50 2.50
7 m[fmatch(m$Sepal.Width, 3.5, nomatch = 0), ] 100 3.47 17.35
1 m[m$Sepal.Width == 3.5, ] 100 7.13 35.65
8 m2[fmatch(m2$Sepal.Width, 3.5, nomatch = 0)] 100 3.79 18.95
3 m2[J(3.5)] 100 0.20 1.00
6 m2[list(3.5)] 100 0.22 1.10
9 m3[m3[, 2] == 3.5, ] 100 2.93 14.65
2 subset(m, Sepal.Width == 3.5) 100 7.39 36.95
# 5000倍的资料量
perf_test(5000)
test replications elapsed relative
4 filter(m, Sepal.Width == 3.5) 100 1.46 5.214
5 filter(m2, Sepal.Width == 3.5) 100 0.84 3.000
7 m[fmatch(m$Sepal.Width, 3.5, nomatch = 0), ] 100 6.46 23.071
1 m[m$Sepal.Width == 3.5, ] 100 10.71 38.250
8 m2[fmatch(m2$Sepal.Width, 3.5, nomatch = 0)] 100 7.37 26.321
3 m2[J(3.5)] 100 0.28 1.000
6 m2[list(3.5)] 100 0.34 1.214
9 m3[m3[, 2] == 3.5, ] 100 4.96 17.714
2 subset(m, Sepal.Width == 3.5) 100 13.67 48.821
总结:
以资料量约3544000 bytes(约500倍的iris)为分界:小于此值时,filter + data.frame比较快;
大于此值时,则是m2[J(3.5)] 跟 m2[list(3.5)]比较快
补上平台:windows 7 64 bit SP1, R 3.0.3, i7-3700K@4.3GHz
作者: tokyo291 (工口工口)   2014-04-06 04:49:00
大推XDDD资料量小的时候R被matlab惨电QQ
作者: clickhere (It's time to go home.)   2014-04-06 11:26:00
拿data.frame比matrix?!m<-as.matrix(iris[,-5])应该会快不少,且接近matlab.还大小通吃. 反正都要改type,一样少不了memory copy.感谢.

Links booklink

Contact Us: admin [ a t ] ucptt.com