[R] How do we do correlation for big matrices?

Marna Wagley marna.wagley at gmail.com
Sat Dec 26 19:55:39 CET 2015

```Hi R users,
I have a very big two matrices of 12 columns and over 0.5 million columns
(50,4710) and trying to get correlation value between two tables but I
could not compute it because of big files.
Would you give me any suggestion on how I can do the correlations for the
big files?

I used the following codes and the example data.

df1<-structure(list(X = structure(c(1L, 5L, 6L, 7L, 8L, 9L, 10L, 11L,
12L, 2L, 3L, 4L), .Label = c("env1", "env10", "env11", "env12",
"env2", "env3", "env4", "env5", "env6", "env7", "env8", "env9"
), class = "factor"), site1 = c(0.38, 0.83, 0.53, 0.48, 0.66,
0.09, 0.21, 0.02, 0.76, 0.62, 0.2, 0.47), site2 = c(0.19, 0.14,
0.66, 0.35, 0.18, 0.24, 0.18, 0.2, 0.86, 0.06, 0.51, 0.29), site3 = c(0.95,
0.51, 0.91, 0.48, 0.74, 0.67, 0.34, 0.72, 0.43, 0.49, 0.1, 0.48
), site4 = c(0.89, 0.54, 0.93, 0.18, 0.99, 0.21, 0.69, 0.29,
0.89, 0.84, 0.45, 0.2), site5 = c(0.38, 0.37, 0.01, 0.26, 0.97,
0.49, 0.39, 0.31, 0.14, 0.83, 0.99, 0.2), site6 = c(0.68, 0.67,
0.6, 0.92, 0.01, 0.04, 0.49, 0.38, 0.5, 0.37, 0.51, 0.17), site7 = c(0.08,
0.54, 0.31, 0.3, 0.77, 0.39, 0.03, 0.51, 0.28, 0.32, 0.86, 0.95
), site8 = c(0.54, 0.26, 0.87, 0.91, 0.12, 0.51, 0.31, 0.67,
0.69, 0.79, 0.76, 0.08), site9 = c(0.1, 0.68, 0.17, 0.44, 0.78,
0.9, 0.16, 0.31, 0.13, 0.34, 0.9, 0.16), site10 = c(0.53, 0.31,
0.88, 0.61, 0.92, 0.44, 0.92, 0.94, 0.55, 0.8, 0.27, 0.07)), .Names =
c("X",
"site1", "site2", "site3", "site4", "site5", "site6", "site7",
"site8", "site9", "site10"), class = "data.frame", row.names = c(NA,
-12L))
df1<-df1[-1]

df2<-structure(list(X = structure(c(1L, 5L, 6L, 7L, 8L, 9L, 10L, 11L,
12L, 2L, 3L, 4L), .Label = c("env1", "env10", "env11", "env12",
"env2", "env3", "env4", "env5", "env6", "env7", "env8", "env9"
), class = "factor"), site1 = c(0.36, 0.29, 0.09, 0.07, 0.82,
0.88, 0.59, 0.57, 0.2, 0.29, 0.76, 0.2), site2 = c(0.91, 0.87,
0.91, 0.54, 0.53, 0.2, 0.23, 0.16, 0.42, 0.44, 0.01, 0.29), site3 = c(0.96,
0.56, 0.34, 0.34, 0.6, 0.63, 0.28, 0.25, 0.73, 0.45, 0.88, 0.39
), site4 = c(0.73, 0.79, 0.39, 0.59, 0.63, 0.24, 0.69, 0.94,
0.07, 0.23, 0.01, 0.99), site5 = c(0.88, 0.18, 0.37, 0.24, 0.61,
0.61, 0.54, 0.71, 0.12, 0.82, 0.26, 0.5), site6 = c(0.43, 0.52,
0.01, 0.76, 0.41, 0.57, 0.08, 0.75, 0.82, 0.98, 0.61, 0.74),
site7 = c(0.84, 0.14, 0.96, 0.04, 0.41, 0.84, 0.26, 0.59,
0.29, 0.3, 0.76, 0.05), site8 = c(0.12, 0.18, 0.75, 0.23,
0.96, 0.64, 0.33, 0.61, 0.25, 0.13, 0.99, 0.6), site9 = c(0.26,
0.58, 0.32, 0.67, 0.11, 0.8, 0.87, 0.05, 0.03, 0.47, 0.95,
0.81), site10 = c(0.94, 0.63, 0.64, 0.5, 0.94, 0.75, 0.44,
0.57, 0.19, 0.23, 0.08, 0.18)), .Names = c("X", "site1",
"site2", "site3", "site4", "site5", "site6", "site7", "site8",
"site9", "site10"), class = "data.frame", row.names = c(NA, -12L
))
df2<-df2[-1]
df2
# here I put only 12 columns, but as I mentioned above I have more than 1/2
million columns
cor_site<-data.matrix(diag(cor(df1,df2)))
It works fine for a small data but this big files did not work.