--- title: "4. Cross-validation by mask matrix/tensor (`kFoldMaskTensor`)" author: - name: Koki Tsuyuzaki affiliation: Laboratory for Bioinformatics Research, RIKEN Center for Biosystems Dynamics Research email: k.t.the-answer@hotmail.co.jp - name: Itoshi Nikaido affiliation: Laboratory for Bioinformatics Research, RIKEN Center for Biosystems Dynamics Research date: "`r Sys.Date()`" bibliography: bibliography.bib package: nnTensor output: rmarkdown::html_vignette vignette: | %\VignetteIndexEntry{4. Cross-validation by mask matrix/tensor (`kFoldMaskTensor`)} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- # Introduction In this vignette, we introduce a cross-validation approach using mask tensor [@mask1; @mask2] to estimate the optimal rank parameter for matrix/tensor decomposition. Mask matrix/tensor has the same size of data matrix/tensor and contains only 0 or 1 elements, 0 means not observed value, and 1 means observed value. In this approach, only non-zero and non-NA elements are randomly specified as 0 and estimated by other elements reconstructed by the result of matrix/tensor decomposition. This can be considered a cross-validation approach because the elements specified as 1 are the training dataset and the elements specified as 0 are the test dataset. Here, we use these packages. ```{r package, echo=TRUE} library("nnTensor") library("ggplot2") library("dplyr") ``` Here, we use three types of demo data as follows: ```{r data, echo=TRUE} data_matrix <- toyModel("NMF") data_matrices <- toyModel("siNMF_Easy") data_tensor <- toyModel("CP") ``` # NMF To set mask matrix/tensor, here we use `kFoldMaskTensor`, which divides only the elements other than NA and 0 into `k` mask matrices/tensors. In `NMF`, the mask matrix can be specified as `M` and the rank parameter (`J`) with the smallest test error is considered the optimal rank. Here, three mask matrices are prepared for each rank, and the average is used to estimate the optimal rank. ```{r nmf_mask, echo=TRUE} out_NMF <- expand.grid(replicate=1:3, rank=factor(1:10), value=0) count <- 1 for(i in 1:10){ masks_NMF <- kFoldMaskTensor(data_matrix, k=3) for(j in 1:3){ out_NMF[count, 3] <- rev( NMF(data_matrix, M = masks_NMF[[j]], J = i)$TestRecError)[1] count <- count + 1 } } ``` Looking at the average test error for each rank, the optimal rank appears to be around 8 to 10 (with some variation depending on random seeds). ```{r nmf_plot, echo=TRUE, fig.width=10, fig.height=4} ggplot(out_NMF, aes(x=rank, y=value)) + geom_point() + stat_summary(fun = mean, geom = "point", shape=21, size=3, fill="blue") + stat_summary(fun = mean, geom = "line", colour = "blue", aes(group=1)) + xlab("Rank") + ylab("Test Reconstruction Error") ``` ```{r nmf_min, echo=TRUE} (group_by(out_NMF, rank) |> summarize(Avg = mean(value)) -> avg_test_error_NMF) avg_test_error_NMF[which(avg_test_error_NMF$Avg == min(avg_test_error_NMF$Avg))[1], ] ``` # NMTF Same as `NMF`, mask matrix `M` can be specified in `NMTF`, and the rank parameter (`rank`) with the smallest test error is considered the optimal rank. The following code is time-consuming and should be executed in your own environment. ```{r nmtf_mask, echo=TRUE, eval=FALSE} out_NMTF <- expand.grid(replicate=1:3, rank2=1:10, rank1=1:10, value=0) rank_NMTF <- paste0(out_NMTF$rank1, "-", out_NMTF$rank2) out_NMTF <- cbind(out_NMTF, rank=factor(rank_NMTF, levels=unique(rank_NMTF))) count <- 1 for(i in 1:10){ for(j in 1:10){ masks_NMTF <- kFoldMaskTensor(data_matrix, k=3) for(k in 1:3){ out_NMTF[count, 4] <- rev( NMTF(data_matrix, M = masks_NMTF[[k]], rank = c(i, j))$TestRecError)[1] count <- count + 1 } } } ``` ```{r nmtf_plot, echo=TRUE, eval=FALSE, fig.width=10, fig.height=4} ggplot(out_NMTF, aes(x=rank, y=value)) + geom_point() + stat_summary(fun = mean, geom = "point", shape=21, size=3, fill="blue") + stat_summary(fun = mean, geom = "line", colour = "blue", aes(group=1)) + xlab("Rank") + ylab("Test Reconstruction Error") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) ``` ```{r nmtf_min, echo=TRUE, eval=FALSE} (out_NMTF |> group_by(rank1, rank2) |> summarize(Avg = mean(value)) -> avg_test_error_NMTF) avg_test_error_NMTF[which(avg_test_error_NMTF$Avg == min(avg_test_error_NMTF$Avg))[1], ] ``` # siNMF Same as `NMF`, mask matrices `M` can be specified in `siNMF`, and the rank parameter (`J`) with the smallest test error is considered the optimal rank. The following code is time-consuming and should be executed in your own environment. ```{r sinmf_mask, echo=TRUE, eval=FALSE} out_siNMF <- expand.grid(replicate=1:3, rank=factor(1:10), value=0) count <- 1 for(i in 1:10){ masks_siNMF <- lapply(1:3, function(x){ list( kFoldMaskTensor(data_matrices[[1]], k=3)[[x]], kFoldMaskTensor(data_matrices[[2]], k=3)[[x]], kFoldMaskTensor(data_matrices[[3]], k=3)[[x]]) }) for(j in 1:3){ out_siNMF[count, 3] <- rev( siNMF(data_matrices, M = masks_siNMF[[j]], J = i)$TestRecError)[1] count <- count + 1 } } ``` ```{r sinmf_plot, echo=TRUE, eval=FALSE, fig.width=10, fig.height=4} ggplot(out_siNMF, aes(x=rank, y=value)) + geom_point() + stat_summary(fun = mean, geom = "point", shape=21, size=3, fill="blue") + stat_summary(fun = mean, geom = "line", colour = "blue", aes(group=1)) + xlab("Rank") + ylab("Test Reconstruction Error") ``` ```{r sinmf_min, echo=TRUE, eval=FALSE} (out_siNMF |> group_by(rank) |> summarize(Avg = mean(value)) -> avg_test_error_siNMF) avg_test_error_siNMF[which(avg_test_error_siNMF$Avg == min(avg_test_error_siNMF$Avg))[1], ] ``` # jNMF Same as `NMF`, mask matrices `M` can be specified in `jNMF`, and the rank parameter (`J`) with the smallest test error is considered the optimal rank. The following code is time-consuming and should be executed in your own environment. ```{r jnmf_mask, echo=TRUE, eval=FALSE} out_jNMF <- expand.grid(replicate=1:3, rank=factor(1:10), value=0) count <- 1 for(i in 1:10){ masks_jNMF <- lapply(1:3, function(x){ list( kFoldMaskTensor(data_matrices[[1]], k=3)[[x]], kFoldMaskTensor(data_matrices[[2]], k=3)[[x]], kFoldMaskTensor(data_matrices[[3]], k=3)[[x]]) }) for(j in 1:3){ out_jNMF[count, 3] <- rev( jNMF(data_matrices, M = masks_jNMF[[j]], J = i)$TestRecError)[1] count <- count + 1 } } ``` ```{r jnmf_plot, echo=TRUE, eval=FALSE, fig.width=10, fig.height=4} ggplot(out_jNMF, aes(x=rank, y=value)) + geom_point() + stat_summary(fun = mean, geom = "point", shape=21, size=3, fill="blue") + stat_summary(fun = mean, geom = "line", colour = "blue", aes(group=1)) + xlab("Rank") + ylab("Test Reconstruction Error") ``` ```{r jnmf_min, echo=TRUE, eval=FALSE} (out_jNMF |> group_by(rank) |> summarize(Avg = mean(value)) -> avg_test_error_jNMF) avg_test_error_jNMF[which(avg_test_error_jNMF$Avg == min(avg_test_error_jNMF$Avg))[1], ] ``` # NTF Same as `NMF`, mask tensor `M` can be specified in `NTF`, and the rank parameter (`rank`) with the smallest test error is considered the optimal rank. The following code is time-consuming and should be executed in your own environment. ```{r ntf_mask, echo=TRUE, eval=FALSE} out_NTF <- expand.grid(replicate=1:3, rank=factor(1:10), value=0) count <- 1 for(i in 1:10){ masks_NTF <- kFoldMaskTensor(data_tensor, k=3) for(j in 1:3){ out_NTF[count, 3] <- rev( NTF(data_tensor, M = masks_NTF[[j]], rank = i)$TestRecError)[1] count <- count + 1 } } ``` ```{r ntf_plot, echo=TRUE, eval=FALSE, fig.width=10, fig.height=4} ggplot(out_NTF, aes(x=rank, y=value)) + geom_point() + stat_summary(fun = mean, geom = "point", shape=21, size=3, fill="blue") + stat_summary(fun = mean, geom = "line", colour = "blue", aes(group=1)) + xlab("Rank") + ylab("Test Reconstruction Error") ``` ```{r ntf_min, echo=TRUE, eval=FALSE} (group_by(out_NTF, rank) |> summarize(Avg = mean(value)) -> avg_test_error_NTF) avg_test_error_NTF[which(avg_test_error_NTF$Avg == min(avg_test_error_NTF$Avg))[1], ] ``` # NTD Same as `NMF`, mask tensor `M` can be specified in `NTD`, and the rank parameter (`rank`) with the smallest test error is considered the optimal rank. The following code is time-consuming and should be executed in your own environment. ```{r ntd_mask, echo=TRUE, eval=FALSE} out_NTD <- expand.grid(replicate=1:3, rank3=1:5, rank2=1:5, rank1=1:5, value=0) rank_NTD <- paste0(out_NTD$rank1, "-", out_NTD$rank2, "-", out_NTD$rank3) out_NTD <- cbind(out_NTD, rank=factor(rank_NTD, levels=unique(rank_NTD))) count <- 1 for(i in 1:5){ for(j in 1:5){ for(k in 1:5){ masks_NTD <- kFoldMaskTensor(data_tensor, k=3) for(k in 1:3){ out_NTD[count, 5] <- rev( NTD(data_tensor, M = masks_NTD[[k]], rank = c(i, j, k))$TestRecError)[1] count <- count + 1 } } } } ``` ```{r ntd_plot, echo=TRUE, eval=FALSE, fig.width=10, fig.height=4} ggplot(out_NTD, aes(x=rank, y=value)) + geom_point() + stat_summary(fun = mean, geom = "point", shape=21, size=3, fill="blue") + stat_summary(fun = mean, geom = "line", colour = "blue", aes(group=1)) + xlab("Rank") + ylab("Test Reconstruction Error") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) ``` ```{r ntd_min, echo=TRUE, eval=FALSE} (out_NTD |> group_by(rank1, rank2, rank3) |> summarize(Avg = mean(value)) -> avg_test_error_NTD) avg_test_error_NTD[which(avg_test_error_NTD$Avg == min(avg_test_error_NTD$Avg))[1], ] ``` # Session Information {.unnumbered} ```{r sessionInfo, echo=FALSE} sessionInfo() ``` # References