mi/ 0000755 0001762 0000144 00000000000 15055520652 010664 5 ustar ligges users mi/tests/ 0000755 0001762 0000144 00000000000 12513713722 012024 5 ustar ligges users mi/tests/missing_data.frame.R 0000644 0001762 0000144 00000001575 12513723341 015710 0 ustar ligges users stopifnot(require(mi))
rdf <- rdata.frame(N = 100, n_partial = 2, n_full = 2)
mdf <- missing_data.frame(rdf$obs)
rdf <- rdata.frame(N = 100, n_partial = 6, n_full = 1,
types = c("ordinal", "cont", "count", "binary",
"proportion", "positive", "nominal"))
mdf <- missing_data.frame(rdf$obs)
mdf <- missing_data.frame(rdf$obs, favor_positive = TRUE)
rdf <- rdata.frame(N = 100, n_partial = 5, n_full = 1, experiment = TRUE,
types = c("treatment", "cont", "count", "binary",
"proportion", "positive"))
mdf <- missing_data.frame(rdf$obs, subclass = "experiment", concept =
as.factor(c("treatment", rep("covariate", 4), "outcome")))
rdf <- rdata.frame(N = 100, n_partial = 5, n_full = 0, types = "ordinal")
mdf <- missing_data.frame(rdf$obs, subclass = "allcategorical")
mi/tests/missing_variable.R 0000644 0001762 0000144 00000002107 12513714366 015472 0 ustar ligges users stopifnot(require(mi))
x <- rnorm(10)
x[1] <- NA
y <- missing_variable(x, type = "continuous")
y <- missing_variable(x, type = "irrelevant")
x <- rep(1, 10)
y <- missing_variable(x, type = "fixed")
x <- rep(1:5, each = 2)
y <- missing_variable(x, type = "group")
x[1] <- NA
y <- missing_variable(x, type = "unordered-categorical")
y <- missing_variable(x, type = "ordered-categorical")
y <- missing_variable(x, type = "interval")
x <- rbinom(10, size = 1, prob = 0.5)
x[1] <- NA
y <- missing_variable(x, type = "binary")
y <- missing_variable(x, type = "grouped-binary", strata = rep(c("A", "B"), each = 5))
x <- runif(10)
x[1] <- NA
y <- missing_variable(x, type = "bounded-continuous", lower = 0, upper = 1)
y <- missing_variable(x, type = "positive-continuous")
y <- missing_variable(x, type = "proportion")
x[which.min(x)] <- 0
y <- missing_variable(x, type = "nonnegative-continuous")
y <- missing_variable(x, type = "SC_proportion")
x[which.max(x)] <- 1
y <- missing_variable(x, type = "SC_proportion")
x <- rpois(10, lambda = 5)
x[1] <- NA
y <- missing_variable(x, type = "count")
mi/MD5 0000644 0001762 0000144 00000006732 15055520652 011204 0 ustar ligges users 1bddc4f904809dc4fa802256e478a697 *DESCRIPTION
3cff9345a28fcf308d27cd4fb1024fc2 *NAMESPACE
d40a3d06f38c44bdd479a06006eeca4d *R/AllClass.R
3ce3aa32589799fc9cc255946ced225b *R/AllGeneric.R
54582ada9780d1d0a788d3f82c90ebfb *R/change.R
039e4c0888302c0ab552e8407ff21dd1 *R/change_family.R
18dc62aa8a9efe1667e0e4ed1f341d71 *R/change_imputation_method.R
c432254da0cc432e7c813d5ab815eb5c *R/change_link.R
5e56fe5aa1f5a955a3422e80a4cfdda1 *R/change_model.R
586ede7677395ec0f91abbc2dc33a0ac *R/change_size.R
a7553ff494bd3c261a84ca4b4ad39b6b *R/change_transformation.R
91374c98b46448341e2a46956d9d71c9 *R/change_type.R
dfd85e6c34acecb9b4a7a4af970e2b6b *R/complete.R
339925dbf362cf49d2f22e48edb9c845 *R/convenience.R
2f6f8dab8d88b66a080c39aa1bb80579 *R/debug.R
6eff90fe590a73070e77c5d305ff1c8b *R/fit_model.R
f7dfe0685bf2085ec8dcf96943dbabdb *R/get_parameters.R
50d72e195ba8724cb42a226f74de4649 *R/hist_methods.R
af4c014536a23866f57add672aad7b96 *R/mi.R
9c9b6aeabc5b47d11f268a5caf91ee0c *R/misc.R
1442c0b84378b7fed9497b8d7dfa4804 *R/missing_data.frame.R
ccec62837773157bbdea42e4cbed0487 *R/missing_variable.R
e460dbaa09804ae44f75c202d33302ff *R/plot_methods.R
176c56f95ae64b9803d6bbe923321a21 *R/pool.R
1f0960308ae1258cacd265ee621d4a5d *R/random_df.R
a0a5aef6f895a573d55af58cf42f5d3a *R/sysdata.rda
ff146a33b7e5b018a84868092721901d *R/tobin5.R
ae4d04291a56901290e61570fe88d109 *R/zzz.R
03a6049ae53674cdb496abc0b4001da7 *build/partial.rdb
790c28f7f6349818b757aa3c38acc2cf *build/vignette.rds
379ddfae591ad8ea6fabfd77c1d617ec *data/CHAIN.RData
f0d0be59b38944ccb05c7db05f83919e *data/nlsyV.RData
49c2291ac7f09d1637fcab7878765298 *inst/CITATION
6a8216b055a61e0b2d938e2af41f6abb *inst/doc/mi_vignette.R
3bb603698bbb07d97015c41d35f18d4d *inst/doc/mi_vignette.Rmd
6f07a39277e7b823e55061469132fd06 *inst/doc/mi_vignette.pdf
9681419f7f3ea8cbf1e49216d6afd458 *man/00mi-package.Rd
466b9739a71d35f52fb377d69ce5bfe8 *man/01missing_variable.Rd
b406b6e3b54fd23330f1f67eda72aa80 *man/02missing_data.frame.Rd
62cfd3ae53c7fe548bcc066e97da961f *man/03change.Rd
588665cf7644c8632efab60fc9713990 *man/04mi.Rd
4f3212bf4cb51aeaf2c2d1e8bd35c56e *man/05Rhats.Rd
a7081eae2613236aac735bf21148e442 *man/06pool.Rd
a7ad72bef3c84337ab27f298f1a76604 *man/07complete.Rd
ad8d0f17211c20431d4c6c59b47eec55 *man/CHAIN.Rd
4a45140aee0a1fbd45df9fd13fa588da *man/allcategorical_missing_data.frame.Rd
6fb0d4bdc81aed727ced3b3a5118e2a0 *man/bounded.Rd
85cf2151c8d5d8bf7e267905ca56d267 *man/categorical.Rd
12fcc6e58a9b9257f5b393a1f51e3f77 *man/censored-continuous.Rd
7ec42b8bb866483c2dcabe15f377f796 *man/continuous.Rd
9f73b3e29f8b96367c861066ad218399 *man/count.Rd
4cc12091ac5adda3eb0ebde9faba8c98 *man/experiment_missing_data.frame.Rd
f18f64b52237f415bec92493213821d9 *man/fit_model.Rd
c97ec81190f1c18478e2d48014a312a9 *man/get_parameters.Rd
406b1e3fab5930b77e4cab8cc46a3586 *man/hist.Rd
5b1544026cdd649d2a90a8af69faaad4 *man/irrelevant.Rd
1595a8710c4ec7b4b7525e12e350292c *man/mi-internal.Rd
84fc689124c97d9fb2eed41706010615 *man/mi2stata.Rd
0522e2704f2ac6e3c01595a37b993f90 *man/mipply.Rd
520f42e38cf48115186e179da03a3585 *man/multilevel_missing_data.frame.Rd
6f389295b9770e77ee6804aba47f2ad0 *man/multinomial.Rd
ec518486b5e8d55d014eaae022845e02 *man/nlsyV.Rd
55fb1ae5649deec7a3b7268968966d8d *man/positive.Rd
fa71e7cf4f93a993407164ec993fadde *man/rdata.frame.Rd
d29dd537c8ef0895f39db27059264737 *man/semi-continuous.Rd
2b34d0c66bd45e044b5fe119737f7540 *tests/missing_data.frame.R
276644cf2dc960b51d26396d74b87c15 *tests/missing_variable.R
3bb603698bbb07d97015c41d35f18d4d *vignettes/mi_vignette.Rmd
mi/R/ 0000755 0001762 0000144 00000000000 14247027226 011066 5 ustar ligges users mi/R/convenience.R 0000644 0001762 0000144 00000010450 12513634171 013502 0 ustar ligges users # Part of the mi package for multiple imputation of missing data
# Copyright (C) 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Trustees of Columbia University
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
## Some S3 methods for convenience
as.double.missing_variable <-
function(x, ...) {
stop("you must write an 'as.double' method for the", class(x), "class")
}
as.double.categorical <-
function(x, ...) {
x@data
}
as.double.continuous <-
function(x, transformed = TRUE, ...) {
if(transformed) x@data
else x@inverse_transformation(x@data)
}
as.double.count <-
function(x, ...) {
x@data
}
as.double.irrelevant <-
function(x, ...) {
as.double(x@raw_data)
}
as.double.missing_data.frame <-
function(x, transformed = TRUE, ...) {
sapply(x@variables, as.double, transformed = transformed)
}
as.data.frame.missing_data.frame <-
function(x, row.names = NULL, optional = FALSE, ...) {
as.data.frame(lapply(x@variables, FUN = function(y) y@raw_data),
row.names = if(is.null(row.names)) rownames(x) else row.names)
}
dim.missing_data.frame <-
function(x) {
x@DIM
}
dimnames.missing_data.frame <-
function(x) {
x@DIMNAMES
}
names.missing_data.frame <-
function(x) {
x@DIMNAMES[[2]]
}
dim.mi <-
function(x) {
if(isS4(x)) x@data[[1]]@DIM else {
class(x) <- "list"
return(dim(x))
}
}
dimnames.mi <-
function(x) {
if(isS4(x)) x@data[[1]]@DIMNAMES else {
class(x) <- "list"
return(dimnames(x))
}
}
names.mi <-
function(x) {
if(isS4(x)) x@data[[1]]@DIMNAMES[[2]] else {
class(x) <- "list"
return(names(x))
}
}
is.na.missing_variable <-
function(x) {
out <- rep(FALSE, x@n_total)
out[x@which_miss] <- TRUE
return(out)
}
is.na.missing_data.frame <-
function(x) {
sapply(x@variables, is.na)
}
is.na.mi <-
function(x) {
if(isS4(x)) is.na(x@data[[1]]) else {
class(x) <- "list"
return(is.na(x))
}
}
length.missing_variable <-
function(x) {
x@n_total
}
length.missing_data.frame <-
function(x) {
ncol(x)
}
length.mi <-
function(x) {
if(isS4(x)) length(x@data) else {
class(x) <- "list"
return(length(x))
}
}
print.mdf_list <-
function(x ,...) {
show(x)
}
print.mi_list <-
function(x, ...) {
show(x)
}
"[.missing_data.frame" <-
function(x, i, j, drop = if (missing(i)) TRUE else length(j) == 1) {
if(!missing(i)) {
cdf <- complete(x, m = 0L)
if(!missing(j)) return(cdf[i,j,drop = drop])
else return(cdf[i,,drop = drop])
}
else if(length(j) > 1) return(new(class(x), variables = x@variables[j]))
else if(is.numeric(j) && j < 0) return(new(class(x), variables = x@variables[j]))
else return(x@variables[[j]])
}
"[<-.missing_data.frame" <-
function (x, i, j, value) {
if(!missing(i)) {
if(!missing(j)) x@variables[[j]]@raw_data[i,] <- value
else stop("a variable (column) must be specified when replacing")
}
else if(is.null(value)) x@variables[j] <- value
else if(is(value, "missing_variable")) x@variables[[j]] <- value
else stop("replacement must be 'NULL' or a 'missing_variable'")
return(new(class(x), variables = x@variables))
}
"[[.missing_data.frame" <-
function(x, ..., exact = TRUE) {
return(x[,...])
}
"[[<-.missing_data.frame" <-
function (x, i, j, value) {
if(missing(j)) x[,i] <- value
else x[i,j] <- value
return(x)
}
"$.missing_data.frame" <-
function(x, name) {
return(x[,name])
}
"$<-.missing_data.frame" <-
function(x, name, value) { # this never gets dispatched for some reason
x[,name] <- value
return(x)
}
mi/R/complete.R 0000644 0001762 0000144 00000014137 12513634171 013024 0 ustar ligges users # Part of the mi package for multiple imputation of missing data
# Copyright (C) 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Trustees of Columbia University
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
## These functions extract completed data
setMethod("complete", signature(y = "missing_variable", m = "integer"), def =
function(y, m, ...) {
out <- y@data
if(m > 0 & y@n_drawn) out[y@which_drawn] <- as.numeric(y@imputations[m,])
return(out)
})
setMethod("complete", signature(y = "irrelevant", m = "integer"), def =
function(y, m, ...) {
return(y@raw_data)
})
setMethod("complete", signature(y = "categorical", m = "integer"), def =
function(y, m, to_factor = TRUE, ...) {
out <- y@data
if(m > 0 & y@n_drawn) out[y@which_drawn] <- as.numeric(y@imputations[m,])
if(to_factor) {
out <- factor(out, ordered = is(y, "ordered-categorical"))
levels(out) <- y@levels
}
return(out)
})
setMethod("complete", signature(y = "binary", m = "integer"), def =
function(y, m, to_factor = TRUE, ...) {
out <- y@data
if(m > 0 & y@n_drawn) out[y@which_drawn] <- as.numeric(y@imputations[m,])
if(to_factor) {
out <- factor(out, ordered = FALSE)
levels(out) <- y@levels
}
return(out)
})
setMethod("complete", signature(y = "continuous", m = "integer"), def =
function(y, m, transform = TRUE, ...) {
out <- y@data
if(m > 0 & y@n_drawn) out[y@which_drawn] <- as.numeric(y@imputations[m,])
if(transform) out <- y@inverse_transformation(out)
return(out)
})
setMethod("complete", signature(y = "nonnegative-continuous", m = "integer"), def =
function(y, m, transform = TRUE, ...) {
out <- y@data
if(m > 0 & y@n_drawn) out[y@which_drawn] <- as.numeric(y@imputations[m,])
if(transform) {
out <- y@inverse_transformation(out)
out[y@raw_data == 0] <- 0
}
return(out)
})
setMethod("complete", signature(y = "SC_proportion", m = "integer"), def =
function(y, m, transform = TRUE, ...) {
out <- y@data
if(m > 0 & y@n_drawn) out[y@which_drawn] <- as.numeric(y@imputations[m,])
if(transform) out <- y@inverse_transformation(out)
out[y@raw_data == 0] <- 0
out[y@raw_data == 1] <- 1
return(out)
})
setMethod("complete", signature(y = "missing_data.frame", m = "integer"), def =
function(y, m, to_matrix = FALSE, include_missing = TRUE) {
if(to_matrix) out <- sapply(y@variables, complete, m = m, to_factor = FALSE, transform = FALSE)
else out <- as.data.frame(lapply(y@variables, complete, m = m, to_factor = TRUE, transform = TRUE))
if(is(y, "allcategorical_missing_data.frame")) {
out <- cbind(out, latents = complete(y@latents, m = m, to_factor = !to_matrix))
}
if(include_missing) {
M <- is.na(y)[,!sapply(y@variables, FUN = function(y) y@all_obs), drop = FALSE]
colnames(M) <- paste("missing", colnames(M), sep = "_")
out <- cbind(out, M)
}
return(out)
})
setMethod("complete", signature(y = "mi", m = "numeric"), def =
function(y, m = length(y), to_matrix = FALSE, include_missing = TRUE) {
stopifnot(m == as.integer(m))
m <- as.integer(m)
l <- length(y@data)
draws <- sum(y@total_iters)
if(length(m) > 1) out <- lapply(y@data[m], complete, m = 0L, to_matrix = to_matrix, include_missing = include_missing)
else if(m == 1) out <- complete(y@data[[1]], m = 0L, to_matrix = to_matrix, include_missing = include_missing) # not a list
else if(m <= l) out <- lapply(y@data[1:m], complete, m = 0L, to_matrix = to_matrix, include_missing = include_missing)
else { # wants more completed datasets than chains
quotient <- m %/% l
remainder <- m %% l
num <- quotient + (1:l <= remainder)
out <- vector("list", m)
count <- 1
for(i in seq_along(y@data)) {
if(num[i] == 1) {
out[[count]] <- complete(y@data[[i]], m = 0L, to_matrix = to_matrix, include_missing = include_missing)
count <- count + 1
}
else { # double-dip from a chain
SEQ <- seq(from = ceiling(draws / 2), to = draws, length.out = num[i])
temp <- sapply(SEQ, FUN = function(j) complete(y@data[[i]], m = as.integer(j), to_matrix = to_matrix,
include_missing = include_missing), simplify = FALSE)
for(j in seq_along(temp)) {
out[[count]] <- temp[[j]]
count <- count + 1
}
}
}
}
return(out)
})
setMethod("complete", signature(y = "mi", m = "missing"), def =
function(y, to_matrix = FALSE, include_missing = TRUE) {
return(complete(y, m = length(y), to_matrix = to_matrix, include_missing = include_missing))
})
setMethod("complete", signature(y = "mi_list", m = "numeric"), def =
function(y, m = length(y[[1]]), to_matrix = FALSE, include_missing = TRUE) {
temp <- lapply(y, FUN = complete, m = m, to_matrix = to_matrix, include_missing = include_missing)
dfs <- temp[[1]]
if(length(m) == 1 && m == 1 && length(temp) > 1) for(i in 2:length(temp)) {
dfs <- rbind(dfs, temp[[i]])
}
else if(length(temp) > 1) for(i in 2:length(temp)) for(j in 1:length(dfs)) {
dfs[[j]] <- rbind(dfs[[j]], temp[[i]][[j]])
}
return(dfs)
})
setMethod("complete", signature(y = "mi_list", m = "missing"), def =
function(y, to_matrix = FALSE, include_missing = TRUE) {
return(complete(y, m = length(y[[1]]), to_matrix = to_matrix, include_missing = include_missing))
})
mi/R/hist_methods.R 0000644 0001762 0000144 00000025112 12513634171 013701 0 ustar ligges users # Part of the mi package for multiple imputation of missing data
# Copyright (C) 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Trustees of Columbia University
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
setMethod("hist", signature(x = "missing_variable"), def =
function(x, ...) {
y <- x@data
NAs <- is.na(x)
h_all <- hist(y, plot = FALSE)
plot(h_all, border = "lightgray", main = "", xlab = if(x@done) "Completed" else "Observed", axes = FALSE,
mgp = c(2, 1, 0), tcl = .05, col = if(x@done) "lightgray" else "blue", freq = TRUE, ...)
axis(1, lwd = 0)
axis(2)
if(x@done) {
h_obs <- hist(y[!NAs], breaks = h_all$breaks, plot = FALSE)
h_miss <- hist(y[NAs], breaks = h_all$breaks, plot = FALSE)
segments(h_obs$breaks[1], 0, y1 = h_obs$counts[1], col = "blue")
segments(h_miss$breaks[1], 0, y1 = h_miss$counts[1], col = "red")
segments(h_obs$breaks[1], y0 = h_obs$counts[1], x1 = h_obs$breaks[2], col = "blue")
segments(h_miss$breaks[1], y0 = h_miss$counts[1], x1 = h_miss$breaks[2], col = "red")
for(i in 2:(length(h_obs$breaks)-1)) {
segments(x0 = h_obs$breaks[i], y0 = h_obs$counts[i-1], y1 = h_obs$counts[i], col = "blue")
segments(x0 = h_miss$breaks[i], y0 = h_miss$counts[i-1], y1 = h_miss$counts[i], col = "red")
segments(x0 = h_obs$breaks[i], y0 = h_obs$counts[i], x1 = h_obs$breaks[i+1], col = "blue")
segments(x0 = h_miss$breaks[i], y0 = h_miss$counts[i], x1 = h_miss$breaks[i+1], col = "red")
}
segments(x0 = h_obs$breaks[i+1], y0 = h_obs$counts[i], y1 = 0, col = "blue")
segments(x0 = h_miss$breaks[i+1], y0 = h_miss$counts[i], y1 = 0, col = "blue")
if(.MI_DEBUG) stopifnot(all(h_all$counts == (h_obs$counts + h_miss$counts)))
}
return(invisible(NULL))
})
setMethod("hist", signature(x = "semi-continuous"), def =
function(x, ...) {
con <- complete(x@indicator, 0L) == 0
y <- x@data[con]
NAs <- is.na(x)[con]
h_all <- hist(y, plot = FALSE)
plot(h_all, freq = TRUE, border = "lightgray", main = "", xlab = if(x@done) "Completed" else "Observed", axes = FALSE,
mgp = c(2, 1, 0), tcl = .05, col = if(x@done) "lightgray" else "blue", xlim = range(x@data, na.rm = TRUE), ...)
axis(1, lwd = 0)
axis(2)
if(x@done) {
h_obs <- hist(y[!NAs], breaks = h_all$breaks, plot = FALSE)
h_miss <- hist(y[NAs], breaks = h_all$breaks, plot = FALSE)
segments(h_obs$breaks[1], 0, y1 = h_obs$counts[1], col = "blue")
segments(h_miss$breaks[1], 0, y1 = h_miss$counts[1], col = "red")
segments(h_obs$breaks[1], y0 = h_obs$counts[1], x1 = h_obs$breaks[2], col = "blue")
segments(h_miss$breaks[1], y0 = h_miss$counts[1], x1 = h_miss$breaks[2], col = "red")
for(i in 2:(length(h_obs$breaks)-1)) {
segments(x0 = h_obs$breaks[i], y0 = h_obs$counts[i-1], y1 = h_obs$counts[i], col = "blue")
segments(x0 = h_miss$breaks[i], y0 = h_miss$counts[i-1], y1 = h_miss$counts[i], col = "red")
segments(x0 = h_obs$breaks[i], y0 = h_obs$counts[i], x1 = h_obs$breaks[i+1], col = "blue")
segments(x0 = h_miss$breaks[i], y0 = h_miss$counts[i], x1 = h_miss$breaks[i+1], col = "red")
}
segments(x0 = h_obs$breaks[i+1], y0 = h_obs$counts[i], y1 = 0, col = "blue")
segments(x0 = h_miss$breaks[i+1], y0 = h_miss$counts[i], y1 = 0, col = "blue")
NAs <- is.na(x)[!con]
tab <- table(x@data[!con], NAs)
for(i in 1:NROW(tab)) {
segments(x0 = as.numeric(rownames(tab)[i]), y0 = 0, y1 = sum(tab[i,]), col = "lightgray", lty = "dashed")
segments(x0 = as.numeric(rownames(tab)[i]), y0 = 0, y1 = tab[i,1], col = "blue", lty = "dashed")
if(ncol(tab) == 2) segments(x0 = as.numeric(rownames(tab)[i]), y0 = 0, y1 = tab[i,2], col = "red", lty = "dashed")
}
if(.MI_DEBUG) stopifnot(all(h_all$counts == (h_obs$counts + h_miss$counts)))
}
else {
tab <- table(x@data[!con])
for(i in 1:NCOL(tab)) segments(x0 = as.numeric(names(tab)[i]), y0 = 0, y1 = tab[i], col = "blue", lty = "dashed")
}
return(invisible(NULL))
})
setMethod("hist", signature(x = "categorical"), def =
function(x, ...) {
y <- x@data
values <- sort(unique(y))
breaks <- c(min(values) - 0.5, values + 0.5)
values <- unique(y)
values <- sort(values[!is.na(values)])
breaks <- c(sapply(values, FUN = function(x) c(x - .25, x + .25)))
NAs <- is.na(x)
h_all <- hist(y, breaks, plot = FALSE)
# h_all$counts[h_all$counts == 0] <- NA_integer_
plot(h_all, border = "lightgray", axes = FALSE, main = "", xlab = if(x@done) "Completed" else "Observed",
mgp = c(2, 1, 0), tcl = .05, col = if(x@done) "lightgray" else "blue", freq = TRUE, ylim = range(h_all$counts, na.rm = TRUE), ...)
axis(1, at = values, labels = levels(x@raw_data), lwd = 0)
axis(2)
if(x@done) {
h_obs <- hist(y[!NAs], breaks, plot = FALSE)
h_miss <- hist(y[NAs], breaks, plot = FALSE)
counts_obs <- h_obs$counts
counts_obs <- counts_obs
counts_miss <- h_miss$counts
counts_miss <- counts_miss
segments(breaks[1], 0, y1 = counts_obs[1], col = "blue")
segments(breaks[1], 0, y1 = counts_miss[1], col = "red")
if(counts_obs[1]) segments(breaks[1], y0 = counts_obs[1], x1 = breaks[2], col = "blue")
if(counts_miss[1]) segments(breaks[1], y0 = counts_miss[1], x1 = breaks[2], col = "red")
for(i in 2:(length(breaks)-1)) {
segments(x0 = breaks[i], y0 = counts_obs[i-1], y1 = counts_obs[i], col = "blue")
segments(x0 = breaks[i], y0 = counts_miss[i-1], y1 = counts_miss[i], col = "red")
if(counts_obs[i]) segments(x0 = breaks[i], y0 = counts_obs[i], x1 = breaks[i+1], col = "blue")
if(counts_miss[i]) segments(x0 = breaks[i], y0 = counts_miss[i], x1 = breaks[i+1], col = "red")
}
segments(x0 = breaks[i+1], y0 = counts_obs[i], y1 = 0, col = "blue")
segments(x0 = breaks[i+1], y0 = counts_miss[i], y1 = 0, col = "red")
if(.MI_DEBUG) stopifnot(all(h_all$counts == (h_obs$counts + h_miss$counts)))
}
return(invisible(NULL))
})
setMethod("hist", signature(x = "binary"), def =
function(x, ...) {
y <- x@data
if(max(y, na.rm = TRUE) > 1) y <- y - 1L
values <- 0:1
breaks <- c(-.5, .5, 1.5)
breaks <- c(-.25, .25, .75, 1.25)
NAs <- is.na(x)
h_all <- hist(y, breaks, plot = FALSE)
# h_all$counts[h_all$counts == 0] <- NA_integer_
plot(h_all, border = "lightgray", axes = FALSE, main = "", xlab = if(x@done) "Completed" else "Observed",
mgp = c(2, 1, 0), tcl = .05, col = if(x@done) "lightgray" else "blue", freq = TRUE, ylim = range(h_all$counts, na.rm = TRUE), ...)
axis(1, at = values, lwd = 0)
axis(2)
if(x@done) {
h_obs <- hist(y[!NAs], breaks, plot = FALSE)
h_miss <- hist(y[NAs], breaks, plot = FALSE)
counts_obs <- h_obs$counts
counts_obs <- counts_obs
counts_miss <- h_miss$counts
counts_miss <- counts_miss
segments(breaks[1], 0, y1 = counts_obs[1], col = "blue")
segments(breaks[1], 0, y1 = counts_miss[1], col = "red")
if(counts_obs[1]) segments(breaks[1], y0 = counts_obs[1], x1 = breaks[2], col = "blue")
if(counts_miss[1]) segments(breaks[1], y0 = counts_miss[1], x1 = breaks[2], col = "red")
for(i in 2:(length(breaks)-1)) {
segments(x0 = breaks[i], y0 = counts_obs[i-1], y1 = counts_obs[i], col = "blue")
segments(x0 = breaks[i], y0 = counts_miss[i-1], y1 = counts_miss[i], col = "red")
if(counts_obs[i]) segments(x0 = breaks[i], y0 = counts_obs[i], x1 = breaks[i+1], col = "blue")
if(counts_miss[i]) segments(x0 = breaks[i], y0 = counts_miss[i], x1 = breaks[i+1], col = "red")
}
segments(x0 = breaks[i+1], y0 = counts_obs[i], y1 = 0, col = "blue")
segments(x0 = breaks[i+1], y0 = counts_miss[i], y1 = 0, col = "red")
if(.MI_DEBUG) stopifnot(all(h_all$counts == (h_obs$counts + h_miss$counts)))
}
return(invisible(NULL))
})
setMethod("hist", signature(x = "missing_data.frame"), def =
function(x, ask = TRUE, ...) {
k <- sum(!x@no_missing)
if (.Device != "null device" && x@done) {
oldask <- grDevices::devAskNewPage(ask = ask)
if (!oldask) on.exit(grDevices::devAskNewPage(oldask), add = TRUE)
op <- options(device.ask.default = TRUE)
on.exit(options(op), add = TRUE)
}
par(mfrow = n2mfrow(k))
for(i in 1:x@DIM[2]) {
if(x@no_missing[i]) next
hist(x@variables[[i]])
header <- x@variables[[i]]@variable_name
if(is(x@variables[[i]], "continuous")) {
trans <- .show_helper(x@variables[[i]])$transformation[1]
header <- paste("\n", header, " (", trans, ")", sep = "")
}
title(main = header)
}
return(invisible(NULL))
})
setMethod("hist", signature(x = "mdf_list"), def =
function(x, ask = TRUE, ...) {
if (.Device != "null device") {
oldask <- grDevices::devAskNewPage(ask = ask)
if (!oldask) on.exit(grDevices::devAskNewPage(oldask), add = TRUE)
op <- options(device.ask.default = ask)
on.exit(options(op), add = TRUE)
}
sapply(x, FUN = hist, ...)
return(invisible(NULL))
})
setMethod("hist", signature(x = "mi"), def =
function(x, m = 1:length(x), ask = TRUE, ...) {
for(i in m) hist(x@data[[i]], ask = ask, ...)
return(invisible(NULL))
})
setMethod("hist", signature(x = "mi_list"), def =
function(x, m = 1:length(x), ask = TRUE, ...) {
if (.Device != "null device") {
oldask <- grDevices::devAskNewPage(ask = ask)
if (!oldask) on.exit(grDevices::devAskNewPage(oldask), add = TRUE)
op <- options(device.ask.default = ask)
on.exit(options(op), add = TRUE)
}
sapply(x, FUN = hist, m = m, ask = ask, ...)
return(invisible(NULL))
})
mi/R/get_parameters.R 0000644 0001762 0000144 00000005206 12513634171 014213 0 ustar ligges users # Part of the mi package for multiple imputation of missing data
# Copyright (C) 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Trustees of Columbia University
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
## these extract parameters from an estimated object
setMethod("get_parameters", signature(object = "ANY"), def =
function(object, ...) {
return(c(coef(object)))
})
setOldClass("polr")
setMethod("get_parameters", signature(object = "polr"), def =
function(object, ...) {
return(c(coef(object), object$zeta))
})
setOldClass("multinom")
setMethod("get_parameters", signature(object = "multinom"), def =
function(object, ...) {
return(c(t(coef(object))))
})
setMethod("get_parameters", signature(object = "missing_variable"), def =
function(object, latest = FALSE, ...) {
if(latest) {
if(is.logical(latest)) {
mark <- !apply(object@parameters, 1, FUN = function(x) any(is.na(x)))
mark <- mark[length(mark)]
}
else mark <- latest
return(object@parameters[mark,])
}
else return(object@parameters)
})
setMethod("get_parameters", signature(object = "missing_data.frame"), def =
function(object, latest = FALSE, ...) {
mini_list <- lapply(object@variables, get_parameters, latest = latest, ...)
out <- matrix(NA_real_, nrow(mini_list[[1]]), ncol = 0)
for(i in seq_along(mini_list)) out <- cbind(out, mini_list[[i]])
return(out)
})
setMethod("get_parameters", signature(object = "mi"), def =
function(object, latest = FALSE, ...) {
mini_list <- lapply(object@data, get_parameters, latest = latest, ...)
dims <- dim(mini_list)
out <- array(NA_real_, c(dims[1], length(mini_list), dims[2]),
dimnames = list(NULL, NULL, colnames(mini_list[[1]])))
for(i in 1:NCOL(out)) out[,i,] <- mini_list[[i]]
return(out)
})
setMethod("get_parameters", signature(object = "mi_list"), def =
function(object, latest = FALSE, ...) {
lapply(object, get_parameters, latest = latest, ...)
})
mi/R/random_df.R 0000644 0001762 0000144 00000057612 12513723170 013150 0 ustar ligges users # Part of the mi package for multiple imputation of missing data
# Copyright (C) 2008, 2009, 2010, 2011 Andrew Gelman
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
## Function to draw from the relevant symmetric generalized beta distribution
.rgbeta <-
function(num, shape) {
if(shape > 0) -1 + 2 * rbeta(num, shape, shape)
else if(shape == 0) -1 + 2 * rbinom(num, 1, 0.5)
else stop("shape must be non-negative")
}
## Function to draw a Cholesky factor of a random correlation matrix
## as a function of canonical partial correlations (CPCs)
.rcorvine <-
function(n_full, n_partial, n_cat, eta,
restrictions, strong, experiment, treatment_cor, last_CPC) {
nom <- !is.null(n_cat)
n <- n_full + 2 * n_partial
if(nom) {
n <- n + sum(n_cat) - length(n_cat)
nc <- 2 * n_partial + sum(n_cat) - length(n_cat)
holder <- matrix(NA_real_, nrow = nc, ncol = nc)
}
else holder <- matrix(NA_real_, nrow = 2 * n_partial, ncol = 2 * n_partial)
count <- 1
if(eta <= 0) stop("'eta' must be positive")
alpha <- eta + (n - 2) / 2 # if eta == 1, then tcrossprod(L) is uniform over correlation matrices
L <- matrix(NA_real_, n, n)
L[upper.tri(L)] <- 0
L[1,1] <- 1
L <- .rcorvine_helper(L, holder, n_full, n_partial, n_cat, alpha, restrictions,
strong, experiment, treatment_cor, nom, last_CPC)
if(restrictions == "MARish") L <- .MAR_opt(L, n_full, n_partial)
return(L)
}
.rcorvine_helper <-
function(L, holder, n_full, n_partial, n_cat, alpha, restrictions,
strong, experiment, treatment_cor, nom, last_CPC) {
n <- nrow(L)
mark <- is.na(L[,1])
sum_mark <- sum(mark)
CPCs <- .rgbeta(sum(mark), alpha)
count <- 1
if(experiment) {
len <- length(treatment_cor)
if(len == 1 && treatment_cor == 0) treatment_cor <- rep(0, sum_mark)
else if(len != sum_mark) {
stop(paste("length of 'treatment_cor' must be", sum_mark))
}
treatment_mark <- is.na(treatment_cor)
treatment_cor[treatment_mark] <- CPCs[treatment_mark]
CPCs <- treatment_cor # treatment variable is first
}
if(n_full == 0 && restrictions != "none") {
CPCs[sum_mark:(sum_mark - n_partial)] <- 0
holder[1,mark] <- CPCs
count <- count + 1
}
else if(n_full == 0) {
holder[1,mark] <- CPCs
count <- count + 1
}
L[mark,1] <- CPCs
W <- log(1 - CPCs^2)
## NOTE: order of variables is:
## all fully observed (with the treatment first if applicable)
## all partially observed but not nominal variables (if any)
## the components of the nominal variable(s) (if any)
## all missingness indicators
# fully observed variables have arbitrary CPCs
start <- 2
end <- n_full
if(n_full >= 2) for(i in start:end) {
L[i,i] <- exp(0.5 * W[i-1])
gap <- which(is.na(L[,i]))
gap1 <- gap - 1
alpha <- alpha - 0.5
CPCs <- .rgbeta(length(gap), alpha)
if(restrictions == "MCAR") CPCs[length(gap):(length(gap) - n_partial + 1)] <- 0
L[gap,i] <- CPCs * exp(0.5 * W[gap1])
W[gap1] <- W[gap1] + log(1 - CPCs^2)
}
# partially observed variables have arbitrary CPCs among themselves
# but are conditionally uncorrelated with all missingness indicators under MAR
# note: we condition on all fully observed variables and all previous partially observed variables
# this triangle scheme implies that the errors when predicting the partially observed variables are uncorrelated
if(n_full >= 2) start <- end + 1
end <- start + n_partial - 1
if(nom) end <- end - length(n_cat)
if(start <= end) for(i in start:end) {
L[i,i] <- exp(0.5 * W[i-1])
gap <- which(is.na(L[,i]))
gap1 <- gap - 1
alpha <- alpha - 0.5
CPCs <- .rgbeta(length(gap), alpha)
if(restrictions %in% c("triangular", "stratified")) {
CPCs[length(gap):(length(gap) - n_partial + 1)] <- 0
if(i == end && !is.na(last_CPC)) CPCs[length(CPCs)] <- last_CPC
}
else if(restrictions == "MCAR") CPCs[] <- 0
L[gap,i] <- CPCs * exp(0.5 * W[gap1])
W[gap1] <- W[gap1] + log(1 - CPCs^2)
holder[count,(count+1):ncol(holder)] <- CPCs
count <- count + 1
}
# if there are nominal partially observed variables, make the category residuals uncorrelated (MNL assumption)
if(nom) {
#if(n_full >= 2)
start <- end + 1
end <- start + sum(n_cat) - 1
for(i in start:end) {
L[i,i] <- exp(0.5 * W[i-1])
gap <- which(is.na(L[,i]))
gap1 <- gap - 1
alpha <- alpha - 0.5
if(restrictions != "none") CPCs <- rep(0, length(gap))
else {
CPCs <- .rgbeta(length(gap), alpha)
CPCs[-(length(gap):(length(gap) - n_partial + 1))] <- 0
}
L[gap,i] <- CPCs * exp(0.5 * W[gap1])
W[gap1] <- W[gap1] + log(1 - CPCs^2)
holder[count,(count+1):ncol(holder)] <- CPCs
count <- count + 1
}
}
# missingness indicators can be constructed to be instruments if MAR holds whose strength can be manipulated
if(n_partial > 1) {
start <- end + 1
end <- n - 1
count <- if(n_full > 0) 1 else 2
if(start <= end) for(i in start:end) {
L[i,i] <- exp(0.5 * W[i-1])
gap <- which(is.na(L[,i]))
gap1 <- gap - 1
alpha <- alpha - 0.5
if(restrictions %in% c("none", "MARish")) CPCs <- .rgbeta(length(gap), alpha)
else if(restrictions %in% c("stratified", "MCAR")) CPCs <- rep(0, length(gap))
else if(strong == 2) CPCs <- holder[count,(count+1):(count + length(gap))]
else if(strong == 1) CPCs <- .rgbeta(length(gap), alpha)
else if(strong == 0) CPCs <- rep(0, length(gap))
L[gap,i] <- CPCs * exp(0.5 * W[gap1])
W[gap1] <- W[gap1] + log(1 - CPCs^2)
}
}
L[n,n] <- exp(0.5 * W[n-1])
return(L)
}
## Function to draw a Cholesky factor of a random correlation matrix
## as a function of canonical partial correlations (CPCs)
.rcorvine_partial <-
function(Sigma, n_partial, n_cat, eta, restrictions, strong, experiment, treatment_cor) {
n <- nrow(Sigma)
n_full <- n - n_partial
ldlt <- LDLt(Sigma)
U <- t(ldlt$L)
holder <- matrix(NA_real_, n, n)
holder[1,-1] <- U[1,-1]
W <- c(NA_real_, 1 - holder[1,-1]^2)
for(i in 2:(n-1)) {
denominator <- W[i]
gap <- (i+1):n
temp <- U[i,gap] / sqrt(W[gap] / denominator)
invalid <- is.na(temp)
temp[invalid] <- sign(U[i,gap][invalid])
invalid <- abs(temp) > 1
temp[invalid] <- sign(temp[invalid])
holder[i,gap] <- temp
W[gap] <- W[gap] * (1 - holder[i,gap]^2)
}
nom <- !is.null(n_cat)
n <- n + n_partial
if(nom) {
n <- n + sum(n_cat) - length(n_cat)
}
L <- t(U) * sqrt(diag(ldlt$D))
if(eta <= 0) stop("'eta' must be positive")
diff <- n - nrow(L)
alpha <- eta + diff / 2
L <- cbind(L, matrix(0, nrow(L), diff))
L <- rbind(L, matrix(NA_real_, diff, n))
L[upper.tri(L)] <- 0
holder <- matrix(NA_real_, nrow = diff, ncol = ncol(L))
L <- .rcorvine_helper(L, holder, n_full, n_partial, n_cat, alpha, restrictions, strong, experiment, treatment_cor, nom)
if(restrictions == "MARish") L <- .MAR_opt(L, n_full, n_partial)
return(L)
}
.MAR_opt <-
function(L, n_full, n_partial) {
n_p2 <- n_partial^2
lowers <- lower.tri(L)
cell_mark <- tail(which(lowers), n_p2)
lowers[] <- FALSE
lowers[cell_mark] <- TRUE
row_mark <- which(apply(lowers, 1, any))
diag(L)[row_mark] <- NA_real_
partials <- (n_full + 1):(n_full + n_partial)
missingness <- nrow(L):(nrow(L) - n_partial + 1)
block_mark <- which( row(L) %in% partials & col(L) %in% missingness )
foo <- function(theta) {
L[cell_mark] <- theta
diags <- 1 - rowSums(L[row_mark,,drop=FALSE]^2, na.rm = TRUE)
if(any(diags < 0)) return(NA_real_)
diag(L)[row_mark] <- sqrt(diags)
Sigma_inv <- chol2inv(t(L))
return(c(crossprod(Sigma_inv[block_mark])))
}
opt <- optim(L[cell_mark], foo, method = "BFGS")
L[cell_mark] <- opt$par
diag(L)[row_mark] <- sqrt(1 - rowSums(L[row_mark,,drop=FALSE]^2, na.rm = TRUE))
return(L)
}
.NMARness <-
function(L) {
xs <- grep("^x_", rownames(L), value = TRUE)
ys <- grep("^y_", rownames(L), value = TRUE)
us <- grep("^u_", rownames(L), value = TRUE)
sapply(ys, FUN = function(y) {
i <- y
sapply(us, FUN = function(u) {
j <- u
cons <- c(xs, setdiff(us, u))
mark <- c(i,j,cons)
D_ijcons <- det(tcrossprod(L[mark,,drop = FALSE]))
mark <- cons
D_cons <- det(tcrossprod(L[mark,,drop = FALSE]))
mark <- c(i,cons)
D_icons <- det(tcrossprod(L[mark,,drop = FALSE]))
mark <- c(j,cons)
D_jcons <- det(tcrossprod(L[mark,,drop = FALSE]))
return(1 - D_ijcons * D_cons / (D_icons * D_jcons))
})
})
}
## Function to construct a random data.frame with tunable missingness
rdata.frame <-
function(N = 1000, restrictions = c("none", "MARish", "triangular", "stratified", "MCAR"),
last_CPC = NA_real_, strong = FALSE, pr_miss = .25,
Sigma = NULL, alpha = NULL,
experiment = FALSE, treatment_cor = c(rep(0, n_full - 1), rep(NA, 2 * n_partial)),
n_full = 1, n_partial = 1, n_cat = NULL,
eta = 1, df = Inf, types = "continuous", estimate_CPCs = TRUE) {
if(length(N) != 1) stop("length of 'N' must be 1")
if(N <= 0) stop("'N' must be positive")
restrictions <- match.arg(restrictions)
if(strong && restrictions == "none") warning("instruments are not valid unless the MAR assumption is enforced")
if(n_full < 0) stop("'n_full' must be >= 0")
if(n_partial < 0) stop("'n_partial must be >= 0")
n <- n_partial + n_full
if(n == 0) stop("at least one of 'n_full' or 'n_partial' must be positive")
if(length(pr_miss) == 1) pr_miss <- rep(pr_miss, n_partial)
if(any(pr_miss <= 0)) stop("all elements of 'pr_miss' must be > 0")
if(any(pr_miss >= 1)) stop("all elements of 'pr_miss' must be < 1")
if(length(df) != 1) stop("'df' must be of length 1")
if(df <= 0) stop("'df' must be a positive")
if(length(types) == 1) types <- rep(types, n)
types <- match.arg(types, c("continuous", "count", "binary", "treatment", "ordinal", "nominal", "proportion", "positive"),
several.ok = TRUE)
if(any(types[1:n_full] == "nominal")) {
warning("fully observed nominal variables not supported, changing them to ordinal without loss of generality")
types <- ifelse(types == "nominal" & 1:length(types) <= n_full, "ordinal", types)
}
# else if(!is.null(n_cat)) types[n:(n - length(n_cat) + 1)] <- "nominal"
if(all( c("ordinal", "nominal") %in% types[-(1:n_full)] )) {
stop("including both ordinal and nominal partially observed variables is not supported yet")
}
if(any(types == "nominal")) {
has_nominal <- TRUE
if(is.null(n_cat)) {
if(types[n] != "nominal") {
warning("assuming the last partially observed variable is nominal with 3 categories")
types[n] <- "nominal"
}
n_cat <- 3
}
}
else has_nominal <- FALSE
if(has_nominal) {
if(any(n_cat < 3)) stop("nominal variables must have more than 2 categories")
types <- c(types[types != "nominal"], types[types == "nominal"])
}
if(experiment) {
if(types[1] != "treatment") stop("the first variable must be the treatment variable")
if(any(types[-1] == "treatment")) stop("only one treatment variable is permitted")
}
if(is.null(Sigma)) L <- .rcorvine(n_full, n_partial,
if(has_nominal) n_cat else NULL,
eta, restrictions, strong, experiment, treatment_cor, last_CPC)
else {
if(!isSymmetric(Sigma)) stop("'Sigma' must be symmetric")
if(ncol(Sigma) != (n_full + 2 * n_partial)) stop("'Sigma' must be of order 'n_full + 2 * n_partial'")
if(any(types == "nominal")) stop("nominal variables not supported when 'Sigma' is given")
if(experiment) stop("treatment variables not supported when 'Sigma' is given")
L <- chol(Sigma)
}
if(is.null(alpha)) {
Z <- matrix(rnorm(N * nrow(L)), nrow = nrow(L))
X <- as.data.frame(t(Z) %*% t(L))
}
else {
if(length(alpha) == 1 && is.na(alpha)) alpha <- rt(ncol(L), df)
else if(length(alpha) != ncol(L)) stop(paste("length of alpha must be", ncol(L)))
Sigma <- tcrossprod(L)
result <- find_Omega(Sigma, alpha, control = list(maxit = 1000))
X <- as.data.frame(sn::rmsn(N, Omega = result$Omega, alpha = alpha))
}
if(df < Inf) X <- X / sqrt(rchisq(N, df) / df)
if(!has_nominal) colnames(X) <- c(if(n_full) paste("x", 1:n_full, sep = "_"),
if(n_partial) paste("y", 1:n_partial, sep = "_"),
if(n_partial) paste("u", 1:n_partial, sep = "_") )
else {
if(length(n_cat) > 23) stop("number of nominal variables must be <= 23")
cn <- as.character(NULL)
for(i in seq_along(n_cat)) cn <- c(cn, paste(letters[i], 1:n_cat[i], sep = "_"))
colnames(X) <- c(if(n_full) paste("x", 1:n_full, sep = "_"), if(n_partial > length(n_cat))
paste("y", 1:(n_partial - length(n_cat)), sep = "_") else NULL,
cn,
paste("u", 1:n_partial, sep = "_") )
}
if(experiment) {
row_mark <- X[,1] == 1
col_mark <- c(FALSE, is.na(treatment_cor))
col_mark[grepl("^u_", colnames(X))] <- FALSE
if(any(col_mark)) X[row_mark,col_mark] <- X[row_mark,col_mark] + 1 # ATT
}
X_obs <- X
correlations <- rep(NA_real_, if(!has_nominal) n_partial else n_partial - length(n_cat) + sum(n_cat))
end <- n_partial - length(n_cat) * has_nominal
if(end > 0) for(i in 1:end) {
y_var <- paste("y", i, sep = "_")
u_var <- paste("u", i, sep = "_")
X_obs[X[,u_var] < quantile(X[,u_var], probs = pr_miss[i]), y_var] <- NA_real_
X_obs[[u_var]] <- NULL
if(!estimate_CPCs) next
f_miss <- colnames(X)
if(n_full > 0) f_miss <- f_miss[1:(n_full + i - 1)]
else f_miss <- "1"
f_miss <- paste(f_miss, collapse = " + ")
f_miss <- as.formula(paste(u_var, "~", f_miss))
ols_u <- lm(f_miss, data = X)
f_true <- colnames(X)
if(n_full > 0) f_true <- f_true[1:(n_full + i - 1)]
else f_true <- "1"
f_true <- paste(f_true, collapse = " + ")
f_true <- as.formula(paste(y_var, "~", f_true))
ols_y <- lm(f_true, data = X)
correlations[i] <- cor(residuals(ols_u), residuals(ols_y)) # this differs only randomly from 0 under MAR due to finite N
}
letter_mark <- 1
if(has_nominal) for(i in (end + 1):n_partial) {
y_var <- paste("y", i, sep = "_")
u_var <- paste("u", i, sep = "_")
mark <- grepl(paste("^", letters[letter_mark], "_", sep = ""), colnames(X))
lev <- as.character(NULL)
for(j in 1:ceiling(n_cat[letter_mark] / 26)) lev <- c(lev, rep(letters, each = j))
lev <- lev[1:n_cat[letter_mark]]
X_obs[[y_var]] <- X[[y_var]] <- factor(max.col(X[,mark]), labels = lev)
X_obs[X[,u_var] < quantile(X[,u_var], probs = pr_miss[i]), y_var] <- NA
if(!estimate_CPCs) {
letter_mark <- letter_mark + 1
next
}
f_miss <- colnames(X)
if(letter_mark == 1) f_miss <- f_miss[1:(n_full + n_partial - length(n_cat))]
else f_miss <- f_miss[1:(n_full + n_partial - length(n_cat) + sum(n_cat[1:(letter_mark - 1)]))]
f_miss <- paste(f_miss, collapse = " + ")
f_miss <- as.formula(paste(u_var, "~", f_miss))
ols_u <- lm(f_miss, data = X)
for(j in 1:n_cat[letter_mark]) {
f_true <- colnames(X)
if(letter_mark == 1) f_true <- f_true[1:(n_full + n_partial - length(n_cat))]
else f_true <- f_true[1:(n_full + n_partial - length(n_cat) + sum(n_cat[1:(letter_mark - 1)]))]
f_true <- paste(f_true, collapse = " + ")
n_var <- paste(letters[letter_mark], j, sep = "_")
f_true <- as.formula(paste(n_var, "~", f_true))
ols_n <- lm(f_true, data = X)
correlations[which(is.na(correlations))[1]] <- cor(residuals(ols_u), residuals(ols_n)) # this differs only randomly from 0 under MAR
}
letter_mark <- letter_mark + 1
}
if(!has_nominal) names(correlations) <- if(n_partial) paste("e", 1:n_partial, sep = "_") else NULL
else {
cn <- if(n_partial > length(n_cat)) paste("e", 1:(n_partial - length(n_cat)), sep = "_") else as.character(NULL)
for(i in seq_along(n_cat)) cn <- c(cn, paste("e:", letters[i], "_", 1:n_cat[i], sep = ""))
names(correlations) <- cn
}
X_obs <- X_obs[,grepl("^[xy]_", colnames(X_obs))]
mark_ord <- 1
for(i in seq_along(types)) {
mark <- is.na(X_obs[,i])
if(types[i] %in% c("binary", "treatment")) {
if(i == 1 && experiment) {
X_obs[,i] <- X[,i] <- as.factor(X[,i] > 0)
colnames(X_obs)[1] <- colnames(X)[1] <- "treatment"
}
else {
X[[toupper(colnames(X)[i])]] <- X[,i]
X_obs[,i] <- X[,i] <- cut(X[,i], breaks = 2, labels = c("FALSE", "TRUE"))
}
}
else if(types[i] == "ordinal") {
X[[toupper(colnames(X)[i])]] <- X[,i]
breaks <- 3
if(length(n_cat) == 1) breaks <- n_cat
else if(length(n_cat) > 1) {
breaks <- n_cat[mark_ord]
mark_ord <- mark_ord + 1
}
qs <- quantile(X[,i], prob = seq(from = 0, to = 1, length.out = breaks + 1))
qs[1] <- -Inf
qs[length(qs)] <- Inf
X_obs[,i] <- X[,i] <- cut(X[,i], breaks = qs, ordered_result = TRUE, labels = LETTERS[1:breaks])
}
else if(types[i] == "count") { # this is not quite consistent with the DGP
X[[toupper(colnames(X)[i])]] <- X[,i]
X_obs[,i] <- X[,i] <- as.integer(qpois(pt(X[,i], df = df), lambda = 5))
}
else if(types[i] == "proportion") { # this is not quite consistent with the DGP
X[[toupper(colnames(X)[i])]] <- X[,i]
X_obs[,i] <- X[,i] <- pt(X[,i], df = df)
}
else if(types[i] == "positive") {
X[[toupper(colnames(X)[i])]] <- X[,i]
X_obs[,i] <- X[,i] <- exp(X[,i])
}
X_obs[mark,i] <- NA
}
ord <- c(colnames(X_obs), grep("^u_", colnames(X), value = TRUE))
extras <- colnames(X)
extras <- extras[!(extras %in% ord)]
ord <- c(ord, extras)
X <- X[,ord]
cn <- colnames(X)
cn <- cn[sapply(1:ncol(X), FUN = function(i) {
!is.factor(X[,i]) && !(toupper(cn[i]) %in% cn[-i])
})]
resort <- function(s) {
ord <- order(as.integer(gsub("^[a-z,A-Z]_", "", s)))
return(s[ord])
}
cn <- c(if(experiment) "treatment_propensity",
resort(grep("^x", cn, ignore.case = TRUE, value = TRUE)),
resort(grep("^y", cn, ignore.case = TRUE, value = TRUE)),
grep("^[a-t]_", cn, ignore.case = FALSE, value = TRUE),
grep("^u", cn, ignore.case = FALSE, value = TRUE))
rownames(L) <- colnames(L) <- cn
out <- list(true = X, obs = X_obs, empirical_CPCs = correlations, L = L)
if(!is.null(alpha)) out <- c(out, list(alpha = alpha, skewness = result$sn_skewness, kurtosis = result$sn_kurtosis))
return(out)
}
## this function makes a positive definite correlation matrix given choose(n,2) unbounded parameters
make_O.cor <-
function(theta) {
n <- (1 + sqrt(1 + 8 * length(theta))) / 2
CPCs <- exp(2 * theta)
CPCs <- (CPCs - 1) / (CPCs + 1)
L <- matrix(0, n, n)
L[1,1] <- 1
start <- 1
end <- n - 1
L[-1,1] <- partials <- CPCs[start:end]
W <- log(1 - partials^2)
for(i in 2:(n-1)) {
start <- end + 1
end <- start + n - i - 1
gap <- (i+1):n
gap1 <- i:(n-1)
partials <- CPCs[start:end]
L[i,i] <- exp(0.5 * W[i-1])
L[gap,i] <- partials * exp(0.5 * W[gap1])
W[gap1] <- W[gap1] + log(1 - partials^2)
}
L[n,n] <- exp(0.5 * W[n-1])
return(tcrossprod(L))
}
## this objective function is the Frobenius norm of the difference between Sigma and Sigma_proposed
fmin <-
function(theta, Sigma, alpha, final = FALSE, ...) {
n <- nrow(Sigma)
omega <- exp(theta[1:n]) # standard deviations of the implicit Omega matrix
O.cor <- make_O.cor(theta[-(1:n)])
alphaTO.cor <- alpha %*% O.cor
Sigma_proposed <- ( O.cor - 2 / (pi * c(1 + alphaTO.cor %*% alpha)) * crossprod(alphaTO.cor) ) * tcrossprod(omega)
if(final) return(Sigma_proposed)
return(crossprod( c(Sigma - Sigma_proposed) )[1])
}
## this function makes a 3-factor Cholesky factorization of a PSD A matrix
LDLt <-
function(A) {
n <- nrow(A)
L <- diag(n)
D <- matrix(0, n, n)
for(j in 1:n) {
s <- 0
if(j > 1) for(k in 1:(j-1)) s <- s + L[j,k]^2 * D[k,k]
D[j,j] <- A[j,j] - s
if(D[j,j] < 1e-15) {
D[j,j] <- 0
break
}
if(j < n) for(i in (j+1):n) {
s <- 0
if(j > 1) for(k in 1:(j-1)) s <- s + L[i,k] * L[j,k] * D[k,k]
L[i,j] <- (A[i,j] - s) / D[j,j]
}
}
return(list(L = L, D = D))
}
## this function makes plausible starting values (basically treating alpha is if it were a zero vector)
make_start <-
function(Sigma) {
log_omega <- log(sqrt(diag(Sigma)))
Sigma <- cov2cor(Sigma)
n <- nrow(Sigma)
U <- t(LDLt(Sigma)$L)
holder <- matrix(NA_real_, n, n)
holder[1,-1] <- U[1,-1]
W <- c(NA_real_, 1 - holder[1,-1]^2)
for(i in 2:(n-1)) {
denominator <- W[i]
gap <- (i+1):n
temp <- U[i,gap] / sqrt(W[gap] / denominator)
invalid <- is.na(temp)
temp[invalid] <- sign(U[i,gap][invalid])
invalid <- abs(temp) > 1
temp[invalid] <- sign(temp[invalid])
holder[i,gap] <- temp
W[gap] <- W[gap] * (1 - holder[i,gap]^2)
}
holder <- t(holder)
CPCs <- holder[lower.tri(holder)]
return(c(log_omega, atanh(CPCs)))
}
## this function finds Omega via optim() and returns it as part of a list with
find_Omega <-
function(Sigma, alpha, method = "BFGS", start = make_start(Sigma), ...) {
stopifnot(isSymmetric(Sigma)) # Sigma is the intended covariance matrix of the multivariate skew-normal variable
stopifnot(all(eigen(Sigma, TRUE, TRUE)$values > 0))
n <- nrow(Sigma)
alpha <- c(alpha)
stopifnot(length(alpha) == n) # alpha is a shape parameter for the multivariate skew-normal variable
opt <- optim(start, fmin, method = method, Sigma = Sigma, alpha = alpha, ...)
if(opt$convergence != 0) {
gradients <- opt$counts["gradient"]
warning(paste("Convergence problem. Pass something like 'control = list(maxit = ",
5 * gradients, ")' if alpha is far from a zero vector", sep = ""))
}
theta <- opt$par
omega <- exp(theta[1:n])
O.cor <- make_O.cor(theta[-(1:n)])
opt$Omega <- O.cor * tcrossprod(omega)
alphaTO.cor <- c(alpha %*% O.cor)
delta <- c( (O.cor %*% alpha) / sqrt(1 + alphaTO.cor %*% alpha)[1] )
mu_z <- sqrt(2/pi) * delta
num <- c( mu_z %*% chol2inv(chol(O.cor)) %*% mu_z )
opt$delta <- delta
opt$sn_skewness <- ( (4 - pi) / 4 )^2 * ( num / (1 - num) )^3
opt$sn_kurtosis <- 2 * (pi - 3) * ( num / (1 - num) )^2
return(opt)
}
mi/R/AllClass.R 0000644 0001762 0000144 00000157645 12513727705 012734 0 ustar ligges users # Part of the mi package for multiple imputation of missing data
# Copyright (C) 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Trustees of Columbia University
# Copyright (C) 1995-2012 The R Core Team
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
## NOTE: If you change something here, also update the UML graph thingie
setClassUnion("MatrixTypeThing", c("matrix"))
setOldClass("family")
suppressWarnings(setClassUnion("WeAreFamily", c("family", "character"))) # arm + lme4 = warnings
setOldClass("mi_list")
setOldClass("mdf_list")
.known_imputation_methods <- c("ppd", "pmm", "mean", "median", "expectation", "mode", "mcar", NA_character_)
.known_families <- c("binomial", "gaussian", "Gamma", "inverse.gaussian", "poisson",
"quasibinomial", "quasipoisson") # "quasi" is not supported at the moment (FIXME)
.known_links <- c("logit", "probit", "cauchit", "log", "cloglog", # for binomial()
"identity", "inverse", # for gaussian() plus "log",
# "inverse", "identity", "log", # for Gamma()
"sqrt", # for poisson() plus "log", and "identity",
"1/mu^2") # for inverse.gaussian() plus "inverse", "identity" and "log"
# An important class in library(mi) is the missing_variable class, which is a virtual class
# for a variable that may (or may not) have missingness. The usual types of variables that
# we are interested in imputing all inherit (perhaps indirectly) from the missing_variable
# superclass, e.g. continuous, binary, etc. In principle, these class definitions should
# provide ALL the necessary information for that variable, like the extent of its missingness
# and how the missing values will be (or have been) imputed. Thus, in principle, it should
# be possible to tweak the behavior of library(mi) simply by 1) creating a new class that
# inherits from the relevant existing class, 2) writing methods for the mi() and fit_model()
# generics and 3) perhaps a few other things that you will have to discover on your own.
## missing_variable is a virtual class for a variable that may (or may not) have missingness
setClass("missing_variable",
representation(
variable_name = "character", # name of the variable but do not rely on for anything important
raw_data = "ANY", ## DO NOT EVER CHANGE THE VALUES OF THIS SLOT
data = "ANY", ## Copy the raw_data into data and modify data as necessary
n_total = "integer", # total number of potential datapoints, i.e. length of raw_data
all_obs = "logical", # are ALL datapoints actually observed, i.e. not missing?
n_obs = "integer", # number of observed datapoints
which_obs = "integer", # which datapoints are observed
all_miss = "logical", # are ALL datapoints missing, only true for latent variables
n_miss = "integer", # number of missing datapoints in the data slot (originally)
which_miss = "integer", # which datapoints are missing in the data slot
n_extra = "integer", # number of extra datapoints added (as missing)
which_extra = "integer", # which datapoints are extras
n_unpossible = "integer", # number of datapoints for which the variable could not be observed
which_unpossible = "integer",# which datapoints could not be observed
n_drawn = "integer", # number of datapoints to impute
which_drawn = "integer", # which datapoints are imputed
imputation_method = "character", # how to impute them
family = "WeAreFamily", # see help(family)
known_families = "character",# families listed on help(family) plus multinomial()
known_links = "character", # see help(family)
imputations = "MatrixTypeThing", # iterations x n_drawn matrix of imputation history
done = "logical", # are we finished imputing?
parameters = "MatrixTypeThing", # history of estimated parameters in modeling this variable
model = "ANY", # last model fit
fitted = "ANY", # last fitted values
"VIRTUAL"),
prototype(
variable_name = NA_character_,
imputations = matrix(NA_real_, 0, 0),
parameters = matrix(NA_real_, 0, 0),
imputation_method = .known_imputation_methods,
family = NA_character_,
known_families = .known_families,
known_links = .known_links
),
validity = function(object) {
out <- TRUE
l <- length(object@raw_data)
if(l == 0) return(out)
if(sum(-object@n_total, object@n_obs, object@n_miss, object@n_extra, object@n_unpossible, na.rm = TRUE)) {
out <- paste(object@variable_name,
": slots 'n_obs', 'n_miss', 'n_extra', and 'n_unpossible' must sum to 'n_total'")
}
else if(!(length(object@which_obs) %in% c(0:1, object@n_obs))) {
out <- paste(object@variable_name, ": 'n_obs' must equal the length of 'which_obs'")
}
else if(!(length(object@which_miss) %in% c(0:1, object@n_miss))) {
out <- paste(object@variable_name, ": 'n_miss' must equal the length of 'which_miss'")
}
else if(!(length(object@which_extra) %in% c(0:1, object@n_extra))) {
out <- paste(object@variable_name, ": 'n_extra' must equal the length of 'which_extra'")
}
else if(!(length(object@which_extra) %in% c(0:1, object@n_unpossible))) {
out <- paste(object@variable_name, ": 'n_unpossible' must equal the length of 'which_unpossible'")
}
else if(sum(object@n_obs)) {
temp <- sort(c(object@which_obs, object@which_miss, object@which_extra, object@which_unpossible))
names(temp) <- NULL
if(!identical(1:object@n_total, temp)) {
out <- paste(object@variable_name, ": ''which_*' slots must be mutually exclusive and exhaustive")
}
}
for(i in slotNames(object)) {
if(i %in% c("raw_data", "data", "which_obs", "which_miss", "which_extra",
"which_unpossible", "which_drawn", "imputation_method", "known_transformations",
"family", "known_families", "known_links", "levels", "cutpoints")) next
if((l <- length(slot(object, i))) > 1) {
out <- paste(object@variable_name, ": length of", i, "must be 0 or 1 but is", l)
break
}
}
return(out)
}
)
## this initialize() method gets called for everything that inherits from missing_variable
## but can be modified by a subsequently-called initialize() method
setMethod("initialize", "missing_variable", def =
function(.Object, NA.strings = c("", ".", "Na", "N/a", "N / a", "NaN",
"Not Applicable", "Not applicable",
"Not Available", "Not available",
"Not Ascertained", "Not ascertained",
"Unavailable", "Unknown", "Missing",
"Dk", "Don't Know", "Don't know", "Do Not Know", "Do not know"), ...) {
.Object <- callNextMethod()
if(length(.Object@raw_data) == 0) return(.Object)
if(length(.Object@data) == 0) { # copy raw_data into data
.Object@data <- .Object@raw_data
names(.Object@data) <- .Object@variable_name
}
# bookkeeping
infinites <- is.infinite(.Object@raw_data)
if(any(infinites)) {
warning(paste(.Object@variable_name, ": some observations are infinite, changing to NA"))
.Object@data[infinites] <- NA
}
nans <- is.nan(.Object@raw_data)
if(any(nans)) {
warning(paste(.Object@variable_name, ": some observations are NaN, changing to NA"))
.Object@data[nans] <- NA
}
NA.strings <- unique(c(NA.strings, toupper(NA.strings), tolower(NA.strings)))
if(!is.numeric(.Object@raw_data)) for(i in seq_along(NA.strings)) {
mark <- .Object@raw_data == NA.strings[i]
if(any(mark, na.rm = TRUE)) {
warning(paste(.Object@variable_name, ": some observations", NA.strings[i], "changing to NA"))
.Object@data[mark] <- NA
}
}
NAs <- which(is.na(.Object@data))
if(length(NAs)) .Object@imputation_method <- "ppd" else .Object@imputation_method <- NA_character_
.Object@n_miss <- length(NAs)
.Object@which_miss <- NAs
notNAs <- which(!is.na(.Object@data))
.Object@n_obs <- length(notNAs)
.Object@which_obs <- notNAs
.Object@n_total <- length(NAs) + length(notNAs)
.Object@all_miss <- length(notNAs) == 0
.Object@all_obs <- length(NAs) == 0
if(!length(.Object@n_extra)) .Object@n_extra <- 0L
if(!length(.Object@n_unpossible)) .Object@n_unpossible <- 0L
.Object@n_drawn <- .Object@n_miss + .Object@n_extra
.Object@which_drawn <- c(.Object@which_miss, .Object@which_extra)
.Object@done <- FALSE
return(.Object)
})
setClass("irrelevant", representation("missing_variable"),
# prototype(
# imputation_method = NA_character_,
# family = NA_character_)
)
## a constant variable that has no missingness (and very few methods)
setClass("fixed", representation("irrelevant"),
validity = function(object) {
out <- TRUE
vals <- unique(object@raw_data)
vals <- vals[!is.na(vals)]
if(sum(object@n_miss)) {
out <- paste(object@variable_name, ": fixed variables cannot have missingness")
}
else if(length(vals) > 1) {
out <- paste(object@variable_name, ": purportedly 'fixed' variables cannot have multiple unique values")
}
return(out)
}
)
setClass("group", representation("irrelevant"))
## virtual class for categorical variables, which may be unordered, ordered, binary, or interval
setClass("categorical",
representation(
"missing_variable",
levels = "character",
"VIRTUAL"),
prototype(
known_families = c("multinomial", "binomial", "gaussian")
)
)
setMethod("initialize", "categorical", def =
function(.Object, ...) {
.Object <- callNextMethod()
l <- length(.Object@raw_data)
if(l == 0) return(.Object)
## FIXME: check on the unused levels thing
# .Object@raw_data <- factor(.Object@raw_data)
lev <- levels(factor(.Object@raw_data))
# dummies <- t(sapply(.Object@raw_data, FUN = function(x) as.integer(x == lev)))[,-1, drop = FALSE]
# if(ncol(dummies) == 1) colnames(dummies) <- .Object@variable_name
# else colnames(dummies) <- lev[-1]
# mark <- !apply(dummies, 2, FUN = function(x) all(x == 0, na.rm = TRUE))
# dummies <- dummies[,mark, drop = FALSE]
# lev <- c(lev[1], lev[-1][mark])
.Object@levels <- lev
.Object@data <- as.integer(factor(.Object@raw_data))
return(.Object)
})
## this is a hacked version of binomial()
multinomial <- function (link = "logit")
{
linktemp <- substitute(link)
if (!is.character(linktemp)) {
linktemp <- deparse(linktemp)
if (linktemp == "link") {
warning("use of multinomial(link=link) is deprecated\n",
domain = NA)
linktemp <- eval(link)
if (!is.character(linktemp) || length(linktemp) !=
1L)
stop("'link' is invalid", domain = NA)
}
}
okLinks <- c("logit", "probit", "cloglog", "cauchit", "log")
if (linktemp %in% okLinks)
stats <- make.link(linktemp)
else if (is.character(link)) {
stats <- make.link(link)
linktemp <- link
}
else {
if (inherits(link, "link-glm")) {
stats <- link
if (!is.null(stats$name))
linktemp <- stats$name
}
else {
stop(gettextf("link \"%s\" not available for multinomial family; available links are %s",
linktemp, paste(sQuote(okLinks), collapse = ", ")),
domain = NA)
}
}
variance <- function(mu) mu * (1 - mu)
validmu <- function(mu) all(mu > 0) && all(mu < 1)
dev.resids <- binomial()$dev.resids
aic <- function(y, n, mu, wt, dev) {
m <- if (any(n > 1))
n
else wt
-2 * sum(ifelse(m > 0, (wt/m), 0) * dbinom(round(m *
y), round(m), mu, log = TRUE))
}
initialize <- expression({
if (NCOL(y) == 1) {
if (is.factor(y)) y <- y != levels(y)[1L]
n <- rep.int(1, nobs)
y[weights == 0] <- 0
if (any(y < 0 | y > 1)) stop("y values must be 0 <= y <= 1")
mustart <- (weights * y + 0.5)/(weights + 1)
m <- weights * y
if (any(abs(m - round(m)) > 0.001)) warning("non-integer #successes in a multinomial glm!")
} else if (NCOL(y) == 2) {
if (any(abs(y - round(y)) > 0.001)) warning("non-integer counts in a multinomial glm!")
n <- y[, 1] + y[, 2]
y <- ifelse(n == 0, 0, y[, 1]/n)
weights <- weights * n
mustart <- (n * y + 0.5)/(n + 1)
} else stop("for the multinomial family, y must be a vector of 0 and 1's\n",
"or a 2 column matrix where col 1 is no. successes and col 2 is no. failures")
})
simfun <- function(object, nsim) {
ftd <- fitted(object)
n <- length(ftd)
ntot <- n * nsim
wts <- object$prior.weights
if (any(wts%%1 != 0))
stop("cannot simulate from non-integer prior.weights")
if (!is.null(m <- object$model)) {
y <- model.response(m)
if (is.factor(y)) {
yy <- factor(1 + rbinom(ntot, size = 1, prob = ftd),
labels = levels(y))
split(yy, rep(seq_len(nsim), each = n))
}
else if (is.matrix(y) && ncol(y) == 2) {
yy <- vector("list", nsim)
for (i in seq_len(nsim)) {
Y <- rbinom(n, size = wts, prob = ftd)
YY <- cbind(Y, wts - Y)
colnames(YY) <- colnames(y)
yy[[i]] <- YY
}
yy
}
else rbinom(ntot, size = wts, prob = ftd)/wts
}
else rbinom(ntot, size = wts, prob = ftd)/wts
}
structure(list(family = "multinomial", link = linktemp, linkfun = stats$linkfun,
linkinv = stats$linkinv, variance = variance, dev.resids = dev.resids,
aic = aic, mu.eta = stats$mu.eta, initialize = initialize,
validmu = validmu, valideta = stats$valideta, simulate = simfun),
class = "family")
}
## unordered categorical, which corresponds to an unordered factor with more than 2 levels
setClass("unordered-categorical", representation("categorical",
estimator = "character",
use_NA = "logical",
rank = "integer"),
prototype(
estimator = "MNL",
imputation_method = c("ppd", "pmm", "mode", "mcar", NA_character_),
family = multinomial(link = "logit"),
known_families = c("multinomial", "binomial"),
known_links = c("logit", "probit", "cauchit", "log", "cloglog"),
use_NA = FALSE,
rank = NA_integer_
),
validity = function(object) {
out <- TRUE
values <- unique(object@raw_data)
values <- values[!is.na(values)]
im <- getClass(class(object))@prototype@imputation_method
if(length(values) > 0 && length(values) <= 2) {
out <- paste(object@variable_name, "unordered-categoricals must have more than 2 levels; otherwise use binary")
}
else if(!all(object@imputation_method %in% im)) {
out <- paste(object@variable_name, ": 'imputation_method' must be one of:\n", paste(im, collapse = ", "))
}
# else if(object@family$family != "multinomial") {
# out <- "the 'family' slot of 'unordered-categorial' class must be 'multinomial(link = 'logit')'"
# }
# else if(object@family$link != "logit") {
# out <- "the 'family' slot of 'unordered-categorial' class must be 'multinomial(link = 'logit')'"
# }
else if(!(object@estimator %in% c("MNL", "RNL"))) {
out <- paste(object@variable_name, ": estimator not recognized")
}
else if(!(object@use_NA %in% c(TRUE, FALSE))) {
out <- paste(object@variable_name, ": use_NA must be TRUE or FALSE")
}
return(out)
}
)
## ordered categorical, which corresponds to an ordered factor
setClass("ordered-categorical", representation("categorical",
cutpoints = "numeric"),
prototype(
imputation_method = c("ppd", "pmm", "mode", "mcar", NA_character_),
family = multinomial(link = "logit"),
known_families = c("multinomial", "gaussian", "binomial", "quasibinomial"),
known_links = "logit"
),
validity = function(object) {
out <- TRUE
im <- getClass(class(object))@prototype@imputation_method
if(!(object@family$family %in% getClass(class(object))@prototype@known_families)) { # interval and binary are validated separately
out <- "the 'family' slot of 'ordered-categorial' class must be 'multinomial()'"
}
else if(object@family$family == "multinomial" && object@family$link != "logit") {
out <- "the 'family' slot of 'ordered-categorial' class must be 'multinomial(link = 'logit')'"
}
else if(!all(object@imputation_method %in% im)) {
out <- paste(object@variable_name, ": 'imputation_method' must be one of:\n", paste(im, collapse = ", "))
}
return(out)
}
)
## ordered categorical with known cutpoints that discretize a continuous variable (like income)
setClass("interval", representation("ordered-categorical"),
prototype(
imputation_method = c("ppd", NA_character_),
family = gaussian(),
known_families = "gaussian",
known_links = c("identity", "inverse", "log")
),
validity = function(object) {
out <- TRUE
if(!(object@imputation_method[1] == "ppd")) {
out <- paste(object@variable_name, ": 'imputation_method' must be 'ppd'")
}
else if(object@family$family != "gaussian") {
out <- "the 'family' slot of 'interval' class must be 'gaussian()'"
}
return(out)
}
)
## binary variable
# binary inherits from ordered-categorical because it often makes sense to think of
# those who are coded as 1 as having "more" of something than those who are coded as
# zero. Also, binary logit, probit, etc. are special cases of ordinal logit, probit,
# etc. with one cutpoint fixed at zero.
setClass("binary", representation("ordered-categorical"),
prototype(
family = binomial(link = "logit"),
known_families = c("binomial", "quasibinomial"),
known_links = c("logit", "probit", "cauchit", "log", "cloglog"),
cutpoints = 0.0),
validity = function(object) {
out <- TRUE
if(length(object@raw_data) == 0) return(out)
vals <- unique(object@raw_data)
vals <- vals[!is.na(vals)]
kf <- getClass(class(object))@prototype@known_families
kl <- getClass(class(object))@prototype@known_links
if(length(vals) != 2) {
out <- paste(object@variable_name, ": binary variables must have exactly two response categories")
}
else if(!identical(object@cutpoints, 0.0)) {
out <- paste(object@variable_name, ": 'cutpoints' must be 0.0 for a binary variable")
}
else if(!(object@family$family %in% kf)) {
out <- paste(object@variable_name, ": the 'family' slot of a object of class 'binary' must be one of", paste(kf, collapse = ", "))
}
else if(!(object@family$link %in% kl)) {
out <- paste(object@variable_name, ": the 'link' slot of the 'family' slot of a object of class 'binary' must be one of", paste(kl, collapse = ", "))
}
return(out)
}
)
setMethod("initialize", "binary", def =
function(.Object, ...) {
.Object <- callNextMethod()
l <- length(.Object@raw_data)
if(l == 0) return(.Object)
.Object@data <- as.integer(.Object@data == max(.Object@data, na.rm = TRUE)) + 1L
return(.Object)
})
setClass("grouped-binary", representation("binary", strata = "character"),
prototype(
imputation_method = "pmm"
),
validity = function(object) {
out <- TRUE
if(length(object@raw_data) == 0) return(out)
if(!requireNamespace("survival")) {
out <- "the 'survival' package must be installed to use 'grouped-binary' variables"
}
else if(length(object@strata) == 0) {
warning(paste("you must specify the 'strata' slot for", object@variable_name,
"see help('grouped-binary-class')"))
}
return(out)
}
)
setMethod("initialize", "grouped-binary", def =
function(.Object, ...) {
.Object <- callNextMethod()
l <- length(.Object@raw_data)
if(l == 0) return(.Object)
.Object@imputation_method <- "pmm"
return(.Object)
})
## count variables, which must be nonnegative integers
setClass("count", representation("missing_variable"),
prototype(
imputation_method = c("ppd", "pmm", "mean", "median", "expectation", "mcar", NA_character_),
family = quasipoisson(),
known_families = c("quasipoisson", "poisson"),
known_links = c("log", "identity", "sqrt")
),
validity = function(object) {
out <- TRUE
l <- length(object@raw_data)
if(l == 0) return(out)
im <- getClass(class(object))@prototype@imputation_method
if(any(object@raw_data < 0, na.rm = TRUE)) {
out <- paste(object@variable_name, ": counts must be nonnegative")
}
else if(any(object@raw_data != as.integer(object@raw_data), na.rm = TRUE)) {
out <- paste(object@variable_name, ": must contain all nonnegative integers to use the 'count' class")
}
else if(!all(object@imputation_method %in% im)) {
out <- paste(object@variable_name, ": 'imputation_method' must be one of:\n", paste(im, collapse = ", "))
}
else if(sum(object@n_unpossible)) {
out <- paste(object@variable_name, ": unpossible observations not supported for count variables yet")
}
return(out)
}
)
.identity_transform <- function(y, ...) return(y)
.standardize_transform <- function(y, mean = stop("must supply mean"), sd = stop("must supply sd"), inverse = FALSE) {
if(inverse) return(y * 2 * sd + mean)
else return( (y - mean) / (2 * sd) )
}
## continuous variables, which may have inequality restrictions or transformation functions
setClass("continuous",
representation(
"missing_variable",
transformation = "function",
inverse_transformation = "function",
transformed = "logical", # TRUE -> in transformed state
known_transformations = "character"
),
prototype(
imputation_method = c("ppd", "pmm", "mean", "median", "expectation", "mcar", NA_character_),
transformed = TRUE,
transformation = .standardize_transform,
inverse_transformation = .standardize_transform,
family = gaussian(),
known_families = c("gaussian", "Gamma", "inverse.gaussian", "binomial"), # binomial() is only for (SC_)proportions
known_links = .known_links[.known_links != "sqrt"],
known_transformations = c("standardize", "identity", "log", "logshift", "squeeze", "sqrt", "cuberoot", "qnorm")
),
validity = function(object) {
out <- TRUE
im <- getClass(class(object))@prototype@imputation_method
kf <- getClass(class(object))@prototype@known_families
kl <- getClass(class(object))@prototype@known_links
if(!all(object@imputation_method %in% im)) {
out <- paste(object@variable_name, ": 'imputation_method' must be one of:\n", paste(im, collapse = ", "))
}
else if(sum(object@n_unpossible)) {
out <- paste(object@variable_name, ": unpossible observations not supported for continuous variables yet")
}
else if(!(object@family$family %in% kf)) {
out <- paste(object@variable_name, ": the 'family' slot of a object of class 'binary' must be one of", paste(kf, collapse = ", "))
}
else if(!(object@family$link %in% kl)) {
out <- paste(object@variable_name, ": the 'link' slot of the 'family' slot of a object of class 'binary' must be one of", paste(kl, collapse = ", "))
}
return(out)
}
)
setMethod("initialize", "continuous", def =
function(.Object, ...) {
.Object <- callNextMethod()
l <- length(.Object@raw_data)
if(l == 0) return(.Object)
if(identical(.Object@transformation, .standardize_transform)) {
mean <- mean(.Object@raw_data, na.rm = TRUE)
sd <- sd(.Object@raw_data, na.rm = TRUE)
formals(.Object@transformation)$mean <- formals(.Object@inverse_transformation)$mean <- mean
formals(.Object@transformation)$sd <- formals(.Object@inverse_transformation)$sd <- sd
formals(.Object@inverse_transformation)$inverse <- TRUE
}
else if(identical(.Object@transformation, .logshift)) {
y <- .Object@raw_data
if(any(y < 0, na.rm = TRUE)) a <- - min(y, na.rm = TRUE)
else a <- 0
a <- (a + min(y[y > 0], na.rm = TRUE)) / 2
formals(.Object@transformation)$a <- formals(.Object@inverse_transformation)$a <- a
formals(.Object@inverse_transformation)$inverse <- TRUE
}
.Object@data <- .Object@transformation(.Object@raw_data)
.Object@data[.Object@which_miss] <- NA_real_
return(.Object)
})
setClass("bounded-continuous", representation("continuous", lower = "numeric", upper = "numeric"),
prototype(
imputation_method = "ppd",
transformation = .identity_transform,
inverse_transformation = .identity_transform
),
validity = function(object) {
out <- TRUE
# if(any(object@raw_data <= object@lower, na.rm = TRUE)) {
# out <- paste(object@variable_name, ": all observed data must be strictly greater than 'lower'")
# }
# else if(any(object@raw_data >= object@upper, na.rm = TRUE)) {
# out <- paste(object@variable_name, ": all observed data must be strictly less than 'upper'")
# }
if(any(object@lower > object@upper)) {
out <- paste(object@variable_name, ": lower bounds must be less than or equal to upper bounds")
}
else if(object@imputation_method != "ppd") {
out <- paste(object@variable_name, ": 'imputation_method' must be 'ppd' for 'bounded-continuous' variables ")
}
else if(!requireNamespace("truncnorm")) {
out <- paste(object@variable_name, ": the 'truncnorm' package must be installed to use the 'bounded-continuous' class")
}
return(out)
}
)
setMethod("initialize", "bounded-continuous", def =
function(.Object, lower = -Inf, upper = Inf, ...) {
.Object <- callNextMethod()
l <- length(.Object@raw_data)
if(l == 0) return(.Object)
.Object@lower <- lower
.Object@upper <- upper
return(.Object)
})
setClass("positive-continuous", representation("continuous"),
prototype(
transformation = log,
inverse_transformation = exp,
known_transformations = c("log", "sqrt", "squeeze", "qnorm")
),
validity = function(object) {
out <- TRUE
if(any(object@raw_data <= 0, na.rm = TRUE)) {
out <- paste(object@variable_name, ": positive variables must be positive")
}
return(out)
}
)
## must be on the (0,1) interval
setClass("proportion", representation("positive-continuous", link.phi = "WeAreFamily"),
prototype(
transformed = FALSE,
transformation = .identity_transform,
inverse_transformation = .identity_transform,
known_transformations = c("squeeze", "qnorm"),
family = binomial(),
known_families = c("binomial", "gaussian"),
known_links = .known_links[.known_links != "sqrt"],
link.phi = "log"),
validity = function(object) {
out <- TRUE
kf <- getClass(class(object))@prototype@known_families
kl <- getClass(class(object))@prototype@known_links
if(any(object@raw_data > 1, na.rm = TRUE)) {
out <- paste(object@variable_name, ": proportions must be on the unit interval")
}
else if(any(object@raw_data == 1, na.rm = TRUE)) {
out <- paste(object@variable_name, ": some proportions are equal to 1.0 so use the SC_proportion class")
}
else if(!(object@family$family %in% kf)) {
out <- paste(object@variable_name, ": the 'family' slot of a object of class 'proportion' must be one of", paste(kf, collapse = ", "))
}
else if(!(object@family$link %in% kl)) {
out <- paste(object@variable_name, ": the 'link' slot of the 'family' slot of a object of class 'proportion' must be one of", paste(kl, collapse = ", "))
}
else if(object@family$family == "binomial" && !requireNamespace("betareg")) {
out <- paste(object@variable_name, ": you must install the 'betareg' package to model proportions as proportions")
}
return(out)
}
)
# setClass("truncated-continuous",
# representation("continuous",
# lower = "ANY",
# upper = "ANY",
# n_lower = "integer",
# which_lower = "integer",
# n_upper = "integer",
# which_upper = "integer",
# n_both = "integer",
# which_both = "integer",
# n_truncated = "integer",
# which_truncated = "integer",
# "VIRTUAL")
# )
#
# setClass("NN_truncated-continuous", representation("truncated-continuous", lower = "numeric", upper = "numeric"))
#
# setMethod("initialize", "NN_truncated-continuous", def =
# function(.Object, ...) {
# .Object <- callNextMethod()
# l <- length(.Object@raw_data)
# if(l == 0) return(.Object)
# if(identical(.Object@transformation, .standardize_transform)) {
# mean <- mean(.Object@raw_data, na.rm = TRUE)
# sd <- sd(.Object@raw_data, na.rm = TRUE)
# formals(.Object@transformation)$mean <- formals(.Object@inverse_transformation)$mean <- mean
# formals(.Object@transformation)$sd <- formals(.Object@inverse_transformation)$sd <- sd
# formals(.Object@inverse_transformation)$inverse <- TRUE
# }
# .Object@data <- .Object@transformation(.Object@raw_data)
#
# if(length(.Object@lower) == 0 & length(.Object@upper) == 0) {
# stop("at least one of 'lower' and 'upper' must be specified")
# }
# ## FIXME: Deal with interval censoring or force it to the interval class
# .Object@n_both <- 0L
# lowers <- .Object@raw_data <= .Object@lower
# .Object@n_lower <- sum(lowers)
# .Object@which_lower <- which(lowers)
# uppers <- .Object@raw_data >= .Object@upper
# .Object@n_uppers <- sum(uppers)
# .Object@which_uppers <- which(uppers)
# .Object@n_truncated <- .Object@n_lower + .Object@n_upper
# .Object@which_truncated <- c(.Object@which_lower, .Object@which_upper)
# return(.Object)
# })
#
# setClass("FN_truncated-continuous", representation("truncated-continuous", lower = "function", upper = "numeric"))
# setClass("NF_truncated-continuous", representation("truncated-continuous", lower = "numeric", upper = "function"))
# setClass("FF_truncated-continuous", representation("truncated-continuous", lower = "function", upper = "function"))
#
# setClass("censored-continuous",
# representation("continuous",
# lower = "ANY",
# upper = "ANY",
# n_lower = "integer",
# which_lower = "integer",
# n_upper = "integer",
# which_upper = "integer",
# n_both = "integer",
# which_both = "integer",
# n_censored = "integer",
# which_censored = "integer",
# lower_indicator = "binary",
# upper_indicator = "binary",
# "VIRTUAL")
# )
# setClass("NN_censored-continuous", representation("censored-continuous", lower = "numeric", upper = "numeric"))
# setMethod("initialize", "NN_censored-continuous", def =
# function(.Object, ...) {
# .Object <- callNextMethod()
# l <- length(.Object@raw_data)
# if(l == 0) return(.Object)
# if(identical(.Object@transformation, .standardize_transform)) {
# mean <- mean(.Object@raw_data, na.rm = TRUE)
# sd <- sd(.Object@raw_data, na.rm = TRUE)
# formals(.Object@transformation)$mean <- formals(.Object@inverse_transformation)$mean <- mean
# formals(.Object@transformation)$sd <- formals(.Object@inverse_transformation)$sd <- sd
# formals(.Object@inverse_transformation)$inverse <- TRUE
# }
# .Object@data <- .Object@transformation(.Object@raw_data)
#
# if(length(.Object@lower) == 0 & length(.Object@upper) == 0) {
# stop("at least one of 'lower' and 'upper' must be specified")
# }
# ## FIXME: Deal with interval censoring or force it to the interval class
# .Object@n_both <- 0L
# lowers <- .Object@raw_data <= .Object@lower
# .Object@n_lower <- sum(lowers, na.rm = TRUE)
# .Object@which_lower <- which(lowers)
# if(.Object@n_lower > 0) {
# .Object@lower_indicator <- missing_variable(as.ordered(lowers), type = "binary",
# variable_name = paste(.Object@variable_name, "lower", sep = ""))
# }
# uppers <- .Object@raw_data >= .Object@upper
# .Object@n_upper <- sum(uppers, na.rm = TRUE)
# .Object@which_upper <- which(uppers)
# if(.Object@n_upper > 0) {
# .Object@lower_indicator <- missing_variable(as.ordered(uppers), type = "binary",
# variable_name = paste(.Object@variable_name, "upper", sep = ""))
# }
# .Object@n_censored <- .Object@n_lower + .Object@n_upper
# .Object@which_censored <- c(.Object@which_lower, .Object@which_upper)
# return(.Object)
# })
#
# setClass("FN_censored-continuous", representation("censored-continuous", lower = "function", upper = "numeric"))
# setClass("NF_censored-continuous", representation("censored-continuous", lower = "numeric", upper = "function"))
# setClass("FF_censored-continuous", representation("censored-continuous", lower = "function", upper = "function"))
setClass("semi-continuous", representation("continuous", indicator = "ordered-categorical"),
prototype(
transformation = .identity_transform,
inverse_transformation = .identity_transform)
)
.logshift <- function(y, a, inverse = FALSE) {
if(inverse) exp(y) - a
else log(y + a)
}
setClass("nonnegative-continuous", representation("semi-continuous"),
prototype(transformation = .logshift,
inverse_transformation = .logshift,
known_transformations = c("logshift", "squeeze", "identity")),
validity = function(object) {
out <- TRUE
if(any(object@raw_data < 0, na.rm = TRUE)) {
out <- paste(object@variable_name, ": nonnegative variables must be nonnegative")
}
return(out)
}
)
setMethod("initialize", "nonnegative-continuous", def =
function(.Object, ...) {
.Object <- callNextMethod()
l <- length(.Object@raw_data)
if(l == 0) return(.Object)
is_zero <- as.integer(.Object@raw_data == 0)
if(any(is_zero, na.rm = TRUE)) {
.Object@indicator <- missing_variable(is_zero, type = "binary",
variable_name = paste(.Object@variable_name, ":is_zero", sep = ""))
}
.Object@data <- .Object@transformation(.Object@raw_data)
if(!all(is.finite(.Object@data[!is.na(.Object)]))) {
stop(paste(.Object@variable_name, ": some transformed values are infinite or undefined"))
}
return(.Object)
})
.squeeze_transform <- function(y, inverse = FALSE) {
n <- length(y)
if(inverse) (y * n - .5) / (n - 1)
else (y * (n - 1) + .5) / n
}
## some values are zero and / or one
setClass("SC_proportion", representation("nonnegative-continuous", link.phi = "WeAreFamily"),
prototype(
transformation = .squeeze_transform,
inverse_transformation = .squeeze_transform,
known_transformations = c("squeeze", "qnorm"),
family = binomial(),
known_families = "binomial",
known_links = getClass("binary")@prototype@known_links,
link.phi = "log"
),
validity = function(object) {
out <- TRUE
if(any(object@data > 1, na.rm = TRUE)) {
out <- paste(object@variable_name, ": proportions must be less than or equal to 1")
}
else if(object@family$family != "binomial") {
out <- paste(object@variable_name, ": 'family' must be 'binomial'")
}
else if(!identical(body(object@transformation), body(.squeeze_transform))) {
out <- paste(object@variable_name, ": 'transformation' must be 'squeeze'")
}
else if(!requireNamespace("betareg")) {
out <- paste(object@variable_name, ": you must install the 'betareg' package to model proportions")
}
return(out)
}
)
setMethod("initialize", "SC_proportion", def =
function(.Object, ...) {
.Object <- callNextMethod()
l <- length(.Object@raw_data)
if(l == 0) return(.Object)
if(any(.Object@raw_data == 0, na.rm = TRUE)) {
if(any(.Object@raw_data == 1, na.rm = TRUE)) {
is_bound <- ifelse(.Object@raw_data == 0, -1, ifelse(.Object@raw_data == 1, 1, 0))
.Object@indicator <- missing_variable(is_bound, type = "ordered-categorical",
variable_name = paste(.Object@variable_name, ":is_bound", sep = ""))
}
else {
is_zero <- as.integer(.Object@raw_data == 0)
.Object@indicator <- missing_variable(is_zero, type = "binary",
variable_name = paste(.Object@variable_name, ":is_zero", sep = ""))
}
}
else {
is_one <- as.integer(.Object@raw_data == 1)
.Object@indicator <- missing_variable(is_one, type = "binary",
variable_name = paste(.Object@variable_name, ":is_one", sep = ""))
}
return(.Object)
})
# A missing_data.frame is a another important S4 class that is not unlike a data.frame, except
# that its "columns" (actually list elements) are objects that inherit from the missing_variable
# class. The missing_data.frame class should, in principle, contain ALL the necessary information
# regarding how the missing_variables relate to each other. Together, the missing_variable class(es)
# and the missing_data.frame class supplant the mi.info S4 class in previous versions of library(mi).
.get_slot <-
function(object, name, simplify = TRUE) {
if(isS4(object)) return(slot(object, name))
else if(is.list(object)) sapply(object, FUN = slot, name = name, simplify = simplify)
else stop("'object' not supported")
}
setOldClass("data.frame")
setClass("missing_data.frame",
representation(
variables = "list", # of missing_variables
no_missing = "logical", # basically a collection of the all_obs slots of the missing_variables
patterns = "factor", # indicates which missingness_pattern an observation belongs to
DIM = "integer", # observations x variables
DIMNAMES = "list", # list of rownames and colnames
postprocess = "function",# makes additional variables from existing variables (interactions, etc.)
index = "list", # this indicate which variables to exclude when modeling a given variable
X = "MatrixTypeThing", # ALL variables (categorical variables are in dummy-variable form)
weights = "list", # this gets passed to bayesglm() and similar modeling functions
priors = "list", # the elements of this get passed to bayesglm() and other modeling functions in arm
correlations = "matrix", # has SMCs and Spearman correlations
done = "logical", # are we done?
workpath = "character"),
contains = "data.frame",
prototype(postprocess = function() stop("postprocess does not work yet"), X = matrix(NA_real_, 0, 0), done = FALSE),
validity = function(object) {
out <- TRUE
l <- length(object@variables)
if(l == 0) return(out)
if(!all(sapply(object@variables, FUN = is, class2 = "missing_variable"))) {
out <- "all of the list elements in 'variables' must inherit from the 'missing_variable' class"
}
else if(length(unique(.get_slot(object@variables, "n_total"))) > 1) {
out <- "all missing_variables must have the same 'n_total'"
}
else if(!is.numeric(object@X)) {
out <- "'X' must be a numeric matrix"
}
missingness <- .get_slot(object@variables, "which_miss", simplify = FALSE)
varnames <- .get_slot(object@variables, "variable_name")
names(missingness) <- varnames
missingness <- missingness[sapply(missingness, length) > 0]
if(length(missingness) > 1) { ## FIXME: Very slow
combos <- combn(length(missingness), 2)
dupes <- apply(combos, 2, FUN = function(x) {
mx1 <- missingness[[x[1]]]
mx2 <- missingness[[x[2]]]
if(length(mx1) == length(mx2)) {
if(identical(mx1, mx2)) return(1L)
}
else if(length(mx1) > length(mx2)) {
if(all(mx2 %in% mx1)) return(2L)
}
else if(all(mx1 %in% mx2)) return(3L)
return(0L)
})
if(any(dupes == 1L)) {
temp <- matrix(names(missingness)[combos[,which(dupes == 1L)]], ncol = 2, byrow = TRUE)
cat("NOTE: The following pairs of variables appear to have the same missingness pattern.\n",
"Please verify whether they are in fact logically distinct variables.\n")
print(temp)
# warning("Potentially duplicated variables detected by duplicated variable detector")
}
else if(any(dupes == 2L)) {
temp <- matrix(names(missingness)[combos[,which(dupes == 2L)]], ncol = 2, byrow = TRUE)
cat("NOTE: In the following pairs of variables, the missingness pattern of the second is a subset of the first.\n",
"Please verify whether they are in fact logically distinct variables.\n")
print(temp)
}
else if(any(dupes == 3L)) {
temp <- matrix(names(missingness)[combos[,which(dupes == 3L)]], ncol = 2, byrow = TRUE)
cat("NOTE: In the following pairs of variables, the missingness pattern of the first is a subset of the second.\n",
"Please verify whether they are in fact logically distinct variables.\n")
print(temp)
}
}
return(out)
}
)
.set_priors <- function(variables, mu = 0) { ## FIXME: maybe add an option to draw from such a t distribution?
foo <- function(y) {
out <- list(prior.mean = mu, prior.scale = 2.5, prior.df = 1,
prior.mean.for.intercept = mu, prior.scale.for.intercept = 10, prior.df.for.intercept = 1)
if(is(y, "irrelevant") | y@all_obs) return(NULL)
else if(is(y, "binary")) {
if(y@family$link == "probit") {
out[[2]] <- out[[2]] * dnorm(0) / dlogis(0)
out[[4]] <- out[[4]] * dnorm(0) / dlogis(0)
}
}
else if(is(y, "categorical")) {
out <- list(prior.mean = mu, prior.scale = 2.5, prior.df = 1, prior.counts.for.bins = 1/(1 + length(y@levels)))
}
return(out)
}
out <- lapply(variables, FUN = function(y) foo(y))
for(i in seq_along(variables)) if(is(y <- variables[[i]], "semi-continuous")) out[[y@indicator@variable_name]] <- foo(y@indicator)
return(out)
}
setMethod("initialize", "missing_data.frame", def =
function(.Object, include_missingness = TRUE, skip_correlation_check = FALSE, ...) {
.Object <- callNextMethod()
l <- length(.Object@variables)
if(l == 0) return(.Object)
varnames <- names(.Object@variables)
if(is.null(varnames)) {
if(is.null(.Object@DIMNAMES[[2]])) names(.Object@variables) <- sapply(.Object@variables, FUN = .get_slot, name = "variable_name")
else names(.Object@variables) <- .Object@DIMNAMES[[2]]
}
else for(i in 1:l) .Object@variables[[i]]@variable_name <- varnames[i]
.Object@DIM <- c(.Object@variables[[1]]@n_total, l)
.Object@no_missing <- sapply(.Object@variables, FUN = .get_slot, name = "all_obs")
if(length(.Object@DIMNAMES) == 0) .Object@DIMNAMES <- list(NULL, names(.Object@variables))
Z <- lapply(.Object@variables, FUN = function(y) {
if(is(y, "irrelevant")) return(NULL) else return(is.na(y))
})
Z <- as.matrix(as.data.frame(Z[!sapply(Z, is.null)]))
if(any(apply(Z, 1, all))) {
warning("Some observations are missing on all included variables.\n",
"Often, this indicates a more complicated model is needed for this missingness mechanism")
}
uZ <- unique(Z)
if(nrow(uZ) == 1) {
if(all(uZ[1,] == 0)) patterns <- factor(rep("nothing", nrow(Z)))
else patterns <- factor(colnames(uZ)[which(uZ[1,] == 1)], nrow(Z))
}
else {
uZ <- uZ[order(rowSums(uZ)),,drop = FALSE]
patterns <- apply(Z, 1, FUN = function(x) which(apply(uZ, 1, FUN = function(u) all(u == x))))
pattern_labels <- apply(uZ, 1, FUN = function(x) paste(names(x)[x], collapse = ", "))
if(length(pattern_labels)) {
if(pattern_labels[1] == "") pattern_labels[1] <- "nothing"
pattern_lables <- paste("missing:", pattern_labels)
patterns <- factor(patterns, labels = pattern_labels, ordered = FALSE)
}
else patterns <- factor(patterns)
}
.Object@patterns <- patterns
if(!length(.Object@workpath)) {
.Object@workpath <- file.path(tempdir(), paste("mi", as.integer(Sys.time()), sep = ""))
}
dir.create(.Object@workpath, showWarnings = FALSE)
if(is(.Object, "allcategorical_missing_data.frame")) return(.Object)
Z <- Z[,!duplicated(t(Z)), drop = FALSE]
Z <- Z[,apply(Z, 2, FUN = function(x) length(unique(x))) > 1, drop = FALSE]
## FIXME: What to do if two columns of Z are collinear?
if(ncol(Z) > 0) colnames(Z) <- paste("missing", colnames(Z), sep = "_")
else include_missingness <- FALSE
X <- lapply(.Object@variables, FUN = function(x) {
if(is(x, "irrelevant")) return(NULL)
else if(is(x, "categorical")) return(.cat2dummies(x))
else if(is(x, "semi-continuous")) {
out <- cbind(x@data, .cat2dummies(x@indicator))
colnames(out) <- c(x@variable_name,
paste(x@variable_name, 2:ncol(out) - 1, sep = "_"))
return(out)
}
else if(is(x, "censored-continuous")) {
temp <- x@data
if(x@n_lower) temp <- cbind(temp, lower = x@lower_indicator@data)
if(x@n_upper) temp <- cbind(temp, upper = x@upper_indicator@data)
if(x@n_both) stop("FIXME: censoring on both sides not supported yet")
return(temp)
}
else if(is(x, "truncated-continuous")) {
temp <- x@data
n <- length(temp)
if(x@n_lower) temp <- cbind(lower = x@lower_indicator@data, temp)
if(x@n_upper) temp <- cbind(upper = x@upper_indicator@data, temp)
if(x@n_both) stop("FIXME: censoring on both sides not supported yet")
return(temp)
}
else return(x@data)
}) ## NOTE: Might need to make this more complicated in the future
X <- X[!sapply(X, is.null)]
index <- vector("list", length = length(X))
names(index) <- names(X)
start <- 2L
end <- 0L
for(i in seq_along(index)) {
end <- start + NCOL(X[[i]]) - 1L
index[[i]] <- start:end
start <- end + 1L
}
if(include_missingness) for(i in seq_along(index)) {
nas <- is.na(.Object@variables[[i]])
check <- apply(Z, 2, FUN = function(x) all(x == nas))
index[[i]] <- c(index[[i]], which(check) + start - 1)
}
else for(i in seq_along(index)) index[[i]] <- c(index[[i]], start:(start + ncol(Z) - 1))
grouped <- names(which(sapply(.Object@variables, is, class2 = "grouped-binary")))
for(i in grouped) index[[i]] <- c(index[[i]], index[[.Object@variables[[i]]@strata]], 1)
.Object@index <- index
.Object@X <- cbind("(Intercept)" = 1, as.matrix(as.data.frame(X)), Z)
correlations <- matrix(NA_real_, l,l)
if(!skip_correlation_check) for(i in 1:(l - 1)) { ## FIXME: Put SMCs in the lower triangle
if(is(.Object@variables[[i]], "irrelevant")) next
x <- try(rank(xtfrm(.Object@variables[[i]]@raw_data)), silent = TRUE)
if(!is.numeric(x)) next
for(j in (i + 1):l) {
if(is(.Object@variables[[j]], "irrelevant")) next
y <- try(rank(xtfrm(.Object@variables[[j]]@raw_data)))
if(!is.numeric(y)) next
rho <- cor(x, y, use = "pair", method = "pearson") # on ranks
if(is.finite(rho) && abs(rho) == 1) {
warning(paste(names(.Object@variables)[i], "and", names(.Object@variables)[j],
"have the same rank ordering.\n",
"Please verify whether they are in fact distinct variables.\n"))
}
if(is.finite(rho)) correlations[i,j] <- rho
}
}
.Object@correlations <- correlations
.Object@priors <- .set_priors(.Object@variables)
.Object
})
setClass("allcategorical_missing_data.frame",
representation("missing_data.frame", "Hstar" = "integer",
"parameters" = "list","latents" = "unordered-categorical"),
prototype = prototype(Hstar = 20L),
validity = function(object) {
out <- TRUE
types <- sapply(object@variables,
FUN = function(y) is(y, "irrelevant") | is(y, "categorical"))
if(!all(types)) {
out <- "all variable classes must be 'irrelevant' or 'categorical'"
}
else if(length(object@Hstar) && object@Hstar < 1) {
out <- "'Hstar' must be >= 1"
}
return(out)
})
setMethod("initialize", "allcategorical_missing_data.frame", def =
function(.Object, include_missingness = TRUE, ...) {
.Object <- callNextMethod()
l <- length(.Object@variables)
n <- nrow(.Object)
uc <- factor(rep(NA_integer_, n))
.Object@latents <- new("unordered-categorical", raw_data = rep(NA_integer_, n))
.Object@priors <- list(a = rep(1, ncol(.Object)), a_alpha = 1, b_alpha = 1)
names(.Object@priors$a) <- colnames(.Object)
return(.Object)
})
setClass("experiment_missing_data.frame", representation("missing_data.frame",
concept = "factor",
case = "character"),
validity = function(object) {
out <- TRUE
l <- length(object@concept)
if(l != length(object@variables)) {
out <- "length of 'concept' must equal the number of variables"
}
else if(!all(levels(object@concept) %in% c("outcome", "covariate", "treatment"))) {
out <- "all elements of 'concept' must be exactly one of 'outcome', 'covariate', or 'treatment'"
}
else if(sum(object@concept == "treatment") != 1) {
out <- "there must be exactly one variable designated 'treatment'"
}
else if(!is(object@variables[[which(object@concept == "treatment")]], "binary")) {
out <- "the 'treatment' variable must be of class 'binary'"
}
else if(object@variables[[which(object@concept == "treatment")]]@n_miss) {
out <- "'treatment' variable cannot have any missingness"
}
else if(length(object@case) > 1) {
out <- "'case' must be exactly one of 'outcomes', 'covariates', or 'both'"
}
else if(length(object@case) && !(object@case %in% c("outcomes", "covariates", "both"))) {
out <- "'case' must be exactly one of 'outcomes', 'covariates', or 'both'"
}
return(out)
})
setMethod("initialize", "experiment_missing_data.frame", def =
function(.Object, include_missingness = TRUE, ...) {
.Object <- callNextMethod()
l <- 1 ## FIXME
if(l == 0) return(.Object)
names(.Object@concept) <- .Object@DIMNAMES[[2]]
outcomes <- any(!.Object@no_missing[.Object@concept == "outcomes"])
covariates <- any(!.Object@no_missing[.Object@concept == "covariates"])
.Object@case <- if(outcomes & covariates) "both" else if(outcomes) "covariates" else "outcomes"
return(.Object)
})
.empty_mdf_list <- list()
class(.empty_mdf_list) <- "mdf_list"
setClass("multilevel_missing_data.frame", representation("missing_data.frame",
groups = "character",
mdf_list = "mdf_list"),
prototype(
mdf_list = .empty_mdf_list
),
validity = function(object) {
out <- TRUE
return(out)
}
)
setMethod("initialize", "multilevel_missing_data.frame", def =
function(.Object, include_missingness = TRUE, ...) {
.Object <- callNextMethod()
classes <- sapply(.Object@variables, class)
for(i in .Object@groups) classes[names(classes) == i] <- "fixed"
df <- complete(.Object, m = 0L)
mdf_list <- missing_data.frame(df, by = .Object@groups, types = classes)
.Object@mdf_list <- mdf_list
return(.Object)
})
## an object of class mi merely holds the results of a call to mi(), primary the list of missing_data.frames
setClass("mi",
representation(
call = "call",
data = "list", # of missing_data.frames
total_iters = "integer"), # how many iterations were conducted (can be a vector)
)
## an object of class pooled has regression results using the Rubin rules
setClass("pooled",
representation(
formula = "formula",
fit = "character",
models = "list",
coefficients = "numeric",
ses = "numeric",
pooled_summary = "ANY",
call = "language"),
)
mi/R/mi.R 0000644 0001762 0000144 00000120541 14247027226 011621 0 ustar ligges users # Part of the mi package for multiple imputation of missing data
# Copyright (C) 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Trustees of Columbia University
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
.prune_missing_variable <-
function(y, s) {
if(!is(y, "missing_variable")) stop("'y' must inherit from the 'missing_variable' class")
if(!y@all_obs) {
y@parameters <- y@parameters[1:s,,drop = FALSE]
y@imputations <- y@imputations[1:s,,drop = FALSE]
}
return(y)
}
.MPinverse <- function(eta, tol = sqrt(.Machine$double.eps)) {
cov_eta <- cov(eta)
ev <- eigen(cov_eta, TRUE)
ev$values <- ifelse(ev$values > tol, 1/ev$values, 0)
Sigma_inv <- crossprod(sqrt(ev$values)*(t(ev$vectors)))
return(Sigma_inv)
}
.mi <- function(i, y, verbose, s_start, s_end, ProcStart, max.minutes, parallel, save_models) {
mdf <- y
if(verbose) message("Chain ", i, "\n")
for(s in s_start:s_end) {
if(verbose) message("Chain ", i, " Iteration ", s, "\n")
mdf <- fit_model(data = mdf, s = s, verbose = FALSE, warn = s == s_end)
if(s > 0) {
pars <- unlist(sapply(mdf@variables, FUN = function(y) {
if(is(y, "irrelevant")) return(NA_real_)
else if(y@all_obs) return(NA_real_)
else return(y@parameters[s,,drop=TRUE])
}))
pars <- t(pars[!is.na(pars)])
fp <- file.path(mdf@workpath, paste0("pars_", i, ".csv"))
write.table(pars, file = fp, append = TRUE, sep = ",",
row.names = FALSE, col.names = FALSE)
imps <- unlist(sapply(mdf@variables, FUN = function(y) {
if(is(y, "irrelevant")) return(NA_real_)
else if(y@all_obs) return(NA_real_)
else return(y@imputations[s,,drop=TRUE])
}))
imps <- t(imps[!is.na(imps)])
fp <- file.path(mdf@workpath, paste0("imps_", i, ".csv"))
write.table(imps, file = fp, append = TRUE, sep = ",",
row.names = FALSE, col.names = FALSE)
}
Time.Elapsed <- proc.time() - ProcStart
if(((Time.Elapsed)/60)[3] > max.minutes) {
warning("'max.minutes' threshold exceeded")
break
}
}
if(((Time.Elapsed)/60)[3] > max.minutes) mdf@variables <- lapply(mdf@variables, .prune_missing_variable, s = s)
if(verbose) message("Estimating models on completed data for chain ", i, "\n")
mdf@variables <- lapply(mdf@variables, FUN = function(y) {
if(!y@all_obs & !is(y, "irrelevant")) {
model <- fit_model(y, mdf, s = s + 1, warn = TRUE)
y@fitted <- fitted(model)
if(!isS4(model)) model$x <- model$X <- model$y <- model$model <- NULL
if(save_models) y@model <- model
}
else y@model <- NULL
return(y)
})
mdf@done <- TRUE
if(verbose) message("Done with chain ", i, "\n")
return(mdf)
}
.mi_split <- function(i, y, data, verbose, s_start, s_end, ProcStart, max.minutes, parallel, save_models) {
mdf <- y
if(verbose) message("Chain ", i, "\n")
data@priors <- mdf@priors
for(s in s_start:s_end) {
if(verbose) message("Chain ", i, " Iteration ", s, "\n")
mdf <- fit_model(mdf, data, s = s, verbose = FALSE, warn = s == s_end)
Time.Elapsed <- proc.time() - ProcStart
if(((Time.Elapsed)/60)[3] > max.minutes) {
warning("'max.minutes' threshold exceeded")
break
}
}
if(((Time.Elapsed)/60)[3] > max.minutes) mdf@variables <- lapply(mdf@variables, .prune_missing_variable, s = s)
if(verbose) message("Estimating models on completed data for chain ", i, "\n")
mdf@variables <- lapply(mdf@variables, FUN = function(y) {
if(!y@all_obs & !is(y, "irrelevant")) {
model <- fit_model(y, data, s = s + 1, warn = TRUE)
y@fitted <- fitted(model)
if(!isS4(model)) model$x <- model$X <- model$y <- model$model <- NULL
if(save_models) y@model <- model
}
else y@model <- NULL
return(y)
})
mdf@done <- TRUE
if(verbose) message("Done with chain ", i, "\n")
return(mdf)
}
setMethod("mi", signature(y = "missing_data.frame", model = "missing"), def =
function(y, n.iter = 30, n.chains = 4, max.minutes = Inf, seed = NA, verbose = TRUE,
save_models = FALSE, parallel = .Platform$OS.type != "Windows")
{
call <- match.call()
if(!is.na(seed)) set.seed(seed)
if(n.iter < 0) stop(message="number of iterations must be non-negative")
ProcStart <- proc.time()
s_start <- 0
s_end <- n.iter
Time.Elapsed <- proc.time() - ProcStart
y@variables <- lapply(y@variables, FUN = function(x) {
if(!x@all_obs & !is(x, "irrelevant")) {
x@parameters <- matrix(NA_real_, nrow = n.iter, ncol = 0)
x@imputations <- matrix(NA_real_, nrow = n.iter, ncol = x@n_drawn)
if(is(x, "semi-continuous")) {
x@indicator@parameters <- matrix(NA_real_, nrow = n.iter, ncol = 0)
x@indicator@imputations <- matrix(NA_real_, nrow = n.iter, ncol = x@n_drawn)
}
}
x@done <- TRUE
return(x)
})
if(is(y, "allcategorical_missing_data.frame")) {
y@latents@imputations <- matrix(NA_integer_, nrow = n.iter, ncol = nrow(y))
y@latents@levels <- as.character(1:y@Hstar)
}
if(n.chains <= 0) return(y)
if(is.logical(parallel) && parallel) {
cores <- getOption("mc.cores", 2L)
cl <- parallel::makeCluster(cores, outfile = "")
on.exit(parallel::stopCluster(cl))
}
if(!parallel) {
mdfs <- vector("list", n.chains)
for(i in seq_along(mdfs)) {
ProcStart <- proc.time()
mdfs[[i]] <- .mi(i, y, verbose, s_start, s_end, ProcStart,
max.minutes, parallel, save_models)
}
}
else {
mdfs <- parallel::parLapply(cl, X = as.list(1:n.chains),
fun = function(i) .mi(i, y, verbose, s_start, s_end,
ProcStart, max.minutes, parallel, save_models))
}
#
# else mdfs <- mclapply(as.list(1:n.chains),
# FUN = function(i) .mi(i, y, verbose, s_start, s_end,
# ProcStart, max.minutes, parallel, save_models))
names(mdfs) <- paste("chain", 1:length(mdfs), sep = ":")
object <- new("mi",
call = call,
data = mdfs,
total_iters = as.integer(s_end))
return(object)
})
setMethod("mi", signature(y = "data.frame", model = "missing"), def =
function(y, n.iter = 30, n.chains = 4, max.minutes = Inf, seed = NA, verbose = TRUE,
save_models = FALSE, parallel = .Platform$OS.type != "Windows")
{
y <- as(y, "missing_data.frame")
return(mi(y, n.iter = n.iter, n.chains = n.chains, max.minutes = max.minutes,
seed = seed, verbose = verbose, save_models = save_models,
parallel = parallel))
})
setMethod("mi", signature(y = "matrix", model = "missing"), def =
function(y, n.iter = 30, n.chains = 4, max.minutes = Inf, seed = NA, verbose = TRUE,
save_models = FALSE, parallel = .Platform$OS.type != "Windows")
{
y <- as(y, "missing_data.frame")
return(mi(y, n.iter, n.chains, max.minutes, seed, verbose, save_models, parallel))
})
setMethod("mi", signature(y = "mi", model = "missing"),
function(y, n.iter = 30, max.minutes = Inf, seed = NA, verbose = TRUE,
save_models = FALSE, parallel = .Platform$OS.type != "Windows")
{
call <- match.call()
if(!is.na(seed)) set.seed(seed)
if(n.iter < 1) stop(message="number of iterations must be at least 1")
ProcStart <- proc.time()
total_iters <- y@total_iters
s_start <- sum(total_iters) + 1
s_end <- s_start + n.iter - 1
mdfs <- y@data
n.chains <- length(mdfs)
for(i in 1:n.chains) {
y <- mdfs[[i]]
if(TRUE) y@variables <- lapply(y@variables, FUN = function(x) {
if(x@all_obs & is(x, "irrelevant")) return(x)
x@imputations <- rbind(x@imputations, matrix(NA_integer_, n.iter, x@n_drawn))
x@parameters <- rbind(x@parameters, matrix(NA_real_, n.iter, ncol(x@parameters)))
if(is(x, "semi-continuous")) {
x@indicator@imputations <- rbind(x@indicator@imputations, matrix(NA_integer_, n.iter, x@indicator@n_drawn))
x@indicator@parameters <- rbind(x@indicator@parameters, matrix(NA_real_, n.iter, ncol(x@indicator@parameters)))
}
return(x)
})
}
if(is.logical(parallel) && parallel) {
cores <- getOption("mc.cores", 2L)
cl <- parallel::makeCluster(cores, outfile = "")
on.exit(parallel::stopCluster(cl))
}
if(!parallel) {
mdfs <- vector("list", n.chains)
for(i in seq_along(mdfs)) {
ProcStart <- proc.time()
mdfs[[i]] <- .mi(i, y, verbose, s_start, s_end, ProcStart, max.minutes,
parallel, save_models)
}
}
else {
mdfs <- parallel::parLapply(cl, as.list(1:n.chains),
fun = function(i) .mi(i, y, verbose, s_start, s_end,
ProcStart, max.minutes, parallel, save_models))
}
# else mdfs <- mclapply(as.list(1:n.chains),
# FUN = function(i) .mi(i, y, verbose, s_start, s_end,
# ProcStart, max.minutes, parallel, save_models))
object <- new("mi",
call = call,
data = mdfs,
total_iters = as.integer(c(total_iters, n.iter)))
return(object)
})
setMethod("mi", signature(y = "missing_data.frame", model = "mi"), def =
function(y, model, n.iter = sum(model@total_iters), max.minutes = 20, seed = NA,
verbose = TRUE, save_models = FALSE,
parallel = .Platform$OS.type != "Windows")
{
n.chains <- length(model)
call <- match.call()
if(!is.na(seed)) set.seed(seed)
y <- mi(y, n.chains = 0L, n.iter = n.iter)
ProcStart <- proc.time()
s_start <- 0
s_end <- n.iter
if(is.logical(parallel) && parallel) {
cores <- getOption("mc.cores", 2L)
cl <- parallel::makeCluster(cores, outfile = "")
on.exit(parallel::stopCluster(cl))
}
mdfs <- model@data
if(!parallel) {
for(i in seq_along(mdfs)) {
ProcStart <- proc.time()
mdfs[[i]] <- .mi_split(i, y, mdfs[[i]], verbose, s_start, s_end, ProcStart,
max.minutes, parallel, save_models)
}
}
else {
mdfs <- parallel::parLapply(cl, as.list(1:n.chains),
fun = function(i) .mi_split(i, y, mdfs[[i]], verbose, s_start, s_end,
ProcStart, max.minutes, parallel, save_models))
}
# else mdfs <- mclapply(as.list(1:n.chains),
# FUN = function(i) .mi_split(i, y, mdfs[[i]], verbose, s_start, s_end,
# ProcStart, max.minutes, parallel, save_models))
names(mdfs) <- paste("chain", 1:length(mdfs), sep = ":")
to_drop <- 1:ncol(model@data[[1]]@X)
for(i in 1:n.chains) {
model@data[[i]]@variables <- c(model@data[[i]]@variables, mdfs[[i]]@variables)
model@data[[i]]@no_missing <- c(model@data[[i]]@no_missing, mdfs[[i]]@no_missing)
# leave patterns as is I guess
model@data[[i]]@DIM[2] <- model@data[[i]]@DIM[2] + mdfs[[i]]@DIM[2]
model@data[[i]]@DIMNAMES[[2]] <- c(model@data[[i]]@DIMNAMES[[2]], mdfs[[i]]@DIMNAMES[[2]])
mdfs[[i]]@index <- lapply(mdfs[[i]]@index, FUN = function(x) if(is.null(x)) x else to_drop)
model@data[[i]]@index <- c(model@data[[i]]@index, mdfs[[i]]@index)
model@data[[i]]@weights <- c(model@data[[i]]@weights, mdfs[[i]]@weights)
model@data[[i]]@priors <- c(model@data[[i]]@priors, mdfs[[i]]@priors)
}
object <- new("mi",
call = call,
data = model@data,
total_iters = as.integer(s_end))
return(object)
})
setMethod("mi", signature(y = "mdf_list", model = "missing"), def =
function (y, ...) {
out <- lapply(y, FUN = mi, ...)
class(out) <- "mi_list"
return(out)
})
setMethod("mi", signature(y = "list", model = "missing"), def =
function (y, ...) {
if(!all(sapply(y, is, class2 = "mi"))) {
stop("all elements of 'y' must be mi objects or missing_data.frame objects")
}
## FIXME: should probably check that all the mi objects are based on the same missing_data.frame
mdfs <- lapply(mi, FUN = function(x) return(x@data))
object <- new("mi",
call = y[[1]]@call,
data = mdfs,
total_iters = y[[1]]@total_iters)
return(object)
})
setMethod("mi", signature(y = "mdf_list", model = "missing"),
function (y, n.iter = 30, n.chains = 4, max.minutes = Inf, seed = NA, verbose = TRUE,
save_models = FALSE, parallel = .Platform$OS.type != "Windows")
{
out <- lapply(y, mi, n.iter = n.iter, n.chains = n.chains, max.minutes = max.minutes,
seed = seed, verbose = verbose, save_models = save_models, parallel = parallel)
class(out) <- "mi_list"
return(out)
})
setMethod("mi", signature(y = "mi_list", model = "missing"), def =
function (y, ...) {
out <- lapply(y, FUN = mi, ...)
class(out) <- "mi_list"
return(out)
})
setMethod("show", signature(object = "mi"), def =
function(object) {
cat("Object of class", class(object), "with", length(object@data), "chains, each with",
sum(object@total_iters), "iterations.\n")
cat("Each chain is the evolution of an object of", class(object@data[[1]]), "class with",
nrow(object@data[[1]]), "observations on", ncol(object@data[[1]]), "variables.\n")
return(invisible(NULL))
})
setMethod("show", signature(object = "mi_list"), def =
function(object) {
sapply(object, show)
return(invisible(NULL))
})
setMethod("summary", signature(object = "mi"), def =
function(object) {
mdf <- object@data[[1]]
matrices <- complete(object, to_matrix = TRUE, include_missing = FALSE)
chains <- length(matrices)
matrices <- array(unlist(matrices), dim = c(dim(mdf), chains), dimnames = c(dimnames(mdf), NULL))
out <- vector("list", ncol(mdf))
names(out) <- colnames(mdf)
for(i in seq_along(out)) {
if(mdf@no_missing[i]) {
if(is(mdf@variables[[i]], "categorical")) {
mat <- table(matrices[,i,1])
lev <- mdf@variables[[i]]@levels
if(length(lev) && length(dim(mat)) > 1) colnames(mat) <- lev
}
else mat <- summary(matrices[,i,1])
out[[i]] <- list(is_missing = "all values observed", observed = mat)
}
else if(is(mdf@variables[[i]], "categorical")) {
mark <- is.na(mdf@variables[[i]])
mat <- table(c(matrices[,i,]), rep(mark, times = chains))
lev <- mdf@variables[[i]]@levels
if(length(lev)) rownames(mat) <- lev
colnames(mat) <- c("observed", "imputed")
out[[i]] <- list(crosstab = mat)
}
else {
missing <- is.na(mdf@variables[[i]]@raw_data)
out[[i]] <- list(is_missing = table(missing), imputed = summary(c(matrices[missing,i,])),
observed = summary(c(matrices[!missing,i,])))
}
}
return(out)
})
setMethod("traceplot", signature(x = "mi"), def =
function(x, ...) {
traceplot(mi2BUGS, ...)
})
setMethod("traceplot", signature(x = "mi_list"), def =
function(x, ...) {
traceplot(lapply(x, mi2BUGS, ...))
})
## all the mi() methods below should return the missing_variable after imputing
## need to explicitly write out methods instead of doing poor man's S4
setMethod("mi", signature(y = "missing_variable", model = "ANY"), def =
function(y, model, ...) {
stop("This method should not have been called. You need to define the relevant mi() S4 method")
})
setMethod("mi", signature(y = "missing_variable", model = "missing"), def =
function(y) {
if(y@n_drawn == 0) stop("'impute' should not have been called because there are no missing data")
draws <- sample(y@data[y@which_obs], size = y@n_drawn, replace = TRUE)
y@data[y@which_drawn] <- draws
return(y)
})
setMethod("mi", signature(y = "semi-continuous", model = "missing"), def =
function(y) {
if(y@n_drawn == 0) stop("'impute' should not have been called because there are no missing data")
y@indicator <- mi(y@indicator)
draws <- sample(y@data[y@which_obs], size = y@n_drawn, replace = TRUE)
if(is(y, "SC_proportion")) {
n <- y@n_total
if(is(y@indicator, "binary")) {
mark <- which(complete(y@indicator, m = 0L)[y@which_miss] == 1)
if(any(y@raw_data == 0, na.rm = TRUE)) draws[mark] <- .5 / n
else draws[mark] <- (n - .5) / n
}
else {
mark <- which(complete(y@indicator, m = 0L)[y@which_miss] != 0)
draws[mark] <- (draws[mark] * (n - 1) + .5) / n
}
}
else if(is(y, "nonnegative-continuous")) {
mark <- which(y@indicator@data[y@which_miss] == 1)
if(length(mark)) draws[mark] <- y@transformation(rep(0, length(mark)))
}
else stop("FIXME: semi-continuous is not supported yet")
y@data[y@which_drawn] <- draws
return(y)
})
# setMethod("mi", signature(y = "semi-continuous", model = "missing"), def =
# function(y) {
# if(y@n_drawn == 0) stop("'impute' should not have been called because there are no missing data")
#
# categories <- 1:(ncol(y@indicator@dummies) + 1)
# draws <- sample(categories, size = y@n_drawn, replace = TRUE)
# dummies <- t(sapply(draws, FUN = function(x) x == categories))[,-1,drop = FALSE]
# y@indicator@dummies[y@which_drawn,] <- dummies
# y@indicator@data[y@which_drawn] <- draws
#
# draws <- sample(y@data[y@which_obs], size = y@n_drawn, replace = TRUE)
# if(is(y, "SC_proportion")) {
# n <- y@n_total
# if(is(y@indicator, "binary")) {
# mark <- which(complete(y@indicator, m = 0L)[y@which_miss] == 1)
# if(any(y@raw_data == 0, na.rm = TRUE)) draws[mark] <- .5 / n
# else draws[mark] <- (n - .5) / n
# }
# else {
# mark <- which(complete(y@indicator, m = 0L)[y@which_miss] != 0)
# draws[mark] <- (draws[mark] * (n - 1) + .5) / n
# }
# }
# else if(is(y, "nonnegative-continuous")) {
# mark <- which(y@indicator@data[y@which_miss] == 1)
# if(length(mark)) draws[mark] <- y@transformation(rep(0, length(mark)))
# }
#
# the_range <- range(y@data, na.rm = TRUE)
# free <- y@data[y@which_obs]
# free <- free[free != the_range[1] & free != the_range[2]]
# draws <- sample(free, size = y@n_drawn, replace = TRUE)
# y@data[y@which_drawn] <- draws
# return(y)
# })
setMethod("mi", signature(y = "bounded-continuous", model = "missing"), def =
function(y) {
if(y@n_drawn == 0) stop("'impute' should not have been called because there are no missing data")
a <- if(length(y@lower) == 1) y@lower else y@lower[y@which_drawn]
a <- ifelse(a == -Inf, min(y@data, na.rm = TRUE), a)
a <- ifelse(a == Inf, max(y@data, na.rm = TRUE), a)
b <- if(length(y@upper) == 1) y@upper else y@upper[y@which_drawn]
b <- ifelse(b == -Inf, min(y@data, na.rm = TRUE), b)
b <- ifelse(b == Inf, max(y@data, na.rm = TRUE), b)
draws <- runif(y@n_drawn, min = a, max = b)
y@data[y@which_drawn] <- draws
return(y)
})
setMethod("mi", signature(y = "categorical", model = "missing"), def =
function(y) {
if(y@n_drawn == 0) stop("'impute' should not have been called because there are no missing data")
draws <- sample(y@data[y@which_obs], size = y@n_drawn, replace = TRUE)
y@data[y@which_drawn] <- draws
return(y)
})
.draw_parameters <-
function(means, ev) {
if(any(ev$values <= 0)) return(means)
else return(means + (ev$vectors %*% (sqrt(ev$values) * rnorm(length(means))))[,1])
}
.pmm <-
function(y, eta, Sigma_inv = NULL, strata = NULL) {
if(is(y, "unordered-categorical")) {
if(is.null(Sigma_inv)) Sigma_inv <- .MPinverse(eta)
MD <- mahalanobis(eta, colMeans(eta), Sigma_inv, inverted=TRUE)
MD_observed <- MD[y@which_obs]
y_observed <- y@data[y@which_obs]
draws <- sapply(MD[y@which_drawn], FUN = function(x) {
mark <- which.min(abs(MD_observed - x))
drawmark <- c(y_observed[mark], mark)
return(drawmark)
})
}
else if(is(y, "grouped-binary")) {
draws <- sapply(y@which_drawn, FUN = function(i) {
which_same <- which(strata == strata[i])
candidates <- intersect(which_same, y@which_obs)
if(length(candidates) == 0) {
msg <- paste(y@variable_name, ": must have some observed values in each group")
stop(msg)
}
eta_can <- eta[candidates]
y_can <- y@data[candidates]
mark <- which.min(abs(eta_can - eta[i]))
drawmark <- c(y_can[mark], mark)
return(drawmark)
})
}
else {
eta_obs <- eta[y@which_obs]
y_obs <- y@data[y@which_obs]
draws <- sapply(eta[y@which_drawn], FUN = function(x) {
if(is.na(x)) return(NA_real_) # happens with semi-continuous
mark <- which.min(abs(eta_obs - x))
drawmark <- c(y_obs[mark], mark)
return(drawmark)
})
}
return(t(draws))
}
setOldClass("polr")
setMethod("mi", signature(y = "ordered-categorical", model = "polr"), def =
function(y, model, s, ...) {
if(y@n_drawn == 0) stop("'impute' should not have been called because there are no missing data")
if(!is.element(y@imputation_method, c("ppd", "pmm"))) badHessian <- FALSE
else if(is.null(model$Hessian)) badHessian <- FALSE
else if(!all(is.finite(model$Hessian))) badHessian <- TRUE
else {
means <- c(coef(model), model$zeta)
ev <- eigen(vcov(model), symmetric = TRUE)
badHessian <- any(ev$values <= 0)
parameters <- .draw_parameters(means, ev)
while(!badHessian &&
any(diff(parameters[-(1:ncol(model$x))]) <= 0)) { # rejection sampling on cutpoints
parameters <- .draw_parameters(means, ev)
}
}
if(badHessian && y@imputation_method == "ppd") {
warning(paste("predictive mean matching used for", y@variable_name, "on iteration", s,
"as a fallback due to Hessian error"))
old_method <- y@imputation_method
y@imputation_method <- "pmm"
y <- mi(y, model, s, ...)
y@imputation_method <- old_method
return(y)
}
else if(y@imputation_method == "ppd") {
eta <- as.vector(model$x[y@which_drawn,,drop=FALSE] %*% head(parameters, ncol(model$x)))
pfun <- switch(y@family$link, logit = plogis, probit = pnorm,
cloglog = function(q) exp(-exp(-q)), cauchit = pcauchy)
zeta <- parameters[-(1:ncol(model$x))]
draws <- sapply(eta, FUN = function(x) {
which(rmultinom(1, 1, diff(c(0,pfun(zeta - x),1))) == 1)
})
}
else if(y@imputation_method == "pmm") {
parameters <- c(coef(model), model$zeta)
eta <- model$x %*% parameters[1:ncol(model$x)]
pmm <- .pmm(y, eta)
draws <- pmm[,1]
y@fitted[y@which_drawn,] <- y@fitted[y@which_obs,][pmm[,2],]
}
else if(y@imputation_method == "median") {
predictions <- predict(model, type = "class")
draws <- rep(floor(median(predictions[y@which_obs])), y@n_drawn)
}
else if(y@imputation_method == "mode") draws <- predict(model, type = "class")[y@which_drawn]
else stop("'imputation_method' not recognized")
y@data[y@which_drawn] <- draws
y@imputations[s,] <- draws
return(y)
})
setOldClass("multinom")
setMethod("mi", signature(y = "unordered-categorical", model = "multinom"), def =
function(y, model, s, ...) {
if(y@n_drawn == 0) stop("'impute' should not have been called because there are no missing data")
ev <- eigen(vcov(model), symmetric = TRUE)
parameters <- .draw_parameters(t(coef(model)), ev)
if (ncol(model.matrix(model)) != nrow(parameters)) parameters <- t(parameters)
eta <- model.matrix(model) %*% parameters
if(y@imputation_method == "ppd") {
exp_eta <- matrix(pmin(.Machine$double.xmax / ncol(eta),
cbind(1, exp(eta[y@which_drawn,,drop = FALSE]))),
ncol = ncol(eta) + 1)
denom <- rowSums(exp_eta)
Pr <- exp_eta / denom
if (y@use_NA) {
Pr <- Pr[,-1]/rowSums(Pr[,-1])
badrows <- apply(is.na(Pr), 1, all)
if(any(badrows)) {
warning("Some rows of Pr are all 0 after dropping the missingness category")
Pr[badrows,] <- 1/(ncol(Pr) - 1)
}
}
draws <- apply(Pr, 1, FUN = function(p) which(rmultinom(1, 1, p) == 1))
}
else if(y@imputation_method == "pmm"){
pmm <- .pmm(y, eta)
draws <- pmm[,1]
y@fitted[y@which_drawn,,drop=FALSE] <- y@fitted[y@which_obs,,drop=FALSE][pmm[,2]]
}
else if(y@imputation_method == "mode") draws <- predict(model, type = "class")[y@which_drawn]
else stop("'imputation_method' not recognized")
y@data[y@which_drawn] <- draws
y@imputations[s,] <- draws
return(y)
})
setOldClass("RNL")
setMethod("mi", signature(y = "unordered-categorical", model = "RNL"), def =
function(y, model, s, ...) {
if(y@n_drawn == 0) stop("'impute' should not have been called because there are no missing data")
if(y@imputation_method == "ppd") { # imputating from the posterior predictive distribution
Pr <- sapply(model, FUN = function(m) {
ev <- eigen(vcov(m), symmetric = TRUE)
parameters <- .draw_parameters(coef(m), ev)
eta <- m$x[y@which_drawn,,drop=FALSE] %*% parameters
pred <- m$family$linkinv(eta)
return(pred)
})
if(y@use_NA) {
Pr <- Pr[,-1]/rowSums(Pr[,-1])
badrows <- apply(is.na(Pr), 1, all)
if(any(badrows)) {
warning("Some rows of Pr are all 0 after dropping the missingness category")
Pr[badrows,] <- 1/(ncol(Pr) - 1)
}
}
draws <- apply(Pr, 1, FUN = function(p) which(rmultinom(1, 1, p) == 1))
}
else if(y@imputation_method == "pmm") {
eta <- sapply(model, FUN = function(m) {
ev <- eigen(vcov(m), symmetric = TRUE)
parameters <- .draw_parameters(coef(m), ev)
eta <- m$x %*% parameters
return(eta)
})
pmm <- .pmm(y, eta)
draws <- pmm[,1]
y@fitted[y@which_drawn,,drop=FALSE] <- y@fitted[y@which_obs,,drop=FALSE][pmm[,2]]
}
else stop("only ppd and pmm are supported imputation methods in the RNL case")
y@data[y@which_drawn] <- draws
y@imputations[s,] <- draws
return(y)
})
setOldClass("glm")
setMethod("mi", signature(y = "binary", model = "glm"), def =
function(y, model, s, ...) {
if(y@n_drawn == 0) stop("'impute' should not have been called because there are no missing data")
if(y@imputation_method == "ppd") {
ev <- eigen(vcov(model), symmetric = TRUE)
parameters <- .draw_parameters(coef(model), ev)
eta <- model$x[y@which_drawn,,drop=FALSE] %*% parameters
pred <- model$family$linkinv(eta)
draws <- rbinom(y@n_drawn, 1, pred) + 1L
}
else if(y@imputation_method == "pmm") {
ev <- eigen(vcov(model), symmetric = TRUE)
parameters <- .draw_parameters(coef(model), ev)
eta <- model$x %*% parameters
pmm <- .pmm(y, eta)
draws <- pmm[,1]
y@fitted[y@which_drawn] <- y@fitted[y@which_obs][pmm[,2]]
}
else if(y@imputation_method == "median") {
predictions <- predict(model, type = "class")
draws <- rep(floor(median(predictions[y@which_obs])), y@n_drawn)
}
else if(y@imputation_method == "mode") draws <- predict(model, type = "class")[y@which_drawn]
else if(y@imputation_method == "mean") stop("'mean' is not a supported 'imputation_method' for binary variables")
else if(y@imputation_method == "expectation") stop("'expectation' is not a supported 'imputation_method' for binary variables")
else stop("'imputation_method' not recognized")
draws <- as.integer(draws)
y@data[y@which_drawn] <- draws
y@imputations[s,] <- draws
return(y)
})
setOldClass("clogit")
setMethod("mi", signature(y = "grouped-binary", model = "clogit"), def =
function(y, model, s, ...) {
if(y@n_drawn == 0) stop("'impute' should not have been called because there are no missing data")
# reconstruc the strata
Terms <- model$terms
temp <- survival::untangle.specials(Terms, "strata")
mf <- model.frame(model)
strata <- strata(mf[, temp$vars], shortlabel = TRUE)
if(y@imputation_method == "pmm") {
ev <- eigen(vcov(model), symmetric = TRUE)
parameters <- .draw_parameters(coef(model), ev)
eta <- model$x %*% parameters
draws <- .pmm(y, eta, strata = strata)[,1]
#FIXME: haven't adjusted fitted values
}
else stop("only 'pmm' is supported for 'grouped-binary' variables")
draws <- as.integer(draws)
y@data[y@which_drawn] <- draws
y@imputations[s,] <- draws
return(y)
})
setMethod("mi", signature(y = "interval", model = "glm"), def =
function(y, model, s, ...) {
stop("FIXME: write this method")
if(y@n_drawn == 0) stop("'impute' should not have been called because there are no missing data")
if(y@imputation_method == "ppd") {
stop("FIXME")
}
else stop("only ppd is supported as an imputation method for interval variables")
return(y)
})
setMethod("mi", signature(y = "categorical", model = "matrix"), def =
function(y, model, s, ...) {
if(y@n_drawn == 0) stop("'impute' should not have been called because there are no missing data")
if(y@imputation_method != "ppd") stop("only ppd is supported in this case")
if(nrow(model) != y@n_drawn) stop("matrix of probabilities has the wrong number of rows")
draws <- apply(model, 1, FUN = function(p) which(rmultinom(1, 1, p) == 1))
y@data[y@which_drawn] <- draws
y@imputations[s,] <- draws
return(y)
})
## helper function
.mi_continuous <-
function(y, model) {
if(y@n_drawn == 0) stop("'impute' should not have been called because there are no missing data")
if(y@imputation_method == "ppd") {
ev <- eigen(vcov(model), symmetric = TRUE)
parameters <- .draw_parameters(coef(model), ev)
if(model$family$family == "gaussian") {
eta <- model$x[y@which_drawn,,drop=FALSE] %*% parameters
pred <- model$family$linkinv(eta)
if(is(y, "bounded-continuous")) {
a <- if(length(y@lower) > 1) y@lower[y@which_drawn] else y@lower
b <- if(length(y@upper) > 1) y@upper[y@which_drawn] else y@upper
draws <- truncnorm::rtruncnorm(y@n_drawn, mean = pred,
sd = sqrt(model$dispersion), a = a, b = b)
}
else draws <- rnorm(y@n_drawn, pred, sqrt(model$dispersion))
}
else {
eta <- model$x %*% parameters
model$fitted <- model$family$linkinv(eta)
# model$dispersion <- parameters@sigma^2
draws <- y@family$sim(model, nsim = 1)[y@which_drawn]
}
}
else if(y@imputation_method == "pmm") {
ev <- eigen(vcov(model), symmetric = TRUE)
parameters <- .draw_parameters(coef(model), ev)
if(is(y, "semi-continuous")) {
eta <- rep(NA_real_, y@n_total)
mark <- complete(y@indicator, 0L) == 0
eta[mark] <- model$x[mark,] %*% parameters
}
else eta <- model$x %*% parameters
draws <- .pmm(y, eta)[,1]
#FIXME: haven't adjusted fitted values using pmm for continuous
}
else if(y@imputation_method == "mean") {
eta <- predict(model, type = "response")
eta_observed <- eta[y@which_obs]
eta_mean <- mean(eta_observed)
draws <- rep(eta_mean, y@n_drawn)
}
else if(y@imputation_method == "median") {
eta <- predict(model, type = "response")
eta_observed <- eta[y@which_obs]
eta_median <- median(eta_observed)
draws <- rep(eta_median, y@n_drawn)
}
else if(y@imputation_method == "expectation") draws <- predict(model, type = "response")[y@which_drawn]
else stop("'imputation_method' not recognized")
return(draws)
}
setMethod("mi", signature(y = "continuous", model = "glm"), def =
function(y, model, s, ...) {
draws <- .mi_continuous(y, model)
y@data[y@which_drawn] <- draws
y@imputations[s,] <- draws
return(y)
})
# setMethod("mi", signature(y = "censored-continuous", model = "glm"), def =
# function(y, model, s, ...) {
# not_obs <- c(y@which_drawn, y@which_censored)
# if(y@imputation_method == "ppd") {
# parameters <- arm::sim(model, 1)
# eta <- model$x[not_obs,,drop=FALSE] %*% parameters@coef[1,]
# pred <- model$family$linkinv(eta)
# draws <- rnorm(y@n_drawn, pred, parameters@sigma)
# }
# else if(y@imputation_method == "pmm") {
# eta <- predict(model, type = "link")
# eta_observed <- eta[y@which_obs]
# y_observed <- y@data[y@which_obs]
# draws <- sapply(eta[nob_obs], FUN = function(x) {
# mark <- which.min(abs(eta_observed - x))
# return(y_observed[mark])
# })
# }
# else if(y@imputation_method == "mean") {
# eta <- predict(model, type = "response")
# eta_observed <- eta[y@which_obs]
# eta_mean <- mean(eta_observed)
# draws <- rep(eta_mean, length(not_obs))
# }
# else if(y@imputation_method == "median") {
# eta <- predict(model, type = "response")
# eta_observed <- eta[y@which_obs]
# eta_median <- median(eta_observed)
# draws <- rep(floor(eta_median), length(not_obs))
# }
# else if(y@imputation_method == "expectation") draws <- predict(model, type = "response")[not_obs]
# else stop("'imputation_method' not recognized")
#
# y@data[not_obs] <- draws
# y@imputations[s,] <- draws
# return(y)
# })
setMethod("mi", signature(y = "semi-continuous", model = "glm"), def =
function(y, model, s, ...) {
stop("the semi-continuous mi() method should not have been called")
})
setMethod("mi", signature(y = "nonnegative-continuous", model = "glm"), def =
function(y, model, s, ...) {
draws <- .mi_continuous(y, model)
# now account for the fact that some draws were determined to be 0 in step 1
mark <- which(complete(y@indicator, 0L)[y@which_miss] == 1)
if(length(mark)) draws[mark] <- y@transformation(rep(0, length(mark)))
y@data[y@which_drawn] <- draws
y@imputations[s,] <- draws
return(y)
})
## helper function
.mi_proportion <-
function(y, model) {
if(y@n_drawn == 0) stop("'impute' should not have been called because there are no missing data")
if(!is.element(y@imputation_method, c("ppd", "pmm"))) badHessian <- FALSE
else if(is.null(model$vcov)) badHessian <- FALSE
else if(!all(is.finite(model$vcov))) badHessian <- TRUE
else {
ev <- eigen(vcov(model), TRUE)
badHessian <- any(ev$values <= 0)
means <- coef(model)
parameters <- .draw_parameters(means, ev)
# while(!badHessian && parameters[length(parameters)] <= 0) {
# parameters <- .draw_parameters(means, ev)
# }
}
if(badHessian && y@imputation_method == "ppd") {
warning(paste("predictive mean matching used for", y@variable_name,
"as a fallback due to Hessian error"))
old_method <- y@imputation_method
y@imputation_method <- "pmm"
y <- mi(y, model)
return(y@data[y@which_miss])
}
else if(y@imputation_method == "ppd") {
eta <- model$x[y@which_drawn,,drop=FALSE] %*% parameters[1:NCOL(model$x)]
mu <- model$link$mean$linkinv(eta)
phi <- model$link$precision$linkinv(parameters[length(parameters)]) ## FIXME: in the parameterized case
shape1 <- mu * phi
shape2 <- phi - shape1
draws <- rbeta(y@n_drawn, shape1, shape2)
}
else if(y@imputation_method == "pmm") {
eta <- model$x %*% parameters[-length(parameters)]
draws <- .pmm(y, eta)[,1] #FIXME: haven't adjusted fitted values for pmm
}
else if(y@imputation_method == "mean") {
mu <- predict(model)
mu_observed <- mu[y@which_obs]
mu_mean <- mean(mu_observed)
draws <- rep(mu_mean, y@n_drawn)
}
else if(y@imputation_method == "median") {
mu <- predict(model)
mu_observed <- mu[y@which_obs]
mu_median <- median(mu_observed)
draws <- rep(mu_median, y@n_drawn)
}
else if(y@imputation_method == "expectation") draws <- predict(model)[y@which_drawn]
else stop("'imputation_method' not recognized")
return(draws)
}
setOldClass("betareg")
setMethod("mi", signature(y = "proportion", model = "betareg"), def =
function(y, model, s, ...) {
draws <- .mi_proportion(y, model)
y@data[y@which_drawn] <- draws
y@imputations[s,] <- draws
return(y)
})
setMethod("mi", signature(y = "proportion", model = "glm"), def =
function(y, model, s, ...) {
draws <- .mi_continuous(y, model)
y@data[y@which_drawn] <- draws
y@imputations[s,] <- draws
return(y)
})
setMethod("mi", signature(y = "SC_proportion", model = "betareg"), def =
function(y, model, s, ...) {
draws <- .mi_proportion(y, model)
n <- y@n_total
if(is(y@indicator, "binary")) {
mark <- which(complete(y@indicator, 0L)[y@which_miss] == 1)
if(any(y@raw_data == 0, na.rm = TRUE)) draws[mark] <- .5 / n
else draws[mark] <- (n - .5) / n
}
else {
signs <- complete(y@indicator, 0L)[y@which_drawn]
draws[signs < 0] <- .5 / n
draws[signs > 0] <- (n - .5) / n
}
y@data[y@which_drawn] <- draws
y@imputations[s,] <- draws
return(y)
})
## draw from overdispersed Poisson distribution
.rpois.od <- function(n, lambda, dispersion = 1) {
if (dispersion <= 1) ans <- rpois(n, lambda)
else {
B <- 1/(dispersion-1)
A <- lambda * B
ans <- rnbinom(n, size= A , mu = lambda)
}
return(ans)
}
setMethod("mi", signature(y = "count", model = "glm"), def =
function(y, model, s, ...) {
if(y@n_drawn == 0) stop("'impute' should not have been called because there are no missing data")
if(y@imputation_method == "ppd") {
ev <- eigen(vcov(model), symmetric = TRUE)
parameters <- .draw_parameters(coef(model), ev)
eta <- model$x[y@which_drawn,,drop=FALSE] %*% parameters
pred <- model$family$linkinv(eta)
draws <- .rpois.od(y@n_drawn, pred, model$dispersion)
}
else if(y@imputation_method == "pmm") {
ev <- eigen(vcov(model), symmetric = TRUE)
parameters <- .draw_parameters(coef(model), ev)
eta <- model$x %*% parameters
draws <- .pmm(y, eta)[,1] #FIXME: haven't adjusted fitted values for pmm
}
else if(y@imputation_method == "mean") {
eta <- predict(model, type = "response")
eta_observed <- eta[y@which_obs]
eta_mean <- mean(eta_observed)
draws <- rep(round(eta_mean), y@n_drawn)
}
else if(y@imputation_method == "median") {
eta <- predict(model, type = "response")
eta_observed <- eta[y@which_obs]
eta_median <- median(eta_observed)
draws <- rep(floor(eta_median), y@n_drawn)
}
else if(y@imputation_method == "expectation") draws <- round(predict(model, type = "response")[y@which_drawn])
else stop("'imputation_method' not recognized")
draws <- as.integer(draws)
y@data[y@which_drawn] <- draws
y@imputations[s,] <- draws
return(y)
})
setMethod("mi", signature(y = "irrelevant", model = "ANY"), def =
function(y, model, ...) {
stop("The mi() method should not have been called on an 'irrelevant' variable")
})
## FIXME: account for the other stuff at the bottom of the original mi.R file
mi/R/misc.R 0000644 0001762 0000144 00000027344 12513637413 012155 0 ustar ligges users # Part of the mi package for multiple imputation of missing data
# Copyright (C) 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Trustees of Columbia University
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
## like sapply but for objects of mi class
mipply <- ## FIXME: should probably be a generic function instead of poor man's S4
function(X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE, columnwise = TRUE, to.matrix = FALSE) {
if(is(X, "mi_list")) {
out <- lapply(X, mipply, ..., simplify = simplify, USE.NAMES = USE.NAMES,
columnwise = columnwise, to.matrix = to.matrix)
}
else if(is(X, "mi")) {
X <- complete(X, to_matrix = to.matrix)
if(columnwise) out <- sapply(X, FUN = function(x) apply(x, 2, FUN, ...),
simplify = simplify, USE.NAMES = USE.NAMES)
else out <- sapply(X, FUN, ..., simplify = simplify, USE.NAMES = USE.NAMES)
}
else if(is(X, "mdf_list")) {
out <- lapply(X, mipply, ..., simplify = simplify, USE.NAMES = USE.NAMES,
columnwise = columnwise, to.matrix = to.matrix)
}
else if(is(X, "missing_data.frame")) {
if(columnwise) out <- sapply(X, FUN = function(x) apply(x, 2, FUN, ...),
simplify = simplify, USE.NAMES = USE.NAMES)
else out <- sapply(X, FUN, ..., simplify = simplify, USE.NAMES = USE.NAMES)
}
else if(is(X, "missing_variable")) {
out <- FUN(X@data, ...)
}
else if(is(X, "mi_list")) {
out <- lapply(X, FUN = mipply, ..., simplify = simplify, USE.NAMES = USE.NAMES,
columnwise = columnwise, to.matrix = to.matrix)
}
else stop("'X' must be of class 'mi', 'missing_data.frame', 'missing_variable', or 'mi_list'")
return(out)
}
## create a bugs array from an mi object
mi2BUGS <-
function(imputations, statistic = c("moments", "imputations", "parameters")) {
if(is(imputations, "mi_list")) return(lapply(imputations, FUN = mi2BUGS, statistic = statistic))
else if(!is(imputations, "mi")) stop("imputations must be an object of class 'mi' or 'mi_list'")
statistic <- match.arg(statistic)
if(statistic == "moments") {
iterations <- sum(imputations@total_iters)
mark <- !imputations@data[[1]]@no_missing &
!sapply(imputations@data[[1]]@variables, is, class2 = "irrelevant")
means <- lapply(1:iterations, FUN = function(m) {
matrices <- lapply(imputations@data, FUN = complete, m = m, to_matrix = TRUE, include_missing = FALSE)
out <- sapply(matrices, colMeans)[mark,,drop = FALSE]
return(out)
})
sds <- lapply(1:iterations, FUN = function(m) {
matrices <- lapply(imputations@data, FUN = complete, m = m, to_matrix = TRUE, include_missing = FALSE)
out <- sapply(matrices, FUN = function(x) apply(x, 2, sd))[mark,,drop = FALSE]
return(out)
})
dims <- dim(means[[1]])
arr <- array(NA_real_, c(iterations, dims[2], 2 * dims[1]),
list(NULL, NULL, c(paste("mean", rownames(means[[1]]), sep = "_"),
paste("sd", rownames(means[[1]]), sep = "_"))))
for(i in seq_along(means)) for(j in 1:ncol(arr)) {
arr[i,j, 1:dims[1]] <- means[[i]][,j]
arr[i,j,-c(1:dims[1])] <- sds[[i]][,j]
}
}
else if(statistic == "imputations") {
imp_list <- lapply(imputations@data, function(x) lapply(x@variables, function(y) y@imputations))
n.parameters <- rapply(imp_list, ncol)
arr <- array(NA_real_, c(sum(imputations@total_iters), length(imp_list), n.parameters)) ## FIXME: names?
for(i in seq_along(imp_list)) arr[,i,] <- unlist(imp_list[[i]])
}
else arr <- get_parameters(imputations)
return(arr) # compatible with R2WinBUGS
}
##Outputs completed data in either Stata (.dta) format or comma-separated (.csv) format
mi2stata <-
function(imputations, m, file, missing.ind=FALSE, ...) {
if(grepl("\\.csv$", file)) type <- "csv"
if(grepl("\\.dta$", file)) type <- "dta"
else if(!is(imputations, "mi")) stop("imputations must be an object of class 'mi'")
else if(!is(file, "character")) stop("filename must be specified as a character object")
else if(type!="dta" & type!="csv") stop("file type must be 'dta' for stata format or 'csv' for comma-separated format")
message("Note: after loading the data into Stata, version 11 or later, type 'mi import ice' to register the data as being multiply imputed.
For Stata 10 and earlier, install MIM by typing 'findit mim' and include 'mim:' as a prefix for any command using the MI data.")
unpos <- sum(sapply(imputations@data[[1]]@variables, FUN=function(x){x@n_unpossible}))
if (unpos>0 & !missing.ind) {
missing.ind <- TRUE
warning("There are legitimately skipped values in the data that were not imputed. Including variables to indicate which missing values
were imputed. Values which are still missing but are not indicated are legitimate skips.")
}
if (unpos>0 & missing.ind) {
warning("There are legitimately skipped values in the data that were not imputed. Values which are still missing but are not indicated are legitimate skips.")
}
data.list <- complete(imputations, m)
if (missing.ind) miss.indic <- data.list[[1]][,which(!is.element(colnames(data.list[[1]]), names(imputations@data[[1]]@variables)))]
vars <- which(is.element(colnames(data.list[[1]]), names(imputations@data[[1]]@variables)))
stata.data <- data.list[[1]][,vars]
stata.miss <- sapply(imputations@data[[1]]@variables, FUN=function(x){
v <- is.element(1:x@n_total, x@which_drawn)
return(v)
}, simplify=TRUE)
is.na(stata.data) <- stata.miss
if (missing.ind) stata.data <- cbind(stata.data, miss.indic)
stata.data$mi <- 1:nrow(stata.data); stata.data$mj <- 0
for(i in seq_along(data.list)){
dl <- data.list[[i]]
if(!missing.ind) dl <- dl[,vars]
dl$mi <- 1:nrow(dl)
dl$mj <- i
stata.data <- rbind(stata.data, dl)
}
colnames(stata.data)[which(colnames(stata.data)=="mi")] <- "_mi"
colnames(stata.data)[which(colnames(stata.data)=="mj")] <- "_mj"
if(type=="dta") foreign::write.dta(stata.data, file=file, version = 7L, ...)
else if(type=="csv") write.table(stata.data, file=file, sep=",", col.names=TRUE, row.names=FALSE)
}
## Returns the Gelman statistic
Rhats <-
function(imputations, statistic = c("moments", "imputations", "parameters")) {
BUGS <- mi2BUGS(imputations, statistic)
make_Rhat <- function(x) {
m <- ncol(x)
if(m < 2) stop("need at least 2 chains to calculate an R-hat")
iter <- nrow(x)
xbars <- colMeans(x)
variances <- apply(x, MARGIN = 2:3, FUN = sd)^2
W <- colMeans(variances)
B <- iter * apply(xbars, MARGIN = 2, FUN = var)
R <- sqrt( (iter - 1) / iter + 1 / iter * B / W )
return(R)
}
if(is(imputations, "mi")) return(make_Rhat(BUGS))
else return(sapply(BUGS, FUN = make_Rhat))
}
## tests whether a method is the one defined in my (as opposed to a user-defined method in .GlobalEnv)
is.method_in_mi <-
function(generic, ...) {
method <- selectMethod(generic, signature(...))
return(environmentName(environment(method@.Data)) == "mi")
}
## cube root transformation
.cuberoot <-
function(y, inverse = FALSE) {
if(inverse) y^3
else y^(1/3)
}
.parse_trans <-
function(trans) {
if(identical(names(formals(trans)), c("y", "mean", "sd", "inverse"))) return("standardize")
if(identical(names(formals(trans)), c("y", "a", "inverse"))) return("logshift")
if(identical(body(trans), body(.squeeze_transform))) return("squeeze")
if(identical(body(trans), body(.identity_transform))) return("identity")
if(identical(body(trans), body(log))) return("log")
if(identical(body(trans), body(sqrt))) return("sqrt")
if(identical(body(trans), body(.cuberoot))) return("cuberoot")
if(identical(body(trans), body(qnorm))) return("qnorm")
return("user-defined")
}
.prune <- function(class) {
classes <- names(getClass(class, where = "mi")@subclasses)
classes <- classes[!sapply(classes, isVirtualClass, where = "mi")]
if(!isVirtualClass(class, where = "mi")) classes <- c(class, classes)
return(classes)
}
.possible_missing_variable <-
function(y) { ## FIXME: update this function whenever you tweak the missing_variable tree
mvs <- .prune("missing_variable")
maybe <- rep(TRUE, length(mvs))
names(maybe) <- mvs
if(is.factor(y)) y <- factor(y) # to drop unused levels
vals <- unique(y)
vals <- sort(vals[!is.na(vals)])
if(length(vals) == 1) {
maybe[] <- FALSE
maybe["irrelevant"] <- TRUE
maybe[.prune("fixed")] <- TRUE
return(maybe)
}
else maybe[.prune("fixed")] <- FALSE
if(!all(table(y) > 1)) maybe[.prune("categorical")] <- FALSE
if(length(vals) == 2) { # permit binary plus children but not other kinds of categorical
maybe[.prune("categorical")] <- FALSE
maybe[.prune("binary")] <- TRUE
maybe[.prune("semi-continuous")] <- FALSE
}
else {
maybe[.prune("binary")] <- FALSE
}
if(!is.numeric(vals)) {
maybe[.prune("continuous")] <- FALSE
maybe[.prune("count")] <- FALSE
return(maybe)
}
if(any(vals < 0)) {
maybe[.prune("nonnegative-continuous")] <- FALSE
maybe[.prune("positive-continuous")] <- FALSE
maybe[.prune("count")] <- FALSE
return(maybe)
}
if(any(vals == 0)) maybe[.prune("positive-continuous")] <- FALSE
else maybe[.prune("nonnegative-continuous")] <- FALSE # unless SC_proportion
if(!any(vals < 1 && vals > 0)) {
maybe[.prune("SC_proportion")] <- FALSE
maybe[.prune("proportion")] <- FALSE
}
else if(any(vals >= 1)) {
maybe[.prune("proportion")] <- FALSE
if(any(vals > 1)) maybe[.prune("SC_proportion")] <- FALSE
else maybe[.prune("SC_proportion")] <- TRUE
}
if(any(vals != as.integer(vals))) {
maybe[.prune("count")] <- FALSE
maybe[.prune("categorical")] <- FALSE
}
return(maybe)
}
.cat2dummies <-
function(y) {
if(!is(y, "categorical")) stop("must be a categorical variable")
if(is(y, "binary")) out <- as.matrix(as.integer(y@data == 1))
else {
levels <- sort(unique(y@data))
out <- t(sapply(y@data, FUN = function(x) as.integer(x == levels)[-1]))
}
return(out)
}
setMethod("fitted", signature(object = "RNL"), def =
function(object, ...) {
Pr <- sapply(object, FUN = function(m) {
eta <- m$x %*% coef(m)
pred <- m$family$linkinv(eta)
return(pred)
})
Pr <- Pr / rowSums(Pr)
return(Pr)
})
setMethod("fitted", signature(object = "clogit"), def =
function(object, ...) {
target <- mean(as.numeric(object$y))
lp <- object$linear.predictors
foo <- function(par) {
intercept <- qlogis(par)
mean(plogis(intercept + lp)) - target
}
opt <- uniroot(foo, lower = 0, upper = 1)
return(plogis(qlogis(opt$root) + lp))
})
# Borrowed from library(MCMCpack)
.rdirichlet <-
function(n, alpha) {
l <- length(alpha)
x <- matrix(rgamma(l * n, alpha), ncol = l, byrow = TRUE)
sm <- rowSums(x)
return(x / sm)
}
mi/R/change_transformation.R 0000644 0001762 0000144 00000016301 12513634171 015562 0 ustar ligges users # Part of the mi package for multiple imputation of missing data
# Copyright (C) 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Trustees of Columbia University
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
## these change the transformation and inverse_transformation slots of a continuous variable
setMethod("change_transformation", signature(data = "missing", y = "missing_variable", to = "function"), def =
function(y, to, inverse = FALSE) {
if(!is(y, "continuous")) stop(paste(y@variable_name, "is not a continuous variable and hence has no transformation"))
else if(is(y, "SC_proportion")) stop(paste(y@variable_name, "is a SC_proportion and cannot change its transformation (yet)"))
if(inverse) {
if(identical(to, .standardize_transform)) {
formals(to)$mean <- mean(y@raw_data, na.rm = TRUE)
formals(to)$sd <- sd(y@raw_data, na.rm = TRUE)
}
else if(identical(to, .logshift)) {
yy <- y@raw_data
if(any(yy < 0, na.rm = TRUE)) a <- - min(yy, na.rm = TRUE)
else a <- 0
a <- (a + min(yy[yy > 0], na.rm = TRUE)) / 2
formals(to)$a <- a
}
if("inverse" %in% names(formals(to))) formals(to)$inverse <- TRUE
y@inverse_transformation <- to
}
else {
if(identical(to, .standardize_transform)) {
formals(to)$mean <- mean(y@raw_data, na.rm = TRUE)
formals(to)$sd <- sd(y@raw_data, na.rm = TRUE)
}
else if(identical(to, .logshift)) {
yy <- y@raw_data
if(any(yy < 0, na.rm = TRUE)) a <- - min(yy, na.rm = TRUE)
else a <- 0
a <- (a + min(yy[yy > 0], na.rm = TRUE)) / 2
formals(to)$a <- a
}
y@transformation <- to
y@data <- y@transformation(y@raw_data)
}
return(y)
})
setMethod("change_transformation", signature(data = "missing", y = "missing_variable", to = "missing"), def =
function(y) {
if(is(y, "continuous")) cat("Likely choices include:", y@known_transformations, sep = "\n")
else cat("No transformation possible for non-continuous variables\n")
return(invisible(NULL))
})
setMethod("change_transformation", signature(data = "missing_data.frame", y = "character", to = "missing"), def =
function(data, y) {
if(all(y %in% c("continuous", names(getClass("continuous")@subclasses)))) {
classes <- sapply(data@variables, class)
y <- c(sapply(y, FUN = function(x) {
names(classes[which(classes == x)])
}))
if(is.list(y)) stop(paste("no variables of class", names(y)[1]))
else y <- y[1]
}
y <- match.arg(y, data@DIMNAMES[[2]], several.ok = TRUE)
for(i in 1:length(y)) change_transformation(y = data@variables[[y[i]]])
return(data)
})
setMethod("change_transformation", signature(data = "missing_data.frame", y = "character", to = "character"), def =
function(data, y, to) {
if(length(to) == 1) to <- rep(to, length(y))
else if(length(to) != length(y)) stop("'y' and 'to' must have the same length")
if(all(y %in% c("continuous", names(getClass("continuous")@subclasses)))) {
classes <- sapply(data@variables, class)
y <- c(sapply(y, FUN = function(x) {
names(classes[which(classes == x)])
}))
to <- rep(to[1], length(y))
}
y <- match.arg(y, data@DIMNAMES[[2]], several.ok = TRUE)
trans <- lapply(to, FUN = function(x) {
switch(x,
"identity" = .identity_transform,
"standardize" = .standardize_transform,
"squeeze" = .squeeze_transform,
"logshift" = .logshift,
"log" = log,
"sqrt" = sqrt,
"cuberoot" = .cuberoot,
function(...) stop(paste("must replace the transformation slot for", x)))
})
inverse <- lapply(to, FUN = function(x) {
switch(x,
"identity" = .identity_transform,
"standardize" = .standardize_transform,
"squeeze" = .squeeze_transform,
"logshift" = .logshift,
"log" = exp,
"sqrt" = function(y, ...) y^2,
"cuberoot" = .cuberoot,
function(...) stop(paste("must replace the inverse_transformation slot for", x)))
})
for(i in 1:length(y)) {
data@variables[[y[i]]] <- change_transformation(y = data@variables[[y[i]]], to = trans[[i]])
data@variables[[y[i]]] <- change_transformation(y = data@variables[[y[i]]], to = inverse[[i]], inverse = TRUE)
mark <- data@index[[y[i]]][1]
data@X[,mark] <- data@variables[[y[i]]]@data
}
# initialize(data)
return(data)
})
setMethod("change_transformation", signature(data = "missing_data.frame", y = "numeric", to = "character"), def =
function(data, y, to) {
return(change_transformation(data = data, y = colnames(data)[y], to = to))
})
setMethod("change_transformation", signature(data = "missing_data.frame", y = "logical", to = "character"), def =
function(data, y, to) {
if(length(y) != data@DIM[2]) {
stop("the length of 'y' must equal the number of variables in 'data'")
}
return(change_transformation(data = data, y = names(data@variables)[y], to = to))
})
setMethod("change_transformation", signature(data = "missing_data.frame", y = "character", to = "function"), def =
function(data, y, to, inverse = stop("you must specify 'inverse = FALSE' or 'inverse = TRUE'")) {
if(all(y %in% c("continuous", names(getClass("continuous")@subclasses)))) {
classes <- sapply(data@variables, class)
y <- c(sapply(y, FUN = function(x) {
names(classes[which(classes == x)])
}))
}
y <- match.arg(y, data@DIMNAMES[[2]], several.ok = TRUE)
for(i in 1:length(y)) {
if(inverse) data@variables[[y[i]]] <- change_transformation(y = data@variables[[y[i]]], to = to, inverse = TRUE)
else data@variables[[y[i]]] <- change_transformation(y = data@variables[[y[i]]], to = to, inverse = FALSE)
mark <- data@index[[y[i]]][1]
data@X[,mark] <- data@variables[[y[i]]]@data
}
return(data)
})
setMethod("change_transformation", signature(data = "missing_data.frame", y = "numeric", to = "function"), def =
function(data, y, to, inverse) {
y <- names(data@variables)[y]
return(change_transformation(data = data, y = y, to = to, inverse))
})
setMethod("change_transformation", signature(data = "missing_data.frame", y = "logical", to = "function"), def =
function(data, y, to, inverse) {
if(length(y) != data@DIM[2]) {
stop("the length of 'y' must equal the number of variables in 'data'")
}
return(change_transformation(data = data, y = names(data@variables)[y], to = to, inverse = inverse))
})
mi/R/change_model.R 0000644 0001762 0000144 00000015323 12513634171 013617 0 ustar ligges users # Part of the mi package for multiple imputation of missing data
# Copyright (C) 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Trustees of Columbia University
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
## these are convience functions that implicitly change something else by changing the model buzzword
setMethod("change_model", signature(data = "missing", y = "missing_variable", to = "character"), def =
function(y, to) {
switch(to,
"logit" = new("binary", variable_name = y@variable_name, raw_data = y@raw_data,
imputation_method = y@imputation_method, family = binomial(link = "logit")),
"probit" = new("binary", variable_name = y@variable_name, raw_data = y@raw_data,
imputation_method = y@imputation_method, family = binomial(link = "probit")),
"cauchit" = new("binary", variable_name = y@variable_name, raw_data = y@raw_data,
imputation_method = y@imputation_method, family = binomial(link = "cauchit")),
"cloglog" = new("binary", variable_name = y@variable_name, raw_data = y@raw_data,
imputation_method = y@imputation_method, family = binomial(link = "cloglog")),
"qlogit" = new("binary", variable_name = y@variable_name, raw_data = y@raw_data,
imputation_method = y@imputation_method, family = quasibinomial(link = "logit")),
"qprobit" = new("binary", variable_name = y@variable_name, raw_data = y@raw_data,
imputation_method = y@imputation_method, family = quasibinomial(link = "probit")),
"qcauchit" = new("binary", variable_name = y@variable_name, raw_data = y@raw_data,
imputation_method = y@imputation_method, family = quasibinomial(link = "cauchit")),
"qcloglog" = new("binary", variable_name = y@variable_name, raw_data = y@raw_data,
imputation_method = y@imputation_method, family = quasibinomial(link = "cloglog")),
"ologit" = new("ordered-categorical", variable_name = y@variable_name, raw_data = y@raw_data,
imputation_method = y@imputation_method, family = multinomial(link = "logit")),
"oprobit" = new("ordered-categorical", variable_name = y@variable_name, raw_data = y@raw_data,
imputation_method = y@imputation_method, family = multinomial(link = "probit")),
"ocauchit" = new("ordered-categorical", variable_name = y@variable_name, raw_data = y@raw_data,
imputation_method = y@imputation_method, family = multinomial(link = "cauchit")),
"ocloglog" = new("ordered-categorical", variable_name = y@variable_name, raw_data = y@raw_data,
imputation_method = y@imputation_method, family = multinomial(link = "cloglog")),
"mlogit" = new("unordered-categorical", variable_name = y@variable_name, raw_data = y@raw_data,
imputation_method = y@imputation_method, family = multinomial(link = "logit")),
"RNL" = new("unordered-categorical", variable_name = y@variable_name, raw_data = y@raw_data,
imputation_method = y@imputation_method, family = binomial(link = "logit")),
"qpoisson" = new("count", variable_name = y@variable_name, raw_data = y@raw_data,
imputation_method = y@imputation_method, family = quasipoisson(link = "log")),
"poisson" = new("count", variable_name = y@variable_name, raw_data = y@raw_data,
imputation_method = y@imputation_method, family = poisson(link = "log")),
"linear" = new("continuous", variable_name = y@variable_name, raw_data = y@raw_data,
imputation_method = y@imputation_method, family = gaussian(link = "identity")),
stop("model not recognized")
)
})
setMethod("change_model", signature(data = "missing_data.frame", y = "character", to = "character"), def =
function(data, y, to) {
if(length(to) == 1) to <- rep(to, length(y))
else if(length(to) != length(y)) stop("'y' and 'to' must have the same length")
if(all(y %in% names(getClass("missing_variable")@subclasses))) {
classes <- sapply(data@variables, class)
y <- c(sapply(y, FUN = function(x) {
names(classes[which(classes == x)])
}))
if(is.list(y)) stop(paste("no variables of class", names(y)[1]))
to <- rep(to[1], length(y))
}
y <- match.arg(y, data@DIMNAMES[[2]], several.ok = TRUE)
check <- FALSE
for(i in 1:length(y)) {
categorical <- is(data@variables[[y[i]]], "categorical")
data@variables[[y[i]]] <- change_model(y = data@variables[[y[i]]], to = to[i])
if(categorical & !is(data@variables[[y[i]]], "categorical")) check <- TRUE
if(!categorical & is(data@variables[[y[i]]], "categorical")) check <- TRUE
}
if(check) return(new(class(data), variables = data@variables))
else return(data)
})
setMethod("change_model", signature(data = "missing_data.frame", y = "numeric", to = "character"), def =
function(data, y, to) {
if(length(to) == 1) to <- rep(to, length(y))
else if(length(to) != length(y)) stop("'y' and 'to' must have the same length")
for(i in 1:length(y)) {
categorical <- is(data@variables[[y[i]]], "categorical")
data@variables[[y[i]]] <- change_model(y = data@variables[[y[i]]], to = to[[i]])
if(categorical & !is(data@variables[[y[i]]], "categorical")) check <- TRUE
if(!categorical & is(data@variables[[y[i]]], "categorical")) check <- TRUE
}
if(check) return(new(class(data), variables = data@variables))
else return(data)
})
setMethod("change_model", signature(data = "missing_data.frame", y = "logical", to = "character"), def =
function(data, y, to) {
if(length(y) != data@DIM[2]) {
stop("the length of 'y' must equal the number of variables in 'data'")
}
return(change_model(data, which(y), to))
})
mi/R/change_size.R 0000644 0001762 0000144 00000005253 12513634171 013472 0 ustar ligges users # Part of the mi package for multiple imputation of missing data
# Copyright (C) 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Trustees of Columbia University
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
setMethod("change_size", signature(data = "missing", y = "missing_variable", to = "integer"), def =
function(y, to) {
n <- to
if(n <= 0) {
y@data <- y@data[-y@which_extra]
y@which_extra <- integer(0)
y@n_total <- y@n_total - y@n_extra
y@n_extra <- NA_integer_
return(y)
}
end <- y@n_total
SEQ <- (end+1):(end+n)
y@data <- c(y@data, rep(NA, n))
y@which_extra <- c(y@which_extra, SEQ)
y@n_extra <- y@n_extra + n
y@n_total <- y@n_total + n
return(y)
})
setMethod("change_size", signature(data = "missing", y = "categorical", to = "integer"), def =
function(y, to) {
n <- to
if(n <= 0) {
y@data <- y@data[-y@which_extra]
y@which_extra <- integer(0)
y@n_total <- y@n_total - y@n_extra
y@n_extra <- NA_integer_
return(y)
}
end <- y@n_total
SEQ <- (end+1):(end+n)
y@data <- c(y@data, rep(NA, n))
y@which_extra <- c(y@which_extra, SEQ)
y@n_extra <- y@n_extra + n
y@n_total <- y@n_total + n
return(y)
})
setMethod("change_size", signature(data = "missing", y = "fixed", to = "integer"), def =
function(y, to) {
n <- to
if(n <= 0) {
y@data <- y@data[-y@which_extra]
y@which_extra <- integer(0)
y@n_total <- y@n_total - y@n_extra
y@n_extra <- NA_integer_
return(y)
}
end <- y@n_total
SEQ <- (end+1):(end+n)
y@data <- c(y@data, rep(y@data[1], n))
y@which_extra <- c(y@which_extra, SEQ)
y@n_extra <- y@n_extra + n
y@n_total <- y@n_total + n
return(y)
})
setMethod("change_size", signature(data = "missing_data.frame", y = "missing", to = "integer"), def =
function(data, to) {
n <- to
data@variables <- lapply(data@variables, FUN = function(x) change_size(x, n))
data@DIM[1] <- data@variables[[1]]@n_total
return(data)
})
mi/R/zzz.R 0000644 0001762 0000144 00000003447 12513634171 012053 0 ustar ligges users # Part of the mi package for multiple imputation of missing data
# Copyright (C) 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Trustees of Columbia University
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
.onLoad <- function(lib, pkg) {
# library.dynam("mi", pkg, lib)
return(invisible(NULL))
}
.onUnload <- function(libpath) {
# library.dynam.unload("mi", libpath)
return(invisible(NULL))
}
.onAttach <- function( ... ) {
miLib <- dirname(system.file(package = "mi"))
version <- utils::packageDescription("mi", lib.loc = miLib)$Version
builddate <- utils::packageDescription("mi", lib.loc = miLib)$Packaged
packageStartupMessage(paste("mi (Version ", version, ", packaged: ", builddate, ")", sep = ""))
packageStartupMessage("mi Copyright (C) 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Trustees of Columbia University")
packageStartupMessage("This program comes with ABSOLUTELY NO WARRANTY.")
packageStartupMessage("This is free software, and you are welcome to redistribute it")
packageStartupMessage("under the General Public License version 2 or later.")
packageStartupMessage("Execute RShowDoc('COPYING') for details.")
}
mi/R/tobin5.R 0000644 0001762 0000144 00000013211 12513634171 012404 0 ustar ligges users # Part of the mi package for multiple imputation of missing data
# Copyright (C) 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Trustees of Columbia University
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
# This is superceded by the tobin5() function below
tobin5 <-
function(mdf, y, f = NULL) {
if(!is(mdf, "missing_data.frame")) stop("'mdf' must be a 'missing_data.frame'")
if(!is.character(y)) stop("'y' must be a character string")
if(length(y) != 1) stop("'y' must have length one")
if(!(y %in% colnames(mdf))) stop("'y' must be a variable in 'mdf'")
y <- mdf@variables[[y]]
NAs <- is.na(y)
to_drop <- mdf@index[[y@variable_name]]
X <- mdf@X[,-to_drop]
probit <- bayesglm.fit(X, y = NAs, family = binomial(link = "probit"))
class(probit) <- c("bayesglm", "glm", "lm")
gamma <- coef(probit)
IMR_0 <- dnorm(-fitted(probit)) / pnorm(-fitted(probit))
IMR_1 <- dnorm( fitted(probit)) / pnorm( fitted(probit))
if(is.null(f)) {
mark <- colnames(mdf@X)[!grepl("^missing_", colnames(mdf@X))][-1]
mark <- mark[mark != y@variable_name]
f <- paste(mark, collapse = " + ")
f <- paste(y@variable_name, " ~ ", f, " + IMR", sep = "")
f <- as.formula(f)
}
else if(!is(f, "formula")) stop("'f' must be 'NULL' or a formula")
df <- as.data.frame(cbind(mdf@X, IMR = IMR_0))
if(is(y, "continuous")) {
model_0 <- bayesglm(f, family = gaussian, data = df, subset = !NAs)
}
else stop("only continuous dependent variables are supported at the moment")
df <- as.data.frame(cbind(mdf@X, IMR = IMR_1))
if(is(y, "continuous")) {
model_1 <- bayesglm(f, family = gaussian, data = df, subset = NAs)
}
se_0 <- model_0$dispersion
se_1 <- model_1$dispersion
delta_0 <- IMR_0^2 - fitted(probit) * IMR_0
delta_1 <- IMR_1^2 + fitted(probit) * IMR_1
betaL_0 <- coef(model_0)
betaL_0 <- betaL_0[length(betaL_0)]
betaL_1 <- coef(model_1)
betaL_1 <- betaL_1[length(betaL_1)]
sigma_0 <- sqrt(se_0^2 + (betaL_0 * delta_0)^2)
sigma_1 <- sqrt(se_1^2 + (betaL_1 * delta_1)^2)
rho_0 <- -betaL_0 / sigma_0
rho_1 <- betaL_1 / sigma_1
## FIXME: correct vcov(model_0) and vcov(model_1) now
return(list(probit = probit, model_0 = model_0, model_1 = model_1,
rho_0 = rho_0, rho_1 = rho_1))
}
tobin5 <-
function(imputations, y, f = NULL) {
if(!is(imputations, "mi")) stop("'imputations' must be a 'mi' object")
if(!is.character(y)) stop("'y' must be a character string")
if(length(y) != 1) stop("'y' must have length one")
if(!(y %in% colnames(imputations))) stop("'y' must be a variable in 'imputations'")
dfs <- complete(imputations)
mdf <- imputations@data[[1]]
to_drop <- mdf@index[[y@variable_name]]
cn <- colnames(mdf@X[,-to_drop])[-1]
f1 <- paste(cn, collapse = " + ")
NAs <- is.na(mdf@variables[[y]])
if(paste("missing", y, sep = "_") %in% colnames(mdf@X)) {
f1 <- paste(paste("missing", y, sep = "_"), "~", f1)
}
else for(i in seq_along(dfs)) {
dfs[[i]] <- cbind(dfs[[i]], NAs)
colnames(dfs[[i]]) <- c(colnames(dfs[[i]]), paste("missing", y, sep = "_"))
}
f1 <- as.formula(f1)
probit <- pool(f1, data = dfs, family = binomial(link = "probit"))
gamma <- sapply(probit@models, coef)
Pr <- sapply(probit@models, fitted)
IMR_0 <- apply(Pr, 2, FUN = function(p) dnorm(-p) / pnorm(-p))
IMR_1 <- apply(Pr, 2, FUN = function(p) dnorm( p) / pnorm( p))
if(is.null(f)) {
mark <- colnames(mdf@X)[!grepl("^missing_", colnames(mdf@X))][-1]
mark <- mark[mark != y]
f <- paste(mark, collapse = " + ")
f <- paste(y@variable_name, " ~ ", f, " + IMR", sep = "")
f <- as.formula(f)
}
else if(!is(f, "formula")) stop("'f' must be 'NULL' or a formula")
if(!is(mdf@variables[[y]], "continuous")) {
stop("only continuous dependent variables are supported at the moment")
}
for(i in seq_along(dfs)) dfs[[i]]$IMR <- IMR_0[,i]
model_0 <- pool(f, data = dfs, family = gaussian, subset = !NAs)
for(i in seq_along(dfs)) dfs[[i]]$IMR <- IMR_1[,i]
model_1 <- pool(f, data = dfs, family = gaussian, subset = NAs)
se_0 <- sapply(model_0@models, FUN = function(m) m$dispersion)
se_1 <- sapply(model_1@models, FUN = function(m) m$dispersion)
delta_0 <- IMR_0^2 - Pr * IMR_0
delta_1 <- IMR_1^2 + Pr * IMR_1
betaL_0 <- sapply(model_0@models, coef)
betaL_0 <- betaL_0[nrow(betaL_0)]
betaL_1 <- sapply(model_1@models, coef)
betaL_1 <- betaL_1[nrow(betaL_1)]
sigma_0 <- sqrt(se_0^2 + sweep(delta_0, 2, betaL_0, FUN = "*")^2)
sigma_1 <- sqrt(se_1^2 + sweep(delta_1, 2, betaL_1, FUN = "*")^2)
rho_0 <- -betaL_0 / sigma_0
rho_1 <- betaL_1 / sigma_1
## FIXME: correct vcov(model_0) and vcov(model_1) now
return(list(probit = probit, model_0 = model_0, model_1 = model_1,
rho_0 = rho_0, rho_1 = rho_1))
}
mi/R/fit_model.R 0000644 0001762 0000144 00000073120 14247027001 013145 0 ustar ligges users # Part of the mi package for multiple imputation of missing data
# Copyright (C) 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Trustees of Columbia University
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
## these fit a regression and return the model
# note, helper functions are good because they are checked more rigorously by R CMD check
setMethod("fit_model", signature(y = "missing_variable", data = "missing_data.frame"), def =
function(y, data, s, warn, ...) {
stop("This method should not have been called. You need to define the relevant fit_model() S4 method")
})
setMethod("fit_model", signature(y = "irrelevant", data = "missing_data.frame"), def =
function(y, data, ...) {
stop("'fit_model' should not have been called on an 'irrelevant' variable")
})
setMethod("fit_model", signature(y = "binary", data = "missing_data.frame"), def =
function(y, data, s, warn, X = NULL, ...) {
if(is.null(X)) {
to_drop <- data@index[[y@variable_name]]
if(length(to_drop)) X <- data@X[,-to_drop]
else X <- data@X[,]
if(is(data, "experiment_missing_data.frame")) {
treatment <- names(which(data@concept == "treatment"))
if(data@concept[y@variable_name] == "outcome") {
X <- cbind(X, interaction = X * data@variables[[treatment]]@data)
}
}
}
if(s > 1) start <- y@parameters[s-1,]
else if(s < -1) start <- y@parameters[1,]
else start <- NULL
start <- NULL
weights <- if(length(data@weights) == 1) data@weights[[1]] else data@weights[[y@variable_name]]
CONTROL <- list(epsilon = max(1e-8, exp(-abs(s))), maxit = 25, trace = FALSE)
priors <- data@priors[[y@variable_name]]
out <- bayesglm.fit(X, y@data - 1L, weights = weights,
prior.mean = priors[[1]], prior.scale = priors[[2]], prior.df = priors[[3]],
prior.mean.for.intercept = priors[[4]], prior.scale.for.intercept = priors[[5]],
prior.df.for.intercept = priors[[6]],
start = start, family = y@family, Warning = FALSE, control = CONTROL)
if(warn && !out$converged) {
warning(paste("bayesglm() did not converge for variable", y@variable_name, "on iteration", abs(s)))
}
if(any(abs(coef(out)) > 100)) {
warning(paste(y@variable_name, ": separation on iteration", abs(s)))
}
out$x <- X
class(out) <- c("bayesglm", "glm", "lm")
return(out)
})
.fit_MNL <-
function(y, X, weights) {
model<-nnet::multinom(y@data ~ X -1, weights = weights, Hess = y@imputation_method == "ppd",
model = TRUE, trace = FALSE, MaxNWts = 10000)
return(model)
}
.fit_RNL <-
function(y, X, weights, CONTROL) {
if (y@use_NA==TRUE) values <- c(-.Machine$integer.max, 1:length(y@levels))
else values <- 1:length(y@levels)
out <- sapply(values, simplify = FALSE, FUN = function(l) {
model <- bayesglm.fit(X, y@data == l, weights = weights,
family = y@family, control=CONTROL)
model$x <- X # bayesglm.fit() by default does not retain the model matrix it uses
class(model) <- c("bayesglm", "glm", "lm")
return(model)
})
class(out) <- "RNL"
return(out)
}
setMethod("fit_model", signature(y = "unordered-categorical", data = "missing_data.frame"), def =
function(y, data, warn, s, ...) {
to_drop <- data@index[[y@variable_name]]
if (y@use_NA) {
y@data[y@which_drawn] <- -.Machine$integer.max # make NAs the smallest possible signed integer
}
if(length(to_drop)) X <- data@X[,-to_drop]
else X <- data@X[,]
if(is(data, "experiment_missing_data.frame")) {
treatment <- names(which(data@concept == "treatment"))
if(data@concept[y@variable_name] == "outcome") {
X <- cbind(X, interaction = X * data@variables[[treatment]]@data)
}
}
weights <- if(length(data@weights) == 1) data@weights[[1]] else data@weights[[y@variable_name]]
if(y@estimator == "MNL") {
out <- .fit_MNL(y, X, weights)
data@X
}
else if(y@estimator == "RNL"){
CONTROL <- list(epsilon = max(1e-8, exp(-abs(s))), maxit = 25, trace = FALSE)
out <- .fit_RNL(y, X, weights, CONTROL)
data@X
}
else stop("estimator not recognized")
return(out)
})
.clogit <- # similar to the survival::clogit function
function(formula, data, n, method, weights, subset,
x = TRUE, na.action = "na.exclude") {
coxcall <- match.call()
coxcall[[1]] <- as.name("coxph")
newformula <- formula
newformula[[2]] <- substitute(survival::Surv(rep(1, nn), case),
list(case = formula[[2]], nn = n))
environment(newformula) <- environment(formula)
coxcall$formula <- newformula
coxcall$n <- NULL
coxcall <- eval(coxcall, sys.frame(sys.parent()))
coxcall$userCall <- sys.call()
class(coxcall) <- c("clogit", "coxph")
coxcall
}
setMethod("fit_model", signature(y = "grouped-binary", data = "missing_data.frame"), def =
function(y, data, s, warn) {
# see http://www.stata.com/support/faqs/stat/clogitcl.html for a good explanation of this model
to_drop <- data@index[[y@variable_name]]
X <- data@X[,-to_drop]
weights <- if(length(data@weights) == 1) data@weights[[1]] else data@weights[[y@variable_name]]
groups <- sapply(y@strata, FUN = function(x) complete(data@variables[[x]], m = 0L), simplify = FALSE)
out <- .clogit(y@data ~ X + strata(groups), method = "breslow", weights = weights, n = nrow(X))
out$x <- X
return(out)
})
setMethod("fit_model", signature(y = "ordered-categorical", data = "missing_data.frame"), def =
function(y, data, s, warn, X = NULL, ...) {
if(is.null(X)) {
to_drop <- data@index[[y@variable_name]]
if(length(to_drop)) X <- data@X[,-to_drop]
else X <- data@X[,]
if(is(data, "experiment_missing_data.frame")) {
treatment <- names(which(data@concept == "treatment"))
if(data@concept[y@variable_name] == "outcome") {
X <- cbind(X, interaction = X * data@variables[[treatment]]@data)
}
}
X <- X[,-1]
}
method <- if(y@family$link == "logit") "logistic" else y@family$link
start <- NULL
start <- c(rep(0, ncol(X)), qlogis(cumsum(table(y@data)) / nrow(X)))
start <- start[-length(start)]
weights <- if(length(data@weights) == 1) data@weights[[1]] else data@weights[[y@variable_name]]
CONTROL <- list(reltol = max(1e-8, exp(-abs(s))))
priors <- data@priors[[y@variable_name]]
out <- bayespolr(as.ordered(y@data) ~ X, weights = weights, method = method,
prior.mean = priors[[1]], prior.scale = priors[[2]], prior.df = priors[[3]],
prior.counts.for.bins = priors[[4]], control = list(reltol = max(1e-8, exp(-abs(s)))), ...)
if(warn && out$convergence != 0) {
warning(paste("bayespolr() did not converge for variable", y@variable_name, "on iteration", abs(s)))
}
out$x <- X
return(out)
})
setMethod("fit_model", signature(y = "interval", data = "missing_data.frame"), def =
function(y, data, s, warn, ...) {
stop("FIXME: write this method")
})
## helper function
.fit_continuous <-
function(y, data, s, warn, X, subset = 1:nrow(X)) {
weights <- if(length(data@weights) == 1) data@weights[[1]] else data@weights[[y@variable_name]]
if(!is.null(weights)) weights <- weights[subset]
if(s > 1) start <- y@parameters[s-1,]
else if(s < -1) start <- y@parameters[1,]
else start <- NULL
start <- NULL
mark <- c(TRUE, apply(X[subset,-1, drop = FALSE], 2, FUN = function(x) length(unique(x)) > 1))
if(!all(mark)) {
if(abs(s) == 1) {
stop(paste(y@variable_name, ": imputed values on iteration 0 randomly inadmissible; try mi() again with different seed"))
}
X <- X[,mark]
if(!is.null(start)) start <- start[mark]
}
CONTROL <- list(epsilon = max(1e-8, exp(-abs(s))), maxit = 25, trace = FALSE)
priors <- data@priors[[y@variable_name]]
out <- bayesglm.fit(X[subset,], y@data[subset], weights = weights, start = start, family = y@family,
prior.mean = priors[[1]], prior.scale = priors[[2]], prior.df = priors[[3]],
prior.mean.for.intercept = priors[[4]], prior.scale.for.intercept = priors[[5]],
prior.df.for.intercept = priors[[6]], Warning = FALSE, control = CONTROL)
if(warn && !out$converged) {
warning(paste("bayesglm() did not converge for variable", y@variable_name, "on iteration", abs(s)))
}
out$x <- X
class(out) <- c("bayesglm", "glm", "lm")
return(out)
}
setMethod("fit_model", signature(y = "continuous", data = "missing_data.frame"), def =
function(y, data, s, warn, ...) {
to_drop <- data@index[[y@variable_name]]
if(length(to_drop)) X <- data@X[,-to_drop]
else X <- data@X[,]
if(is(data, "experiment_missing_data.frame")) {
treatment <- names(which(data@concept == "treatment"))
if(data@concept[y@variable_name] == "outcome") {
X <- cbind(X, interaction = X * data@variables[[treatment]]@data)
}
}
return(.fit_continuous(y, data, s, warn, X))
})
# setMethod("fit_model", signature(y = "truncated-continuous", data = "missing_data.frame"), def =
# function(y, data, s, warn, ...) {
# stop("FIXME: write this method using library(survival)")
# })
#
# setMethod("fit_model", signature(y = "censored-continuous", data = "missing_data.frame"), def =
# function(y, data, s, warn, ...) {
# stop("FIXME: mi does not do censored-continuous variables yet")
# to_drop <- data@index[[y@variable_name]]
# X <- cbind(y@raw_data, data@X[,-to_drop])
# if(is(data, "experiment_missing_data.frame")) {
# treatment <- names(which(data@concept == "treatment"))
# if(data@concept[y@variable_name] == "outcome") {
# X <- cbind(X, interaction = X * data@variables[[treatment]]@data)
# }
# }
# })
setMethod("fit_model", signature(y = "semi-continuous", data = "missing_data.frame"), def =
function(y, data, s, warn, ...) {
stop("the semi-continuous fit_model() method should not have been called")
})
setMethod("fit_model", signature(y = "nonnegative-continuous", data = "missing_data.frame"), def =
function(y, data, s, warn, ...) {
to_drop <- data@index[[y@variable_name]]
if(length(to_drop)) X <- data@X[,-to_drop]
else X <- data@X[,]
model <- fit_model(y@indicator, data, s, warn, X)
if(abs(s) > 1) subset <- complete(y@indicator, m = 0L, to_factor = TRUE) == 0
else subset <- 1:nrow(X)
return(.fit_continuous(y = y, data = data, s = s, warn = warn, X = X, subset = subset))
})
setMethod("fit_model", signature(y = "SC_proportion", data = "missing_data.frame"), def =
function(y, data, s, warn, ...) {
to_drop <- data@index[[y@variable_name]]
if(length(to_drop)) X <- data@X[,-to_drop]
else X <- data@X[,]
model <- fit_model(y@indicator, data, s, warn, X)
if(abs(s) > 1) subset <- complete(y@indicator, m = 0L, to_factor = TRUE) == 0
else subset <- 1:nrow(X)
return(.fit_proportion(y = y, data = data, s = s, warn = warn, X = X, subset = subset))
})
## helper function
.fit_proportion <-
function(y, data, s, warn, X, subset = 1:nrow(X)) {
weights <- if(length(data@weights) == 1) data@weights[[1]] else data@weights[[y@variable_name]]
if(!is.null(weights)) weights <- weights[subset]
if(s > 1) start <- y@parameters[s-1,]
else if(s < -1) start <- y@parameters[1,]
else start <- NULL
start <- NULL
mark <- c(TRUE, apply(X[subset,-1, drop = FALSE], 2, FUN = function(x) length(unique(x)) > 1))
if(!all(mark)) {
if(abs(s) == 1) {
stop(paste(y@variable_name, ": imputed values on iteration 0 randomly inadmissible; try mi() again with a different seed"))
}
X <- X[,mark]
if(!is.null(start)) start <- start[c(mark, TRUE)]
}
out <- betareg::betareg.fit(X[subset,], y@data[subset], weights = if(!is.null(weights)) weights[subset],
link = y@family$link, link.phi = y@link.phi,
control = betareg::betareg.control(reltol = 1e-8, start = start, fsmaxit = 0))
if(warn && !out$converged) {
warning(paste("betareg() did not converge for variable", y@variable_name, "on iteration", abs(s)))
}
out$x <- X
class(out) <- c("betareg")
return(out)
}
setMethod("fit_model", signature(y = "proportion", data = "missing_data.frame"), def =
function(y, data, s, warn, ...) {
to_drop <- data@index[[y@variable_name]]
if(length(to_drop)) X <- data@X[,-to_drop]
else X <- data@X[,]
if(y@family$family == "gaussian") out <- .fit_continuous(y, data, s, warn, X)
else out <- .fit_proportion(y, data, s, warn, X)
return(out)
})
setMethod("fit_model", signature(y = "count", data = "missing_data.frame"), def =
function(y, data, s, warn, ...) {
to_drop <- data@index[[y@variable_name]]
if(length(to_drop)) X <- data@X[,-to_drop]
else X <- data@X[,]
return(.fit_continuous(y, data, s, warn, X)) # even though counts are not continuous
})
## experiments
setMethod("fit_model", signature(y = "missing_variable", data = "experiment_missing_data.frame"), def =
function(y, data, ...) {
stop("you need to write a specific fit_model() method for the", class(y), "class")
})
setMethod("fit_model", signature(y = "continuous", data = "experiment_missing_data.frame"), def =
function(y, data, s, warn, ...) {
to_drop <- data@index[[y@variable_name]]
## For each case, make an X matrix based on the giant matrix in data@X
if(data@case == "outcomes") { # missingness on outcome(s) only
if(length(to_drop)) X <- data@X[,-to_drop]
else X <- data@X[,]
treatment_name <- names(data@concept[data@concept == "treatment"])
X <- cbind(X, interaction = X[,!(colnames(X) %in% c("(Intercept)", treatment_name))] * X[,treatment_name])
}
else if(data@case == "covariates") { # missingness on covariate(s) only
to_drop <- c(to_drop, which(data@concept == "treatment"))
if(length(to_drop)) X <- data@X[,-to_drop]
else X <- data@X[,]
}
else { # missing on both outcome(s) and covariate(s)
if(data@concept[y@variable_name] == "covariate") {
to_drop <- c(to_drop, which(data@concept == "treatment"))
}
if(length(to_drop)) X <- data@X[,-to_drop]
else X <- data@X[,]
}
return(mi::.fit_continuous(y, data, s, warn, X))
})
## here y indicates which variable to model
setMethod("fit_model", signature(y = "character", data = "mi"), def =
function(y, data, m = length(data@data), ...) {
s <- sum(data@total_iters) + 1
if(length(m) == 1) {
models <- vector("list", m)
for(i in 1:m) {
model <- data@data[[i]]@variables[[y]]@model
if(is.null(model)) {
model <- fit_model(y = data@data[[i]]@variables[[y]], data = data@data[[i]], s = s, warn = TRUE, ...)
if(!isS4(model)) model$x <- model$X <- model$y <- NULL
}
models[[i]] <- model
}
}
else {
models <- vector("list", length(m))
models <- for(i in 1:length(m)) {
if(is.null(data@data[[i]]@variables[[y]]@model)) {
models[[m[i]]] <- fit_model(y = data@data[[m[i]]]@variables[[y]], data = data@data[[m[i]]], s = s, warn = TRUE, ...)
}
else models[[i]] <- data@data[[i]]@variables[[y]]@model
}
}
return(models)
})
## fit all variables with missingness
setMethod("fit_model", signature(y = "missing", data = "mi"), def =
function(data, m = length(data@data)) {
varnames <- names(data@data[[1]]@variables)
exclude <- data@data[[1]]@no_missing |
sapply(data@data[[1]], FUN = function(y) is(y, "irrelevant"))
models <- sapply(varnames, simplify = FALSE, FUN = function(v) {
if(v %in% exclude) paste(v, "not modeled") ## maybe just skip these?
else fit_model(y = v, data = data, m = m)
})
return(models)
})
## fit all elements of a mdf_list
setMethod("fit_model", signature(y = "missing", data = "mdf_list"), def =
function(data, s = -1, verbose = FALSE, warn = FALSE, ...) {
out <- lapply(data, fit_model, s = s, verbose = verbose, warn = warn, ...)
class(out) <- "mdf_list"
return(out)
})
.fit_model_y <-
function(y, data, s, verbose, warn, ...) {
if(s != 0 && y@imputation_method != "mcar") {
if(is(y, "semi-continuous")) {
to_drop <- data@index[[y@variable_name]]
if(length(to_drop)) X <- data@X[,-to_drop]
else X <- data@X[,]
model <- fit_model(y = y@indicator, data = data, s = s, warn = warn, X = X)
indicator <- mi(y = y@indicator, model = model, s = ifelse(s < 0, 1L, s))
if(s > 1) indicator@parameters[s,] <- get_parameters(model)
else if(abs(s) == 1) {
parameters <- get_parameters(model)
rows <- if(s == 1) nrow(indicator@parameters) else 1
if(ncol(indicator@parameters) == 0) {
temp <- matrix(NA_real_, nrow = rows, ncol = length(parameters))
}
temp[1,] <- parameters
indicator@parameters <- temp
}
else indicator@parameters[1,] <- get_parameters(model)
y@indicator <- indicator
}
model <- fit_model(y = y, data = data, s = s, warn = warn)
y <- mi(y = y, model = model, s = ifelse(s < 0, 1L, s))
}
else y <- mi(y = y)
if(y@imputation_method == "mcar") {
# do nothing
}
else if(s > 1) {
parameters <- get_parameters(model)
if(length(parameters) != ncol(y@parameters)) parameters <- y@parameters[s-1,] # scary
y@parameters[s,] <- parameters
}
else if(abs(s) == 1) {
parameters <- get_parameters(model)
rows <- if(s == 1) nrow(y@parameters) else 1
if(ncol(y@parameters) == 0) {
temp <- matrix(NA_real_, nrow = rows, ncol = length(parameters))
}
temp[1,] <- parameters
y@parameters <- temp
}
else if(s != 0) {
parameters <- get_parameters(model)
if(length(parameters) == ncol(y@parameters)) y@parameters[s,] <- parameters
}
return(y)
}
.update_X <-
function(y, data) {
which_drawn <- y@which_drawn
varname <- y@variable_name
if(is(y, "categorical")) {
dummies <- .cat2dummies(y)[which_drawn,,drop = FALSE]
data@X[ which_drawn, data@index[[varname]][1:NCOL(dummies)]] <- dummies
}
else if(is(y, "semi-continuous")) {
mark <- data@index[[varname]]
data@X[ which_drawn, mark[1] ] <- y@data[which_drawn]
dummies <- .cat2dummies(y@indicator)
data@X[ which_drawn, mark[1 + 1:NCOL(dummies)] ] <- dummies[which_drawn,,drop = FALSE]
}
else if(is(y, "censored_continuous")) {
temp <- y@data[which_drawn]
if(y@n_lower) temp <- cbind(temp, lower = y@lower_indicator@data[y@which_drawn])
if(y@n_upper) temp <- cbind(temp, upper = y@upper_indicator@data[y@which_drawn])
data@X[ which_drawn, data@index[[varname]][1:NCOL(temp)]] <- temp
data@X[ y@which_censored, data@index[[varname]][1] ] <- y@data[y@which_censored]
}
else if(is(y, "truncated_continuous")) {
temp <- y@data[which_drawn]
if(y@n_lower) temp <- cbind(temp, lower = y@lower_indicator@data[y@which_drawn])
if(y@n_upper) temp <- cbind(temp, upper = y@upper_indicator@data[y@which_drawn])
data@X[ which_drawn, data@index[[varname]][1:NCOL(temp)]] <- temp
data@X[ y@which_truncated, data@index[[varname]][1] ] <- y@data[y@which_truncated]
}
else data@X[ which_drawn, data@index[[varname]][1] ] <- y@data[which_drawn]
return(data)
}
.fit_model_mdf <-
function(data, s, verbose, warn, ...) {
if(verbose) {
txt <- paste("Iteration:", abs(s))
if(isatty(stdout()) && !(any(search() == "package:gWidgets"))) cat("\n", txt)
else cat("
", txt, file = file.path(data@workpath, "mi.html"), append = TRUE)
}
on.exit(print("the problematic variable is"))
on.exit(show(y), add = TRUE)
for(jj in sample(1:ncol(data), ncol(data), replace = FALSE)) {
y <- data@variables[[jj]]
if(y@all_obs) next
if(is(y, "irrelevant")) next
y <- .fit_model_y(y, data, s, verbose, warn, ...)
data <- .update_X(y, data)
data@variables[[jj]] <- y
if(verbose) {
txt <- "."
if(isatty(stdout()) && !(any(search() == "package:gWidgets"))) cat(txt)
else cat(txt, file = file.path(data@workpath, "mi.html"), append = TRUE)
}
}
if(verbose) {
if(isatty(stdout()) && !(any(search() == "package:gWidgets"))) cat(" ")
else cat("
", file = file.path(data@workpath, "mi.html"), append = TRUE)
}
if(.MI_DEBUG) sapply(data@variables, validObject, complete = TRUE)
on.exit()
return(data)
}
## unlike the above methods, these return a (modified) missing_data.frame
setMethod("fit_model", signature(y = "missing", data = "missing_data.frame"), def =
function(data, s = -1, verbose = FALSE, warn = FALSE, ...) {
return(.fit_model_mdf(data = data, s = s, verbose = verbose, warn = warn, ...))
})
setMethod("fit_model", signature(y = "missing", data = "allcategorical_missing_data.frame"), def =
function(data, s = -1, verbose = FALSE, warn = FALSE, ...) {
if(verbose) {
txt <- paste("Iteration:", abs(s))
if(isatty(stdout()) && !(any(search() == "package:gWidgets"))) cat("\n", txt)
else cat("
", txt, file = file.path(data@workpath, "mi.html"), append = TRUE)
}
Hstar <- data@Hstar
if(abs(s) == 0) { # starting iteration
V_h <- c(runif(Hstar - 1), 1)
c_prod <- cumprod(1 - V_h)
data@parameters$pi <- V_h * c(1, c_prod[-Hstar])
data@variables <- lapply(data@variables, FUN = function(y) {
if(is(y, "irrelevant")) return(y)
if(y@all_obs) return(y)
return(mi(y)) # bootstrapping
})
data@X <- do.call(cbind, args = lapply(data@variables, FUN = function(y) {
if(is(y, "irrelevant")) return(NULL)
else return(y@data)
}))
phi <- lapply(1:Hstar, FUN = function(h) {
lapply(data@variables, FUN = function(y) {
if(is(y, "irrelevant")) return(NULL)
return(c(tabulate(y@data, nbins = length(y@levels)) / y@n_total))
})})
data@parameters$phi <- phi
data@parameters$alpha <- 1
cols <- Hstar
rows <- nrow(data@latents@imputations)
if(ncol(data@latents@parameters) == 0) {
temp <- matrix(NA_real_, nrow = rows, ncol = cols)
}
data@latents@parameters <- temp
return(data)
}
# S1: Update latent class membership
pi <- data@parameters$pi
phi <- data@parameters$phi
probs <- sapply(1:Hstar, FUN = function(h) {
phi_h <- phi[[h]]
numerators <- rep(1, nrow(data))
for(j in 1:ncol(data)) {
y <- data@variables[[j]]
if(is(y, "irrelevant")) next
phi_hj <- phi_h[[y@variable_name]]
numerators <- numerators *
data@X[,y@variable_name] * phi_hj[data@X[,y@variable_name]]
}
numerators <- numerators * pi[h]
return(numerators)
})
z <- apply(probs, 1, FUN = function(prob) {
which(rmultinom(1,1,prob) == 1) # rmultinom normalizes internally
})
data@latents@data[] <- z
data@latents@imputations[s,] <- z
data@latents@parameters[s,] <- pi
# S2: Update V_h
n_h <- c(tabulate(z, nbins = Hstar))
V_h <- sapply( 1:(Hstar - 1), FUN = function(h) {
a <- 1 + n_h[h]
b <- data@parameters$alpha + sum(n_h[-c(1:h)])
if(b == 0) return(1)
rbeta(1, a, b)
})
V_h <- c(V_h, 1)
c_prod <- cumprod(1 - V_h)
data@parameters$pi <- V_h * c(1, c_prod[-Hstar])
# S3: Update choice probabilities
phi <- lapply(1:Hstar, FUN = function(h) lapply(data@variables, FUN = function(y) {
if(is(y, "irrelevant")) return(NULL)
mark <- z == h
tab <- tabulate(y@data[mark], nbins = length(y@levels))
return(.rdirichlet(1, data@priors$a[y@variable_name] + c(tab)))
}))
data@parameters$phi <- phi
# S4: Update alpha
alpha <- rgamma(1, data@priors$a_alpha + Hstar - 1,
data@priors$b_alpha - log(pi[Hstar]))
data@parameters$alpha <- alpha
# S5: Impute
data@variables <- lapply(data@variables, FUN = function(y) {
if(is(y, "irrelevant")) return(y)
if(y@all_obs) return(y)
if(verbose) {
txt <- "."
if(isatty(stdout()) && !(any(search() == "package:gWidgets"))) cat(txt)
else cat(txt, file = file.path(data@workpath, "mi.html"), append = TRUE)
}
classes <- z[y@which_drawn]
uc <- unique(classes)
Pr <- t(sapply(uc, FUN = function(c) phi[[c]][[y@variable_name]]))
rownames(Pr) <- uc
y <- mi(y, Pr[as.character(classes),,drop=FALSE])
})
data@X <- do.call(cbind, args = lapply(data@variables, FUN = function(y) {
if(is(y, "irrelevant")) return(NULL)
else return(y@data)
}))
return(data)
})
.fit_model_Sophie <-
function(y, data, s = -1, verbose = FALSE, warn = FALSE, ...) {
classes <- data@latents@data
uc <- unique(classes)
Pr <- t(sapply(uc, FUN = function(c) data@parameters$phi[[c]][[y@variable_name]]))
rownames(Pr) <- uc
# Pr <- Pr[as.character(classes),,drop=FALSE]
Pr <- Pr / rowSums(Pr)
return(list(fitted = Pr))
}
setMethod("fit_model", signature(y = "unordered-categorical",
data = "allcategorical_missing_data.frame"), def =
function(y, data, s = -1, verbose = FALSE, warn = FALSE, ...) {
return(.fit_model_Sophie(y, data, s, verbose, warn, ...))
})
setMethod("fit_model", signature(y = "ordered-categorical",
data = "allcategorical_missing_data.frame"), def =
function(y, data, s = -1, verbose = FALSE, warn = FALSE, ...) {
return(.fit_model_Sophie(y, data, s, verbose, warn, ...))
})
setMethod("fit_model", signature(y = "binary",
data = "allcategorical_missing_data.frame"), def =
function(y, data, s = -1, verbose = FALSE, warn = FALSE, ...) {
return(.fit_model_Sophie(y, data, s, verbose, warn, ...))
})
setMethod("fit_model", signature(y = "missing_data.frame", data = "missing_data.frame"), def =
function(y, data, s = -1, verbose = FALSE, warn = FALSE, ...) {
if(verbose) {
txt <- paste("Iteration:", abs(s))
if(isatty(stdout()) && !(any(search() == "package:gWidgets"))) cat("\n", txt)
else cat("
", txt, file = file.path(data@workpath, "mi.html"), append = TRUE)
}
for(jj in sample(1:ncol(y), ncol(y), replace = FALSE)) {
z <- y@variables[[jj]]
if(z@all_obs) next
if(is(z, "irrelevant")) next
y@variables[[jj]] <- .fit_model_y(z, data, s, verbose, warn, ...)
if(verbose) {
txt <- "."
if(isatty(stdout()) && !(any(search() == "package:gWidgets"))) cat(txt)
else cat(txt, file = file.path(data@workpath, "mi.html"), append = TRUE)
}
}
if(verbose) {
if(isatty(stdout()) && !(any(search() == "package:gWidgets"))) cat(" ")
else cat("
", file = file.path(data@workpath, "mi.html"), append = TRUE)
}
if(.MI_DEBUG) sapply(data@variables, validObject, complete = TRUE)
return(y)
})
setMethod("fit_model", signature(y = "missing", data = "multilevel_missing_data.frame"), def =
function(data, s = -1, verbose = FALSE, warn = FALSE, ...) {
data@mdf_list <- fit_model(data = data@mdf_list, s = s, verbose = verbose, warn = warn, ...)
if(s == 0) return(data)
## FIXME: Implement 3+ levels recursively
# update group means
means <- sapply(data@mdf_list, FUN = function(x) colMeans(x@X[,-1]))
if(is.list(means)) {
}
else means <- t(means)
mark <- 0L ## FIXME
data@X[,mark] <- means
# impute the group level variables if necessary
data <- .fit_model_mdf(data = data, s = s, verbose = verbose, warn = warn, ...)
# model the individual level estimates
for(i in seq_along(ncol(data))) {
if(is(data@variables[[i]], "irrelevant")) next
# if(data@no_missing[i]) next
mark <- if(s < 0) 1 else s
fish <- sapply(data@mdf_list, FUN = function(d) d@variables[[i]]@parameters[mark,])
if(is.list(fish)) {
## FIXME: may be a list
}
else fish <- t(fish)
for(j in seq_along(ncol(fish))) {
model <- bayesglm.fit(data@X, y = fish[,j]) # group-level regression
class(model) <- c("bayesglm", "glm", "lm")
params <- arm::sim(model, 1)
beta <- params@coef
sigma <- params@sigma
yhats <- rnorm(nrow(data@X), data@X %*% beta, sd = sigma)
# change the priors for each element of the mdf_list accordingly
for(k in seq_along(data@mdf_list)) {
data@mdf_list[[k]]$mean[colnames(data)[j]] <- yhats[k]
data@mdf_list[[k]]$sd[colnames(data)[j]] <- sigma
}
}
}
return(data)
})
mi/R/debug.R 0000644 0001762 0000144 00000002245 12513634171 012277 0 ustar ligges users # Part of the mi package for multiple imputation of missing data
# Copyright (C) 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Trustees of Columbia University
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
## FIXME: Make damn sure .MI_DEBUG is FALSE before pushing to CRAN
.MI_DEBUG <- FALSE
## FIXME: Also, make all if(.MI_DEBUG) statements one-liners in other files
## and do sed s/if(.MI_DEBUG)/#if(.MI_DEBUG)/g *.R
if(TRUE && .MI_DEBUG) { # define multi-line debugging functions in here
options(error = recover)
}
mi/R/sysdata.rda 0000644 0001762 0000144 00001053260 12513740445 013234 0 ustar ligges users ý7zXZ i"Þ6 ! ÏXÌæ?çïþ] )TW"änRÊŸ’Øá´7iÈ|MújÞä{¼è/Þgíˆ<óË+£ÖŽ|Æ&cG))=F‡Å0ú&Ÿ…¿Ù²ýñHŸð•€ÓÝÙ-î1þÚÒ)7õ%Ȫǔ¢ÿ9\ÞµÌX»/oî hè][N;TÄÏg›šh–캆Pš.£Ðè¨R`Þã5v²:~xÈ<æIþùð/ñ^´¹¿x’‡¹_¼ Só'¤c°±¥ˆÞÚÒéÉ”aiá½":ñÝýÁâ€ÜеC^O«|¿¿Œzû.©t› Õ,$,Dâ¿ëmGN³¨E£î‡Ê‰„¡¢’“Y?ã‚èçFŒ;ÇM 8,wŸ(*·?(™F|¢µl,¥càëŒ8n$_¤ßñêU–Á"¯~šLâô{Þ¹?ðºÐ²ÕV§%áœiÓÊÊ«èªJ÷"Õ)/ˆý²Þv;,z5óŸ Iøä3´Y¸c“õ3Èâ}ÄÖžÞ:>Ö+…î ¸„aosž,|R¶Œ‘`Tþs`ÌkPr &›–o_ŒQw*õŽVJB?vܾ£Q>(ƪž†Å Pg¥[Øi?6¾¾:å „ü{µN!9Á:ÓƒTnZƼîySOÅ_±Bªlk+¢ Fêd;öêd¯¶”£µ•WbÀ{
¬¾ÉÓ»Ä
Þßlœ\¼•È_Õ!%iâf¶Ô+íÀC$ü£YÚ§è.¸ÀiOç1ìxš²å"òì/kP2– ÈŸQØ›rxÂ_ý¤„ ¥}Ø’%å$’檀l8ËÓ$øl€Ñmb‚ìch›%M}ø>üâ‹>½8p·)qÂe£k7ËxÛ¤Íýj5±rqÑÖ“~ÄP(Cg§TT¾á¬ÿÉžƒ¡}ñ.$ L=~ßiXýæSeá6uÌt\QS͈.½Í"àI8~jg£5¯a3½à5ûØ©¹ÞËŠÎn{JDMù ´§½¬Li$Ã=’Lן5oîHmîHs¹ïÜ//Öø,ê}çׇ̮™Ø!ž¬…'WÙS–¾!ç9¾´=~>kÎÇ’'¥~e‘äüy%ëo¼ðO'9¬Ãl°ó±–ñ‡#O8 ˆIî"£2×™ª1‹L;V©F èáÌu¼%ãÌöÕUäÓJè伨>e\ÐíV¥âÀ=¯]ø±Ö4£0›Ë¹Øé *”z¼ŽºâRÍöÀRµªâ7”s«…j%ñø¶×½µzX^³À™'„ˆÍ¯“[Öç´IÛIÒX뺹M».ö5¬5ÍZ6¡õÛnú9kíÿ;bý‡ëȪ7Àò%ÿñcòP”R(>ò¥]¶~²R‡Y,hôF‘©~ÅÃ?ûX½%ôw™{=t2¯;&Ò ²†cQô‡9·ò.uL”ÇÎQ¾_»Â}?œ“
L^k€O,%Ut>AÍ®jæ(ÁÛ<Ò`åNEv„D/ Œ™9ÿ2©‚2c~bès÷¤êÐnoÿÞÙGùH£™kg¬ïÇIcê`"8Û™^$
®‘s¶ƒï=±ªå¶Œp9Œ&Ï>[ë†eü«å+ŸÎâʵeϯ¦dÒY¯Ù¯ä–UFâã9Í3•1dÑÂÐ_4˜=nä¾ú–w!VZDÞã ¢oÙƒ·wû{E™Þ3¾¡S·†õ¿'\¼TÑÞr|óŽ93&’&žÀ¨y{áP16( ÄÀT½¾÷lÐiùþß_ÚϺÚÇј±Å
;¾
óõÃX©ma×gÁ¤ Ûåß:²–N”Nö¨C§¸^âø#ÑÁúž<4F޽+DïʈssCŠu³l‹UqkàM˜^×;ÀéÊJZñY
戈‚‰Í%ÕwhÒ‡×)-©‘#׳”dy\ØLçÝœçK¿þ…1ùXhg’/þ ‹ZÛíU5cuiS
u¬I¢“ü0yêl
Q”²ÿcß:¶ÕQܶӊ¬µO(Yž'½.3Fä~ØÈ Ç/7nx«‹ÔØ„çñ\fG.!+ïæý€íÀþíbê¤`1Í
u¬‚§þkõÈx$:ÅÀ¬ˆ¨.'ž³§‘Ý.æû$|»W˜?wI–æeüærÕö={'jÌäÛ¡3
¦Õä(‡nÈnËC=w™zæZÜAî©;Ÿ×°ËÄ—¶¬Ôh:÷êÍ ¿õKJ•‡.t´£ñ‰º0IÐ%MÎ9kkä ÷òp_êïQľñ“´×]íz6¬1C´ò|A<‡ýk>ªÙ®a›=l±+É”
ð>%·z2ås¢¾Ã3˜d°˜çSø„2±?ùþ‹>ó§O_¯þ,¶-÷kSe޶2$†×øŽV-ñ¦ü΄1ðÏ2.Bí¼ìšN¥ŸÉAj¤:Öc&GÚD/cô†qèâÖbWå'!A°DNlîOu/)@ÑýþèµÓÆß2€5[é0Ï›]bY*?˜+k¬×ÜK, >>ú^¤v.J²²É1®Vó‡W t§z’9ä1'z©‡cPÔøqÚ×2
Æâ›Ã.$G—RaŸ…H©,Q›ÇxÿúZéò:Âð±·:Õz¢„^EWÚcrÂzŒ?È£’Sxçi>5“n>ëRÉ¿î#ÈŠ‡W%×!IÕ(ã^1”‘- Ï=|ï„jCw&³ÍìcCv~xúx½ãO1ã¡Å\[C¹.ejv£ŒÏï„ɦ Ð6þñŠFÆæcRcqVƒ&ÌôÚ+|Šö
\ã}é<7—×›ÍO‰ï*æuV&ði'õI˜/àxfvçš9h“Ä»sƒµuƒ ;È£GøÍ’Žìc/Ê,ïäµáKF0¹~kã¶ÝÔ4µ»I
Ú]Ki‡f†f¨]*Ä»-<$çƒR"û“’¦Öm-/"©kÞrrekøÚd£å+þŠœ|3˜½Á¬œËJ”ÈãâYiÉðAg•lV¥A±Œ†ø6ÇøS3>åæ q$ö°® ¡¿ˆC)è'Ö²É\ÕÊËtäêËäEC0òÌv-¢Ä"Ô¥$pþ2Av¿ {ËA•òçòa¬ÿÕÛ5M¥JÖÆ)ûÏüOË~àá}™wªŸô²q'~g 0(usAƒ‘á—rñ
‰ÀÍÁ]aÄúPƒ*ä—L†L3¦ö³êŸ‚