###############################################################
## Load packages + code that implements constraints in lavaan
###############################################################
library("xtable")
library("ggplot2")
library("lavaan")
library("mirt")
source("altcoding.R")


###############################################################
## Compute empirical Bayes estimates for all response patterns
## for 5 ordered categories and 2-10 observed variables.
###############################################################
## NB for this code chunk only, the p and K variables are inadvertently reversed from their use in the rest of the paper.
##    (that is, p is number of categories per ordinal variable; K is number of ordinal variables)
p <- 5
K <- 2:10

optfun <- function(pars, y, p, invar = NULL, eb = TRUE) {
  if (is.null(invar)) invar <- 1/p

  logl <- rep(NA, length(y))

  lot <- hit <- rep(NA, length(y))

  for (i in 1:length(y)) {
    if (y[i] == 1) {
      lot[i] <- -Inf
    } else {
      lot[i] <- (y[i] - .5 - pars)
    }
    if (y[i] == p) {
      hit[i] <- Inf
    } else {
      hit[i] <- (y[i] + .5 - pars)
    }
  }

  probs <- pnorm(hit) - pnorm(lot)

  if (eb) {
    diff <- pars - (p + 1)/2
    pen <- 0.5 * invar * diff * diff # full form is 0.5 * diff %*% V %*% t(diff)
  } else {
    pen <- 0
  }
    
  pen - sum( log(probs), na.rm = TRUE)
}


lvp <- vector("list", length(K))
for (i in 1:length(K)) {
  rlist <- lapply(1:K[i], function(x) 1:p)
  rpatt <- expand.grid(rlist)
  ordpatt <- apply(rpatt, 1, function(x) paste0(x[ order(x) ], collapse = ""))
  rpatt <- as.matrix(rpatt[!duplicated(ordpatt),])

  tmpout <- matrix(NA, nrow(rpatt), 4)
  for (j in 1:nrow(rpatt)) {
    tmp <- nlminb(p/2, optfun, y = as.numeric(rpatt[j,]), p = p, eb = TRUE)
    tmpout[j,] <- c(mean(rpatt[j,]), tmp$par, tmp$convergence, sum(rpatt[j,] == 1 | rpatt[j,] == p))
  }
  lvp[[i]] <- list(res = tmpout, rpatt = rpatt)
}

res <- do.call("rbind", lapply(1:length(lvp), function(i) cbind(lvp[[i]]$res, K[i])))
res <- as.data.frame(res); names(res) <- c("avg", "lv", "converge", "nextreme", "K")


###############################################################
## Figure 1
###############################################################
res$K <- paste0("p = ", res$K)
res$K <- factor(res$K, levels = paste0("p = ", 2:10))
ggplot(res, aes(x = avg, y = lv, colour = nextreme > 0)) + geom_point() + facet_wrap(~ K) + geom_abline() + theme(legend.position = "none", panel.background = element_rect(fill = "white"), axis.line = element_line(colour = "black")) + xlab("Observed average") + ylab("Empirical Bayes estimate")


###############################################################
## Additional code showing that the gradient of the likelihood
## function is always close to 0 at the mean of observed
## variables, so long as we avoid extremes.
###############################################################
## optimization function
optfun <- function(pars, y) {
  logl <- rep(NA, length(y))

  lot <- hit <- rep(NA, length(y))

  for (i in 1:length(y)) {
    lot[i] <- (y[i] - .5 - pars)
    hit[i] <- (y[i] + .5 - pars)
  }

  probs <- pnorm(hit) - pnorm(lot)

  -sum( log(probs), na.rm = TRUE)
}

## show that gradient equals 0 for non-extreme response patterns
gradfun <- function(resp, mnresp = NULL) {
  if (is.null(mnresp)) mnresp <- mean(resp)

  lo <- resp - .5 - mnresp
  hi <- resp + .5 - mnresp

  sum( (dnorm(lo) - dnorm(hi)) / (pnorm(hi) - pnorm(lo)) )
}

parms <- expand.grid(p = 3:6, k = 3:5)
grads <- vector("list", nrow(parms))
for (i in 1:nrow(parms)) {
  p <- parms$p[i]
  k <- parms$k[i]
  its <- lapply(1:p, function(x) 1:k)
  rpatts <- do.call("expand.grid", its)
  ## get nonduplicated, nonextreme response patterns
  ordpatt <- apply(rpatts, 1, function(x) paste0(x[order(x)], collapse = ""))
  rpatts <- rpatts[!duplicated(ordpatt),]
  nonextr <- apply(rpatts, 1, function(x) all(x > 1 & x < k))
  rpatts <- rpatts[nonextr,]

  grads[[i]] <- apply(rpatts, 1, gradfun)
}

summary(sapply(grads, max))
## all close to 0:
##     Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
##0.0000000 0.0000000 0.0005314 0.0014647 0.0024970 0.0052645


###############################################################
## Simulation section
##   The code below loads results and produces tables and
##   graphs. For code that runs the simulation, see the
##   simulation/ subfolder.
###############################################################
load("R/figures/manuscript_revision_simple_figures.RData")
load("R/figures/manuscript_revision_simple_tables.RData")


## Figure 2
p2


## Table 1
print(xtable(tab_chisq1, fig.align = "center",
             caption = "Proportion replications with middling response pattern resulting in identical fit across identification constraint methods.", align = "lllcccccc",
             label = "tab:chisq"), size = "footnotesize", 
      add.to.row = addtorow1, include.colnames = FALSE, include.rownames = FALSE)



## Table 2
print(xtable(tab_chisq3, fig.align = "center",
             caption = "Proportion replications with middling response pattern, six indicators, and three response categories resulting in best fit across identification constraint methods.", align = "llccccccccc",
             label = "tab:chisq2"), size = "footnotesize", 
      add.to.row = addtorow3, include.colnames = FALSE, include.rownames = FALSE)



###############################################################
## Example 1 (using the attitudes toward science dataset)
###############################################################
data(Science, package = 'ltm')
scord <- Science
## convert from character to numeric
for (i in 1:ncol(Science)) {
  scord[,i] <- as.numeric(factor(Science[,i], levels = c("strongly disagree", "disagree", "agree", "strongly agree")))
}
names(scord) <- tolower(names(scord))

## response patterns
rpat <- apply(scord, 1, paste0, collapse = "")
summary(factor(rpat))


## Table 3
freqs <- apply(scord, 2, table)

print(xtable(freqs, fig.align = "center",
             caption = "Item response frequencies of the attitudes toward science dataset.", align = "rccccccc",
             label = "tab:datfreq"), size = "footnotesize")


## Estimating models in lavaan and extracting results
m1 <- paste('f1 =~', paste0(names(scord), collapse = " + "))

tradfit <- cfa(m1, data = scord, ordered = TRUE, std.lv = TRUE, parameterization = "theta")
tradlvs <- lavPredict(tradfit)

newsyn <- altcoding(m1, data = scord, ordered = TRUE, parameterization = "theta")
ourfit <- lavaan(newsyn, data = scord, ordered = TRUE, parameterization = "theta")
ourlvs <- lavPredict(ourfit)

## check:
## cbind(fitMeasures(tradfit)[1:5], fitMeasures(ourfit)[1:5])

chis <- fitMeasures(tradfit)[c('chisq', 'df', 'rmsea')]

tmats <- lavInspect(tradfit, 'est')
omats <- lavInspect(ourfit, 'est')

tse <- sqrt(diag(vcov(tradfit)))
ose <- sqrt(diag(vcov(ourfit)))
ose[is.na(ose)] <- 0L


## Table 4
tlse <- paste0("(", formatC(tse[grep("=~", names(tse))], digits = 2, format = "f"), ")")
olse <- paste0("(", formatC(ose[grep("ld", names(ose))], digits = 2, format = "f"), ")")
tab <- rbind(formatC(tmats$lambda[,1], digits = 2, format = "f"), tlse,
             formatC(omats$lambda[,1], digits = 2, format = "f"), olse)
rownames(tab) <- c("Trad est", "\\hspace{.1in} (SE)", "Int est", "\\hspace{.1in} (SE) ")

print(xtable(tab, fig.align = "center",
             caption = "Comparison of loading estimates and SEs under traditional constraints and under the integer constraints.", align = "lccccccc",
             label = "tab:loadcomp"), size = "footnotesize", sanitize.text.function = function(x){x})


## Table 5
ttse <- paste0("(", formatC(tse[grep("|", names(tse), fixed = TRUE)], digits = 2, format = "f"), ")")
test <- matrix(paste("$", formatC(tmats$tau[,1], digits = 2, format = "f"), "$ ", ttse), 3, 7)

otse <- paste0("(", formatC(ose[grep("^[a-g].", names(ose))], digits = 2, format = "f", zero.print = "--"), ")")
oest <- matrix(paste("$", formatC(omats$tau[,1], digits = 2, format = "f"), "$ ", otse), 3, 7)

tab <- rbind(test, rep("", ncol(test)), oest)
colnames(tab) <- names(scord)
rownames(tab) <- c("Trad est (SE)", "", " ", "  ", "Int est (SE)", "    ", "     ")

print(xtable(tab, fig.align = "center",
             caption = "Comparison of threshold estimates and SEs under traditional constraints and under the integer constraints.", align = "lccccccc",
             label = "tab:threshcomp"), size = "scriptsize", sanitize.text.function = function(x){x})


## Figure 3
plot(rowMeans(scord), ourlvs, pch = 20, xlim = c(1,4), ylim = c(1,4),
     xlab = "Average score", ylab = "Empirical Bayes estimate"); abline(0,1)


###############################################################
## Example 2 (using Facebook survey data)
###############################################################
## Read data and select variables
dat <- read.csv("nps_data/NPS_Publication_Data.csv")
varnums <- c(2,4,5,9,10)

## complete data
compl <- which(rowSums(is.na(dat[, 2:11])) == 0)


## define new item type for mirt
P.integer <- function(par, Theta, ncat) {
  tau1 <- par[1]
  tau2 <- par[2]
  tauvec <- c(-Inf, 1.5, tau1, tau2, 4.5, Inf)
  lambda <- par[ncat - 2]
  nu <- par[ncat - 1]
  residvar <- par[ncat]

  out <- lapply(Theta, function(x) pnorm(tauvec[2:(ncat + 1)], mean = nu + lambda * x, sd = sqrt(residvar)) - pnorm(tauvec[1:ncat], mean = nu + lambda * x, sd = sqrt(residvar)))

  do.call("rbind", out)
}

intitem <- createItem("intitem", par = c(tau1 = 2.5, tau2 = 3.5, lambda = .7, nu = .1, residvar = .5), est = rep(TRUE, 5), P = P.integer)


## fit the model in mirt (with constraints and etc)
mymodel <- mirt.model("F = 1-5\n COV = F*F\n MEAN = F")

## intermediate step to obtain parameter numbers:
##fit <- mirt(dat[, paste0("FB.BEN_", varnums)], mymodel, rep("intitem", length(varnums)), customItems = list(intitem = intitem), pars = "values")
##fit

## sum constraints on loadings and intercepts
myconst <- function(pars, optim_args) {
  c(sum(pars[c(4, 9, 14, 19, 24)]),
    sum(pars[c(3, 8, 13, 18, 23)]))
}
solnp_args <- list(eqfun = myconst, eqB = c(0, 5),
                   LB = c(rep(c(1.5, 1.5, 1e-3, -2, 1e-4), 5), 1, 1e-4),
                   UB = c(rep(c(4.5, 4.5, 3, 2, 5), 5), 5, 5))

## create mirt object to define starting values
fit <- mirt(dat[compl, paste0("FB.BEN_", varnums)], mymodel, rep("intitem", length(varnums)), customItems = list(intitem = intitem), optimizer = 'solnp', solnp_args = solnp_args, pars = "values")
fit$value <- c(rep(c(2.5, 3.5, .9, .1, .25), 5), 3, .5)

## estimate model
fit2 <- mirt(dat[compl, paste0("FB.BEN_", varnums)], mymodel, rep("intitem", length(varnums)), customItems = list(intitem = intitem), pars = fit, optimizer = 'solnp', solnp_args = solnp_args)

c2stats <- M2(fit2, type = 'C2')


## Table 6
parest <- do.call("rbind", coef(fit2)[1:5])
rownames(parest) <- paste0("Item ", 1:5)
colnames(parest) <- c("Tau2", "Tau3", "Lambda", "Nu", "Theta")

print(xtable(parest, fig.align = "center",
             caption = "Item parameter estimates for Example 2.", align = "lccccc",
             label = "tab:ex2est"), size = "scriptsize", sanitize.text.function = function(x){x})


## Figure 4
ourlvs <- fscores(fit2, method = "MAP")
plot(rowMeans(dat[compl, paste0("FB.BEN_", varnums)]), ourlvs, pch = 20, xlim = c(1,5), ylim = c(1,5),
     xlab = "Average score", ylab = "Empirical Bayes estimate"); abline(0,1)


## Computing expected average score
params <- coef(fit2)
lambda <- matrix(sapply(params[1:5], function(x) x[,'lambda']), ncol = 1)
taumat <- do.call("rbind", lapply(params[1:5], function(x) c(1.5, x[1:2], 4.5)))
mats <- list(lambda = lambda, theta = diag(5), nu = matrix(0, nrow = 5))

K <- 5
nvar <- nrow(mats$lambda)
taumat <- cbind(-Inf, taumat, Inf)
eta <- seq(1, K, .1)
expected <- rep(NA, length(eta))
for (i in 1:length(eta)) {
  ystar <- with(mats, nu + lambda * eta[i])

  tmpprob <- matrix(NA, nvar, K)
  for (j in 1:nvar) {
    tmpprob[j,] <- pnorm(taumat[j,2:(K+1)], mean = ystar[j], sd = sqrt(mats$theta[j,j])) -
      pnorm(taumat[j,1:K], mean = ystar[j], sd = sqrt(mats$theta[j,j]))
  }

  expected[i] <- sum(apply(tmpprob, 1, function(x) sum(x * (1:K)))) / nvar
}


## Figure 5
par(mfrow = c(1,2))

plot(eta, expected, type = "l", xlab = "Latent variable", ylab = "Expected average score", xlim = c(1,5), ylim = c(1,5))
abline(0, 1, lty = 2)

plot(eta, expected, type = "l", xlab = "Latent variable", ylab = "Expected/observed average score", xlim = c(1,5), ylim = c(1,5))
abline(0, 1, lty = 2)
points(ourlvs, jitter( rowMeans(dat[compl, paste0("FB.BEN_", varnums)]) ))


## Tables and figures in Appendix B
load("R/figures/manuscript_revision_default_figures.RData")
load("R/figures/manuscript_revision_default_tables.RData")


## Figure B1
p2d


## Table B1
print(xtable(tab_chisq1d, fig.align = "center",
             caption = "Proportion replications with middling response pattern resulting in identical fit across identification constraint methods, using default starting values.", align = "lllcccccc",
             label = "tab:chisqdef"), size = "footnotesize", 
      add.to.row = addtorow1d, include.colnames = FALSE, include.rownames = FALSE)



## Table B2
print(xtable(tab_chisq3d, fig.align = "center",
             caption = "Proportion replications with middling response pattern, six indicators, and three response categories resulting in best fit across identification constraint methods, using default starting values.", 
             align = "llccccccccc",
             label = "tab:chisq2def"), 
      size = "footnotesize", 
      add.to.row = addtorow3d, 
      include.colnames = FALSE, 
      include.rownames = FALSE)



## Table B3
print(xtable(tab_chisq2, fig.align = "center",
             caption = "Proportion replications with symmetric or skewed response pattern resulting in identical fit across identification constraint methods, using simple starting values.", 
             align = "llllcccccc",
             label = "tab:chisq_supp"), 
      size = "footnotesize", 
      add.to.row = addtorow2, 
      include.colnames = FALSE, 
      include.rownames = FALSE, 
      table.placement = NULL)



## Table B4
print(xtable(tab_chisq2d, fig.align = "center",
             caption = "Proportion replications with symmetric or skewed response pattern resulting in identical fit across identification constraint methods, using default starting values.", 
             align = "llllcccccc",
             label = "tab:chisq_supp_def"), 
      size = "footnotesize", 
      add.to.row = addtorow2d, 
      include.colnames = FALSE, 
      include.rownames = FALSE, 
      table.placement = NULL)



## Table B5
print(xtable(tab_chisq4, fig.align = "center",
             caption = "Proportion replications with middling response pattern and three indicators per factor resulting in best fit across identification constraint methods, using simple starting values.", align = "lllccccccccc",
             label = "tab:supp2"), size = "scriptsize",
      add.to.row = addtorow4, include.colnames = FALSE, include.rownames = FALSE,table.placement = NULL)



## Table B6
print(xtable(tab_chisq4d, fig.align = "center",
             caption = "Proportion replications with middling response pattern and three indicators per factor resulting in best fit across identification constraint methods, using default starting values.", align = "lllccccccccc",
             label = "tab:supp2_def"), size = "scriptsize",
      add.to.row = addtorow4d, include.colnames = FALSE, include.rownames = FALSE,table.placement = NULL)



## Table B7
print(xtable(tab_chisq5, fig.align = "center",
             caption = "Proportion replications with middling response pattern and six indicators per factor resulting in best fit across identification constraint methods, using simple starting values.", align = "lllccccccccc",
             label = "tab:supp3"), size = "scriptsize",
      add.to.row = addtorow4, include.colnames = FALSE, include.rownames = FALSE,table.placement = NULL)



## Table B8
print(xtable(tab_chisq5d, fig.align = "center",
             caption = "Proportion replications with middling response pattern and six indicators per factor resulting in best fit across identification constraint methods, using simple starting values.", align = "lllccccccccc",
             label = "tab:supp3_def"), size = "scriptsize",
      add.to.row = addtorow4d, include.colnames = FALSE, include.rownames = FALSE,table.placement = NULL)
