# Analysis of REAL and ABM simulation data 

library(data.table)
library(dplyr)
library(magrittr)
library(car)
library(emmeans)
library(lme4)
library(proxy)
library(tidyr)
library(ggplot2)
library(ggthemes)
library(dtt)
library(Tmisc)
library(optimx)


rootLogDir <- "/vdata/Projects/ABM/simulations/Michele/Antarctica" # change manually
simulationName <- "ABM20181204185316" # change manually
# logDir is the directory where you extracted data/Antarctica_results.zip
logDir <- file.path(rootLogDir, simulationName)
# logDir <- "~/faldone/work/Munich/scambio_temp"

# real data + all runs collected here
# ABM session 1 based on production after 5000 interactions
REALandABM.df <- fread(file.path(logDir, "REALandABM.production.df"))
REALandABM.df[, `:=`(speaker = factor(speaker),
                     session = session %>% as.character %>% as.numeric, # this is needed in the lmer  
                     origin = factor(origin, levels = c("REAL", "ABM")))]
REALandABM.df <- REALandABM.df[valid == TRUE, .(word, label, speaker, session, origin, Run, P1, P2, P3, P4, P5, P6)]
# REALandABM.df columns:
# word: the word containing the target phoneme
# label: the target phoneme
# speaker: integer identifying the speaker or the correspongind agent
# session: 0/1 = before/in Antarctica, applies both to real and to simulated data
# origin: REAL = actual speech, ABM = simulated speech
# Run: which of the 100 independent ABM simulations, from 1 to 100. 
# 0 indicates data at session 0, that is identical for all runs, while from 1 to 100 is session 1 for ABM.
# For REAL data Run = NA. Note that sessino 0 is identical for REAL and ABM, since all ABM runs are initialised
# with sessino 0 REAL. 
# P1 .. P6: F1 DCT-0, F1 DCT-1, F1 DCT-2, F2 DCT-0, F2 DCT-1, F2 DCT-2

# Note: session has to be coded as numeric (integer) for later use in lmer
# see: https://stats.stackexchange.com/questions/122336/coding-of-categorical-random-effects-in-r-int-vs-factor

# All features
Pcols <- paste0("P", 1:6)
# F2 acoustic features are used to compute distances
F2cols <-  paste0("P", 4:6) # features used to compute /*u/ distance from i: (F2)

# centroid of i: computed on F2cols
i.mean.F2cols <- REALandABM.df[origin == "REAL" & label == "i:",
                            lapply(.SD, mean), .SDcols = F2cols]

# the phonemes of interest
phonemes <- c("u", "ju", "ou", "I:")

# compute distance to i:
REALandABM.df[label %in% phonemes,
              euclDist_i := proxy::dist(.SD, i.mean.F2cols),
              .SDcols = F2cols]

# multiple comparison correction (Dunn–Šidák)
dunnsidak = function(Ncomp, alpha = .05)
{
  1 - (1 - alpha)^(1/Ncomp)
}

### Global effects of session

# lmer stats

# lmer on REAL data

lmer.formula <- "euclDist_i ~ session + (1|word) + (session|speaker)"
REAL.lmer <- sapply(phonemes, function(ph) {
  lmer(as.formula(lmer.formula),
       data = REALandABM.df[origin == "REAL" & label == ph],
        control=lmerControl(optimizer = "optimx", optCtrl=list(method = "nlminb"))
       )
}, USE.NAMES = TRUE)
# get fixed effects and their significance:
sapply(REAL.lmer, fixef)
# stats reported in caption of Fig. 1
sapply(REAL.lmer, Anova)
# Only the model for ou shows a significant effect on session, i.e. ou exhibits an innovation.

### Analysis of simulations, i.e. ABM data

##### ABM Run-specific lmer 

require(parallel)
numCores <- detectCores()
if (.Platform$OS.type == "unix") {
  cl <- makeCluster(numCores, type = "FORK")
} else if (.Platform$OS.type == "windows") {
  cl <- makeCluster(numCores, type = "PSOCK")
  clusterEvalQ(cl, {
    library(lme4)
    library(optimx)
    library(data.table)
  })
  clusterExport(cl, c("phonemes", "REALandABM.df", "lmer.formula"))
}
clusterSetRNGStream(cl)
ABM.Runs.lmer <- parSapply(cl, phonemes, function(ph) {
  sapply(1:100, function(Run_){
    lmer(as.formula(lmer.formula),
         data = REALandABM.df[origin == "ABM" & label == ph & Run %in% c(0, Run_)],
         control=lmerControl(optimizer = "optimx", optCtrl=list(method = "nlminb"))
    )
  }, USE.NAMES = TRUE, simplify = FALSE)
}, USE.NAMES = TRUE, simplify = FALSE)
stopCluster(cl)  

#Using aggregate means (Option 1)

### Global effects of session
aggregateChange.ABM <- REALandABM.df[origin == "ABM" & session == 1 & label %in% phonemes,
                                     .(session.1 = mean(euclDist_i)), by = .(label, Run)] %>%
  .[REALandABM.df[origin == "ABM" & session == 0 & label %in% phonemes,
                  .(session.0 = mean(euclDist_i)), by = .(label, Run)], on = "label"] %>%
  .[, Change := session.1 - session.0] 

aggregateChange.ABM[Change < 0, .N, by = label]


# Using lmer (Option 2.1)

Session.ABM.Runs.lmer <- rbindlist(sapply(phonemes, function(ph) {
  rbindlist(sapply(1:100, function(Run_) {
    lmer_ <- ABM.Runs.lmer[[ph]][[Run_]]
    data.table(fixed.eff = lmer_ %>% summary %>% .$coefficients %>% .["session", "Estimate"],
               p.val = lmer_ %>% Anova(type = "III") %>% .["session",'Pr(>Chisq)'])
  }, USE.NAMES = TRUE, simplify = FALSE), idcol = "Run")
}, USE.NAMES = TRUE, simplify = FALSE), idcol = "Phoneme")

# How many of the 100 runs exhibit significant session effect and on which phonemes?
Session.ABM.Runs.lmer[Phoneme %in% phonemes & fixed.eff < 0, .N, by = Phoneme]

Session.ABM.Runs.lmer[p.val < dunnsidak(4), .N, by = Phoneme]


######### Analysis of speakers/agents

### Analysis of speakers, i.e. REAL data

# Correlation between positions (e.g. distance from i:) before Antarctica (session 0)
# and changes in antarctica (session 1 - session 0) by speaker

# code speakers with letters
Spk <- LETTERS[REALandABM.df$speaker %>% unique %>% as.character %>% as.numeric]
Spk[Spk %in% c("C", "I", "L")] <- c("O", "M", "X")
Spk <- data.table(speaker = REALandABM.df$speaker %>% unique %>% sort %>% as.character,
                  Spk = Spk)

# Option 1: Using aggregate data (i.e. no lmer)

# Compute positions at session 0, 1
Position.REAL <- REALandABM.df[origin == "REAL" & label %in% phonemes,
                               .(Position = mean(euclDist_i)),
                               by = .(speaker, label, session)]
# Rearrange, keep session.0 and session.1 - session.0 = change
Position.REAL %<>%
  spread(session, Position, sep = '.') %>%
  setDT %>%
  .[, change := session.1 - session.0] %>%
  .[, session.1 := NULL]


# Compute correlations, expected to be negative
Cor.REAL <- Position.REAL %>%
  .[, {cor.test_ <- cor.test(session.0, change, method = "pearson", alternative = "less");
  .(p.value = cor.test_$p.value, cor = cor.test_$estimate)},
  by = label]
Cor.REAL
# .95 conf. level corrected for 4 comparisions (phonemes) is 0.9872585
# Which correlations are significant at that level?
Cor.REAL[p.value < dunnsidak(4), label]

# Option 2: using the lmer models:

# Random intercept and slopes for speaker, i.e. the (session|speaker) term,
# are strongly correlated to the aggregate values of session 0 and session 1 - 0 
# of Option 1 above, respectively. 
# Their correlation can be computed directly or from the model. 

# Option 2.1 direct computation of correlation:

# extract random intercept and slopes for speaker
RandomCoefs.REAL <- rbindlist(lapply(phonemes, function(ph) {
  REAL.lmer[[ph]] %>% ranef %>% .[["speaker"]] %>%
    setDT(keep.rownames = TRUE) %>% .[, label := ph]
})) %>% setnames(c("rn", "session"), c("speaker", "Slope") )

# compute correlation as in Option 1:
Cor.RandomCoefs.REAL <- RandomCoefs.REAL %>%
  .[, {cor.test_ <- cor.test(`(Intercept)`, Slope, method = "pearson", alternative = "less");
  .(p.value = cor.test_$p.value, cor = cor.test_$estimate)},
  by = label]
Cor.RandomCoefs.REAL
Cor.RandomCoefs.REAL[p.value < dunnsidak(4), label]
# Correlation values are similar to Cor.REAL
# This is because of the high correlation between session 0 and intercept, and session 1 - 0 and slope, resp.:
Position.REAL[RandomCoefs.REAL, on = c("speaker", "label")] %>%
  .[, .(Cor.session.0.Intercept = cor(session.0, `(Intercept)`),
        Cor.change.Slope = cor(change, Slope)),
    by = label]

# Option 2.2: use the corr estimates directly from the lmer models:

# Correlations between random intercept and slope
RandomCoefs.REAL.lmer <- sapply(phonemes, function(ph) {
REAL.lmer[[ph]] %>%
  VarCorr %>%
  as.data.frame %>%
  setDT %>%
  .[grp == "speaker" & var1 == "(Intercept)" &  var2 == "session", sdcor]
}, USE.NAMES = TRUE)

# 95% confidence interval of above correlations:
Confint.REAL.lmer <- sapply(phonemes, function(ph) {
  REAL.lmer[[ph]] %>%
  confint.merMod(level = 1 - dunnsidak(4),
                 oldNames = FALSE,
                 parm = "cor_session.(Intercept)|speaker",
                 method = "boot")
}, USE.NAMES = TRUE)
# values similar to Cor.RandomCoefs.REAL


RandomCoefs.REAL %>%
  .[label == "I:", label := "ɪ:"]

####### Fig. 2
ggplot(RandomCoefs.REAL[Spk, on = "speaker"]) +
  aes(x = `(Intercept)`, y = Slope) +
  stat_smooth(method="lm", se=FALSE,  color = 'grey') +
  geom_text(aes(label = Spk)) +
  facet_grid( ~ label) +
  xlab("By-winterer intercept") +
  ylab("By-winterer slope") +
  theme_igray() +
  theme(text = element_text(size = 12))


########## formant trajectories in time

# inverse DCT function
inv_dct_from_emuR <- function(X, N = 11) {
  0.5*(sqrt(2) - 1) * X[1] + dtt::dct(c(X, rep(0, N - length(X))), variant = 3)
}

# compute F2 trajectories
N_samples <- 11
Run_ <- 27 # a random run
REALandABM.traj <- REALandABM.df[(origin == "REAL" | Run == Run_)  & label %in% phonemes, ] %>%
  .[, ID := 1:.N] %>%
  .[, .(F2 = inv_dct_from_emuR(.SD %>% as.numeric, N = N_samples),
        time = seq(0, 1, length.out = N_samples)),
    by = .(session, label, origin, ID),
    .SDcols = F2cols
    ] %>%
  unite("Modality", origin, session) %>%
  .[, Modality := factor(Modality, levels = c("REAL_0", "REAL_1", "ABM_1"),
                         labels = c("Baseline", "In Antarctica", "ABM"))] %>%
  .[label == "I:", label := "ɪ:"]

### Fig. 1
ggplot(REALandABM.traj) +
  aes(time, F2, group = Modality, color = Modality, linetype = Modality, fill = Modality) +
  stat_summary(geom = "line", fun.y = mean) +
  stat_summary(geom = "ribbon", fun.data = mean_cl_normal, alpha = 0.3) +
  facet_grid(~ label) +
  theme_igray() +
  scale_fill_grey(start = 0.8, end = 0.2) +
  scale_color_grey(start = 0.8, end = 0.2) +
  scale_linetype_manual(values=c("solid", "dotted", "dashed")) + 
  theme(text = element_text(size = 12)) +
  xlab("Normalized time") +
  ylab("Normalized F2") +
  theme(legend.title = element_blank())

### Analysis of agents

# This part follows the same procedure used for REAL data, applying Option 2.1 for correlations.

# random slopes for speaker
ABM.Runs.coefs <- rbindlist(sapply(phonemes, function(ph) {
  rbindlist(sapply(1:100, function(Run_) {
  ABM.Runs.lmer[[ph]][[Run_]] %>% ranef %>% .[["speaker"]] %>%
    setDT(keep.rownames = TRUE)
}, USE.NAMES = TRUE, simplify = FALSE), idcol = "Run")
}, USE.NAMES = TRUE, simplify = FALSE), idcol = "Phoneme") %>%
  setnames("rn", "speaker") 


ABM.Runs.cor <- ABM.Runs.coefs[, {
  cor.test_ <- cor.test(`(Intercept)`, session, method = "pearson", alternative = "l")
  .(cor = cor.test_$estimate, p.value  = cor.test_$p.value)
},
by = .(Phoneme, Run)]
# How many negative and significant?
ABM.Runs.cor[cor < 0 & p.value < dunnsidak(400), .N, by = Phoneme]
# median cor
ABM.Runs.cor[, median(cor), by = Phoneme]


# This part follows the same procedure used for REAL data, applying Option 2.2 for correlations.

# Correlations between random intercept and slope
Cor.ABM.Runs.lmer <- rbindlist(sapply(phonemes, function(ph) {
  rbindlist(sapply(1:100, function(Run_) {
    ABM.Runs.lmer[[ph]][[Run_]] %>%
      VarCorr %>%
      as.data.frame %>%
      setDT %>%
      .[grp == "speaker" & var1 == "(Intercept)" &  var2 == "session", .(Cor = sdcor)]
  }, USE.NAMES = TRUE, simplify = FALSE), idcol = "Run")
}, USE.NAMES = TRUE, simplify = FALSE), idcol = "Phoneme")


require(parallel)
numCores <- detectCores() 
if (.Platform$OS.type == "unix") {
  cl <- makeCluster(numCores, type = "FORK")
} else if (.Platform$OS.type == "windows") {
  cl <- makeCluster(numCores, type = "PSOCK")
  clusterEvalQ(cl, {
    library(lme4)
    library(data.table)
  })
  clusterExport(cl, c("phonemes", "ABM.Runs.lmer", "lmer.formula"))
}
clusterSetRNGStream(cl)

Confint.ABM.Runs.lmer <- rbindlist(parSapply(cl, phonemes, function(ph) {
  rbindlist(sapply(1:100, function(Run_) {
    ABM.Runs.lmer[[ph]][[Run_]] %>%
      confint.merMod(level = 1 - dunnsidak(4),
                     oldNames = FALSE,
                     parm = "cor_session.(Intercept)|speaker",
                     method = "boot") %>% # use "profile" if "boot" is too slow
      as.data.frame
  }, USE.NAMES = TRUE, simplify = FALSE), idcol = "Run")
}, USE.NAMES = TRUE, simplify = FALSE), idcol = "Phoneme")
stopCluster(cl)  
# rename upper/lower bounds (the last two columns)
Confint.ABM.Runs.lmer %>% setnames((ncol(.)-1):ncol(.), c("LB", "UB"))

# join the two tables
Cor.ABM.Runs.lmer %<>% .[Confint.ABM.Runs.lmer, on = c("Phoneme", "Run")]

# How many corr are not negative?
Cor.ABM.Runs.lmer[Cor >= 0, .N, by = Phoneme]
# How many conf. int. upper bounds are above 0, i.e. corr not significantly negative?
Confint.ABM.Runs.lmer[UB >= 0, .N, by = Phoneme]      
# Median of corr conf int upper bound:
Confint.ABM.Runs.lmer[, median(UB), by = Phoneme]


########## Similarity between REAL and ABM slopes (expected positive)
# Computed using Option 2.1

ABM.Runs.slopes.cor <- ABM.Runs.coefs[RandomCoefs.REAL, on = c("speaker", Phoneme = "label")] %>%
  .[, {
    cor.test_ <- cor.test(Slope, session, method = "pearson", alternative = "g")
    .(cor = cor.test_$estimate, p.value  = cor.test_$p.value)
  },
  by = .(Phoneme, Run)]
# How many positive and significant?
ABM.Runs.slopes.cor[cor > 0 & p.value < dunnsidak(4), .N, by = Phoneme]
# median cor
ABM.Runs.slopes.cor[, median(cor), by = Phoneme]

######### Fig. 3
# Plot one representative scatterplot + correlation
ABM.Runs.coefs %>%
  .[Phoneme == "I:", Phoneme := "ɪ:"]

ggplot(
  ABM.Runs.coefs %>%
    .[Run == Run_,] %>%
    .[RandomCoefs.REAL, on = c("speaker", Phoneme = "label")] %>%
    .[Spk, on = "speaker"]
) +
  aes(y = session, x = Slope) +
  stat_smooth(method="lm", se=FALSE,  color = 'grey') +
  geom_text(aes(label = Spk)) +
  facet_grid( ~ Phoneme) +
  xlab("By-winterer slope") +
  ylab("By-agent slope") +
  theme_igray() +
  theme(text = element_text(size = 12))
