Daten & Packages laden

Laden Sie die folgenden Packages und Data-Frames:

library(tidyverse)
library(broom)
urla = "https://www.phonetik.uni-muenchen.de/studium_lehre/"
urlb = "lehrmaterialien/R_speech_processing/Rdf"
url = paste0(urla, urlb)
plosiv.df = read.table(file.path(url, "plosiv.df.txt"), 
                       stringsAsFactors = T)
alter.df = read.table(file.path(url, "alter.df.txt"), 
                      stringsAsFactors = T)
fremd = read.table(file.path(url, "fremd.df.txt"),
                   stringsAsFactors = T)
nasals.df = read.table(file.path(url, "nasals.df.txt"), 
                      stringsAsFactors = T)

Fragen

# Testen ob within oder between
plosiv.df %>%
  select(Vpn, K) %>%
  table()
##      K
## Vpn   p t
##   S1  1 1
##   S10 1 1
##   S11 1 1
##   S12 1 1
##   S13 1 1
##   S14 1 1
##   S15 1 1
##   S2  1 1
##   S3  1 1
##   S4  1 1
##   S5  1 1
##   S6  1 1
##   S7  1 1
##   S8  1 1
##   S9  1 1
# Bild
plosiv.df %>%
  pivot_wider(names_from = K, 
              values_from = dB) %>%
  mutate(d = t - p) %>%
  ggplot +
  aes(y = d) +
  geom_boxplot() 

plosiv.df %>%
  pivot_wider(names_from = K, 
              values_from = dB) %>%
  mutate(d = t - p) %>% 
  pull(d) %>%
  t.test()
## 
##  One Sample t-test
## 
## data:  .
## t = 3.1357, df = 14, p-value = 0.007296
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##   5.330174 28.403160
## sample estimates:
## mean of x 
##  16.86667

Die Artikulationsstelle hatte einen signifikanten Einfluss auf die Energiewerte (t[14] = 3.1, p < 0.01).

Der Boxplot zeigt eine Ausreißer, und es könnte der Fall sein, dass die Differenzwerte nicht normalverteilt sind. Um zu testen, ob diese Differenzwerte einer Normalverteilung folgen, kann man einen Shapiro-Test anwenden. Wenn die p-Werte von einem Shapiro-Test unter 0.05 liegen, dann sind die Werte eher nicht normalverteilt sind. Ein Shapiro-Test kann auf diese Weise auf die Differenzwerte angewendet werden:

plosiv.df %>%
  pivot_wider(names_from = K, 
              values_from = dB) %>%
  mutate(d = t - p) %>%
  pull(d) %>%
  shapiro.test()
## 
##  Shapiro-Wilk normality test
## 
## data:  .
## W = 0.87748, p-value = 0.04352

Es gibt zwei Möglichkeiten. Entweder mit einem Wilcoxon signed rank test fortfahren, der keine Normalverteilung voraussetzt:

plosiv.df %>%
  pivot_wider(names_from = K, 
              values_from = dB) %>%
  mutate(d = t - p) %>%
  pull(d) %>%
  wilcox.test()
## Warning in wilcox.test.default(.): cannot compute exact p-value with ties
## Warning in wilcox.test.default(.): cannot compute exact p-value with zeroes
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  .
## V = 90.5, p-value = 0.01854
## alternative hypothesis: true location is not equal to 0

oder - solange man das begründen kann! - entfernt man den Ausreißer:

plosiv.df %>%
  pivot_wider(names_from = K, 
              values_from = dB) %>%
  mutate(d = t - p) %>%
  # Ausreißer bei -40 entfernen
  filter(d > -20) %>%
  # testen ob jetzt normalverteilt ist:

  pull(d) %>%
  shapiro.test()
## 
##  Shapiro-Wilk normality test
## 
## data:  .
## W = 0.93631, p-value = 0.3731
# Alles OK, daher:
plosiv.df %>%
  pivot_wider(names_from = K, 
              values_from = dB) %>%
  mutate(d = t - p) %>%
  # Ausreißer bei -40 entfernen
  filter(d > -20) %>%
  pull(d) %>%
  t.test()
## 
##  One Sample t-test
## 
## data:  .
## t = 5.5256, df = 13, p-value = 9.781e-05
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  12.74600 29.11114
## sample estimates:
## mean of x 
##  20.92857
alter.df %>%
  select(Vpn, Alter) %>%
  table()
##      Alter
## Vpn   alt jung
##   S1    1    0
##   S10   1    0
##   S11   0    1
##   S12   0    1
##   S13   0    1
##   S14   0    1
##   S15   0    1
##   S16   0    1
##   S17   0    1
##   S18   0    1
##   S19   0    1
##   S2    1    0
##   S20   0    1
##   S21   0    1
##   S22   0    1
##   S3    1    0
##   S4    1    0
##   S5    1    0
##   S6    1    0
##   S7    1    0
##   S8    1    0
##   S9    1    0
alter.df %>%
  ggplot +
  aes(y = grund, x = Alter) +
  geom_boxplot()

alter.df %>%
  t.test(grund ~ Alter, .)
## 
##  Welch Two Sample t-test
## 
## data:  grund by Alter
## t = -1.7012, df = 14.769, p-value = 0.1098
## alternative hypothesis: true difference in means between group alt and group jung is not equal to 0
## 95 percent confidence interval:
##  -39.041910   4.408576
## sample estimates:
##  mean in group alt mean in group jung 
##           77.10000           94.41667
t.test(grund ~ Alter, alter.df)
## 
##  Welch Two Sample t-test
## 
## data:  grund by Alter
## t = -1.7012, df = 14.769, p-value = 0.1098
## alternative hypothesis: true difference in means between group alt and group jung is not equal to 0
## 95 percent confidence interval:
##  -39.041910   4.408576
## sample estimates:
##  mean in group alt mean in group jung 
##           77.10000           94.41667

Es gibt höchstens eine (nicht signifikante) Tendenz, dass f0 von Alter beeinflusst wird.

Das Alter hatte keinen signifikanten Einfluss auf die Grundfrequenz.

Wenn man bei ungepaarten Daten prüfen will, ob die Werte normalverteit sind, muss shapiro.test getrennt auf die beiden Stufen angewendet werden:

alter.df %>%
  filter(Alter == "alt") %>%
  pull(grund) %>%
  shapiro.test()
## 
##  Shapiro-Wilk normality test
## 
## data:  .
## W = 0.92184, p-value = 0.3725
alter.df %>%
  filter(Alter == "jung") %>%
  pull(grund) %>%
  shapiro.test()
## 
##  Shapiro-Wilk normality test
## 
## data:  .
## W = 0.9523, p-value = 0.6708
# oder beide auf einmal, und nur die p-Werte zeigen:

alter.df %>%
  group_by(Alter) %>%
  summarise(p.value = shapiro.test(grund)$p.value) %>%
  ungroup()
## # A tibble: 2 × 2
##   Alter p.value
##   <fct>   <dbl>
## 1 alt     0.373
## 2 jung    0.671
fremd %>%
  select(Vpn, Sprache) %>%
  table()
##      Sprache
## Vpn   foreign native
##   S1        1      1
##   S10       1      1
##   S11       1      1
##   S12       1      1
##   S13       1      1
##   S14       1      1
##   S15       1      1
##   S16       1      1
##   S17       1      1
##   S18       1      1
##   S19       1      1
##   S2        1      1
##   S20       1      1
##   S21       1      1
##   S22       1      1
##   S23       1      1
##   S24       1      1
##   S25       1      1
##   S26       1      1
##   S27       1      1
##   S28       1      1
##   S29       1      1
##   S3        1      1
##   S30       1      1
##   S4        1      1
##   S5        1      1
##   S6        1      1
##   S7        1      1
##   S8        1      1
##   S9        1      1
fremd %>%
  pivot_wider(names_from = Sprache, 
              values_from = tempo) %>%
  mutate(d = foreign - native) %>%
  ggplot +
  aes(y = d) +
  geom_boxplot()

# testen, ob normalverteilt sind:
fremd %>%
  pivot_wider(names_from = Sprache, 
              values_from = tempo) %>%
  mutate(d = foreign - native) %>% 
  pull(d) %>%
  shapiro.test()
## 
##  Shapiro-Wilk normality test
## 
## data:  .
## W = 0.95534, p-value = 0.2344
fremd %>%
  pivot_wider(names_from = Sprache, 
              values_from = tempo) %>%
  mutate(d = foreign - native) %>% 
  pull(d) %>%
  t.test()
## 
##  One Sample t-test
## 
## data:  .
## t = -5.3138, df = 29, p-value = 1.06e-05
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -6.264775 -2.782559
## sample estimates:
## mean of x 
## -4.523667
mon = c(26, 26, 26, 28, 18, 21, 19, 25, 23, 29, 22, 22, 24, 23, 22)

und die VOT-Werte für /d/ von 15 deutsch-französisch bilingualen Sprechern waren:

bil = c(18, 20, 20, 26, 17, 23, 20, 16, 20, 18, 21, 29, 20, 25, 19)

Prüfen Sie anhand einer Abbildungen und eines statistisches Tests, ob VOT aufgrund der Sprachkenntnisse beeinflusst wird.

Hier muss ein Data-Frame gebaut werden:

vot = c(mon, bil)
Sprecher = paste0("S", 1:30)
Ling = c(rep("mon", 15), rep("bil", 15))
d.df = data.frame(vot, Sprecher, L = Ling)

Jetzt der Test:

d.df %>%
  ggplot +
  aes(y = vot, x = L) +
  geom_boxplot()

d.df %>%
  t.test(vot ~ L, .)
## 
##  Welch Two Sample t-test
## 
## data:  vot by L
## t = -2.2972, df = 27.529, p-value = 0.02943
## alternative hypothesis: true difference in means between group bil and group mon is not equal to 0
## 95 percent confidence interval:
##  -5.298726 -0.301274
## sample estimates:
## mean in group bil mean in group mon 
##              20.8              23.6
t.test(vot ~ L, d.df)
## 
##  Welch Two Sample t-test
## 
## data:  vot by L
## t = -2.2972, df = 27.529, p-value = 0.02943
## alternative hypothesis: true difference in means between group bil and group mon is not equal to 0
## 95 percent confidence interval:
##  -5.298726 -0.301274
## sample estimates:
## mean in group bil mean in group mon 
##              20.8              23.6
nasals.df %>%
  # /nt, nd/ wählen
  filter(coda %in% c("nt", "nd")) %>%
  # mittlen: pro Sprecher einen Wert für /nt/
  # einen für /nd/
  group_by(speaker, coda) %>%
  summarise(m = mean(endv_time)) %>%
  ungroup() %>%
# Differenz pro Sprecher zwischen /nt, nd/
  pivot_wider(names_from = coda, values_from=m) %>%
  mutate(d = nd - nt) %>%
  # Bild
  ggplot +
  aes(y = d) +
  geom_boxplot()
## `summarise()` has grouped output by 'speaker'. You can override using the
## `.groups` argument.

# t-test
nasals.df %>%
  filter(coda %in% c("nt", "nd")) %>%
  group_by(speaker, coda) %>%
  summarise(m = mean(endv_time)) %>%
  ungroup() %>%
  pivot_wider(names_from = coda, values_from=m) %>%
  mutate(d = nd - nt) %>%
  pull(d) %>%
  t.test()
## `summarise()` has grouped output by 'speaker'. You can override using the
## `.groups` argument.
## 
##  One Sample t-test
## 
## data:  .
## t = 14.227, df = 42, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  0.03970309 0.05282872
## sample estimates:
##  mean of x 
## 0.04626591

VOT wird von dem Unterschied zwischen /nt, nd/ signifikant (t[42] = 14.2, p < 0.001) beeinflusst.

nasals.df %>%
  # /nt/ wählen
  filter(coda == "nt") %>%
  # mittlen: pro Sprecher einen Wert für /nt/
  # mit  Kodierung für language
  group_by(speaker, language) %>%
  summarise(m = mean(endv_time)) %>%
  ungroup() %>%
  ggplot +
  aes(y = m, x = language) +
  geom_boxplot()
## `summarise()` has grouped output by 'speaker'. You can override using the
## `.groups` argument.

# t.test
nasals.df %>%
  # /nt/ wählen
  filter(coda == "nt") %>%
  # mittlen: pro Sprecher einen Wert für /nt/
  # mit  Kodierung für language
  group_by(speaker, language) %>%
  summarise(m = mean(endv_time)) %>%
  ungroup() %>%
  t.test(m ~ language, .)
## `summarise()` has grouped output by 'speaker'. You can override using the
## `.groups` argument.
## 
##  Welch Two Sample t-test
## 
## data:  m by language
## t = -2.7217, df = 27.554, p-value = 0.01113
## alternative hypothesis: true difference in means between group BRE and group USE is not equal to 0
## 95 percent confidence interval:
##  -0.045438286 -0.006397182
## sample estimates:
## mean in group BRE mean in group USE 
##         0.2488794         0.2747971

Die /nt/-Dauer wurde signifikant (t[27.55] = 2.72, p < 0.05) vom Dialekt beeinflusst.