If not already done, carry out parts 1-5 of the setup, as described here.
Load this library:
library(tidyverse)
Load the following objects:
url = "http://www.phonetik.uni-muenchen.de/~jmh/lehre/Rdf"
asp = read.table(file.path(url, "asp.txt"))
int = read.table(file.path(url, "intdauer.txt"))
vdata = read.table(file.path(url, "vdata.txt"))
This module is concerned principally with how to extract observations and variables from data-frames and how to modify them.
This is about extracting particular observations and/or variables from a data-frame. Here are some of the main operations for doing so.
Extracting observations:
# the first observation (row) of asp
asp %>% slice(1)
## d Wort Vpn Kons Bet
## 1 26.18 Fruehlingswetter k01 t un
# observations 5, 6, and 8
asp %>% slice(c(5, 6, 8))
## d Wort Vpn Kons Bet
## 5 42.380 Tisch k01 t be
## 6 21.560 Mutter k01 t un
## 8 78.125 Kaffee k01 k be
# observations 10 through 14
asp %>% slice(10:14)
## d Wort Vpn Kons Bet
## 10 45.94 Teller k01 t be
## 11 64.13 Broetchenkorb k01 k be
## 12 48.94 keinen k01 k be
## 13 59.00 Kuchen k01 k be
## 14 33.19 Butter k01 t un
# all observations of int except observation 2
int %>% slice(-2)
## Vpn dB Dauer
## 1 S1 24.50 162
## 3 S2 38.02 223
## 4 S2 28.38 131
## 5 S1 23.47 67
## 6 S2 37.82 169
## 7 S2 30.08 81
## 8 S1 24.50 192
## 9 S1 21.37 116
## 10 S2 25.60 55
## 11 S1 40.20 252
## 12 S1 44.27 232
## 13 S1 26.60 144
## 14 S1 20.88 103
## 15 S2 26.05 212
# all observations of int except observations 2 and 4
int %>% slice(-c(2, 4))
## Vpn dB Dauer
## 1 S1 24.50 162
## 3 S2 38.02 223
## 5 S1 23.47 67
## 6 S2 37.82 169
## 7 S2 30.08 81
## 8 S1 24.50 192
## 9 S1 21.37 116
## 10 S2 25.60 55
## 11 S1 40.20 252
## 12 S1 44.27 232
## 13 S1 26.60 144
## 14 S1 20.88 103
## 15 S2 26.05 212
# the first 4 observations of int in reverse row order
int %>% slice(4:1)
## Vpn dB Dauer
## 4 S2 28.38 131
## 3 S2 38.02 223
## 2 S2 32.54 120
## 1 S1 24.50 162
# the first observation of int
# either
int %>% slice(1)
## Vpn dB Dauer
## 1 S1 24.5 162
# or
int %>% slice_head(n=1)
## Vpn dB Dauer
## 1 S1 24.5 162
# the first 7 observations of int
# either
int %>% slice(1:7)
## Vpn dB Dauer
## 1 S1 24.50 162
## 2 S2 32.54 120
## 3 S2 38.02 223
## 4 S2 28.38 131
## 5 S1 23.47 67
## 6 S2 37.82 169
## 7 S2 30.08 81
# or
int %>% slice_head(n=7)
## Vpn dB Dauer
## 1 S1 24.50 162
## 2 S2 32.54 120
## 3 S2 38.02 223
## 4 S2 28.38 131
## 5 S1 23.47 67
## 6 S2 37.82 169
## 7 S2 30.08 81
# the last observation of asp
asp %>% slice_tail(n=1)
## d Wort Vpn Kons Bet
## 2892 51.937 Verkehrt kko k be
# the last 4 observations of asp
asp %>% slice_tail(n=4)
## d Wort Vpn Kons Bet
## 2889 38.249 Koennten kko k un
## 2890 24.940 vormittags kko t un
## 2891 21.930 Richtung kko t un
## 2892 51.937 Verkehrt kko k be
# Three randomly selected observations from int
int %>% slice_sample(n = 3)
## Vpn dB Dauer
## 7 S2 30.08 81
## 8 S1 24.50 192
## 14 S1 20.88 103
The output of any of the above can be stored as a separate data-frame. Thus to store the output of the last command as the data-frame int1_df:
int1_df = int %>%
  slice_sample(n = 3)
int1_df
## Vpn dB Dauer
## 15 S2 26.05 212
## 9 S1 21.37 116
## 11 S1 40.20 252
The above code has carriage-returns after the pipe. This is not obligatory (but makes reading the code easier). So exactly the same is given by:
int1_df = int %>% slice_sample(n = 3)
int1_df
## Vpn dB Dauer
## 12 S1 44.27 232
## 11 S1 40.20 252
## 7 S2 30.08 81
Sometimes, it’s useful to write a comment on what the separate lines of code actually do, thus:
# Store in int1_df...
int1_df =
  # ...three randomly selected observations from `int`
  int %>%
  slice_sample(n = 3)
# and list the output
int1_df
## Vpn dB Dauer
## 6 S2 37.82 169
## 2 S2 32.54 120
## 7 S2 30.08 81
Extracting variables is best done by identifying their names. Recall from the previous module that the variable names are given by:
asp %>% names()
## [1] "d" "Wort" "Vpn" "Kons" "Bet"
and they can also be seen with:
asp %>% head()
## d Wort Vpn Kons Bet
## 1 26.180 Fruehlingswetter k01 t un
## 2 23.063 Gestern k01 t un
## 3 26.812 Montag k01 t un
## 4 14.750 Vater k01 t un
## 5 42.380 Tisch k01 t be
## 6 21.560 Mutter k01 t un
The extraction of one or more variables can be done with select()
:
# create a new data-frame asp1
# whose only variable is `Kons`
asp1 = asp %>% select(Kons)
# look at the first three observations
asp1 %>% slice_head(n=3)
## Kons
## 1 t
## 2 t
## 3 t
# as above, but select the variables `d` and `Kons`
asp2 = asp %>% select(d, Kons)
asp2 %>% head()
## d Kons
## 1 26.180 t
## 2 23.063 t
## 3 26.812 t
## 4 14.750 t
## 5 42.380 t
## 6 21.560 t
# make a data-frame that excludes the variable `Kons`
asp3 = asp %>% select(-Kons)
head(asp3)
## d Wort Vpn Bet
## 1 26.180 Fruehlingswetter k01 un
## 2 23.063 Gestern k01 un
## 3 26.812 Montag k01 un
## 4 14.750 Vater k01 un
## 5 42.380 Tisch k01 be
## 6 21.560 Mutter k01 un
# make a data-frame that excludes the variables `Kons` and `Bet`
asp4 = asp %>% select(-Kons, -Bet)
head(asp4)
## d Wort Vpn
## 1 26.180 Fruehlingswetter k01
## 2 23.063 Gestern k01
## 3 26.812 Montag k01
## 4 14.750 Vater k01
## 5 42.380 Tisch k01
## 6 21.560 Mutter k01
The extraction of observations for selected variables can be done in two separate stages. For example:
# first make a new data-frame `asp5` from `asp`
# containing observations 1:15
asp5 = asp %>% slice(1:15)
# and extract variables `d` and `Kons` from `asp5`
asp6 = asp5 %>% select(d, Kons)
# verify there are 15 rows and 2 variables in `asp6`
asp6 %>% dim()
## [1] 15 2
But a much more efficient way of doing the above is to make repeated use of the pipe operator %>%
so that the output of one command is passed to another. Thus the above is equivalently given by:
# make a new data-frame from `asp`
# of observations 1:15 for the variables `d` and `Kons` only
asp7 = asp %>%
  slice(1:15) %>%
  select(d, Kons)
The following functions can be used to extract observations with lowest and highest values:
# the observation with the lowest duration
asp %>% slice_min(d)
## d Wort Vpn Kons Bet
## 1565 5.25 Sonntags k11 t un
# the observation with the highest duration
asp %>% slice_max(d)
## d Wort Vpn Kons Bet
## 2063 138.81 Kiel k70 k be
# the five observations with the lowest duration
asp %>% slice_min(d, n = 5)
## d Wort Vpn Kons Bet
## 1565 5.250 Sonntags k11 t un
## 1170 5.690 maechtig K62 t un
## 1548 6.500 Vater k10 t un
## 2507 6.570 unterbrechen k62 t un
## 540 6.688 Mutter k61 t un
filter()
A conditional selection is when information is to be extracted from a data-frame depending on certain conditions. For example, the following are conditional statements. Extract all observations from:
- asp if the duration is greater than 50 ms
- asp for t consonants (if the variable Kons is t)
- asp in the above case and for stressed consonants (if the Bet variable is be)
- asp only if the above three conditions are met.

Conditionality in R is defined with so-called logical operators. They return either TRUE or FALSE, i.e. a logical vector, which was briefly introduced here in the last module.
Some of the most important of these are given below. These can be illustrated by making two simple vectors consisting of some durations and the corresponding vowels:
# a numeric vector of the durations of 6 vowels
d = c(120, 85, 100, 65, 75, 150)
# a character vector of the corresponding vowel labels
v = c("a", "i", "e", "i", "u", "a")
The operators == (equal to) and != (not equal to) can be applied to both numeric and character vectors. The output returns logical vectors whose elements are TRUE whenever the condition is met, and FALSE otherwise. Thus:
# which durations are equal to 120?
d == 120
## [1] TRUE FALSE FALSE FALSE FALSE FALSE
# which vowels are "i"?
v == "i"
## [1] FALSE TRUE FALSE TRUE FALSE FALSE
# which durations are not equal to 65?
d != 65
## [1] TRUE TRUE TRUE FALSE TRUE TRUE
# which vowels are not "i"?
v != "i"
## [1] TRUE FALSE TRUE FALSE TRUE TRUE
The comparison operators >, >=, <, and <= are only applied here to numeric vectors:
# which durations are greater than 100 ms?
d > 100
## [1] TRUE FALSE FALSE FALSE FALSE TRUE
# which durations are greater than or equal to 100 ms?
d >= 100
## [1] TRUE FALSE TRUE FALSE FALSE TRUE
# which durations are less than 90 ms?
d < 90
## [1] FALSE TRUE FALSE TRUE TRUE FALSE
# which durations are less than or equal to 80 ms?
d <= 80
## [1] FALSE FALSE FALSE TRUE TRUE FALSE
The %in% operator is usually applied to character vectors.
# which vowels are "i"?
# either
v == "i"
## [1] FALSE TRUE FALSE TRUE FALSE FALSE
# or
v %in% "i"
## [1] FALSE TRUE FALSE TRUE FALSE FALSE
# which vowels are either "i" or "u"?
v %in% c("i", "u")
## [1] FALSE TRUE FALSE TRUE TRUE FALSE
# which vowels are neither "i" nor "u"?
!v %in% c("i", "u")
## [1] TRUE FALSE TRUE FALSE FALSE TRUE
Use the and operator & if two or more conditions must all apply at the same time:
# which are the "i" vowels with durations
# less than 70 ms?
# (This is the same as: which vowels
# are "i" AND which vowels are less than 70 ms?)
v == "i" & d < 70
## [1] FALSE FALSE FALSE TRUE FALSE FALSE
# which durations are between 80 and 120 ms?
# (This is the same as:
# which durations are greater than 80 ms
# AND which durations are less than 120 ms?)
d > 80 & d < 120
## [1] FALSE TRUE TRUE FALSE FALSE FALSE
Use the or operator |
if either one or another conditional applies:
# which vowels are either "i" or "u"?
v == "i" | v == "u"
## [1] FALSE TRUE FALSE TRUE TRUE FALSE
# note that this is the same as the command used earlier:
v %in% c("i", "u")
## [1] FALSE TRUE FALSE TRUE TRUE FALSE
Commands of the above kind are applied inside the filter()
function to extract information from data-frames conditionally. Thus to answer the earlier questions:
# extract all observations from `asp`
# if the duration is greater than 50 ms
asp8 = asp %>%
  filter(d > 50)
# extract all observations from `asp` for 't'
# consonants (if `Kons` is 't')
asp9 = asp %>%
  filter(Kons == "t")
# extract all observations from `asp` in the
# above case and for stressed consonants (if `Bet` is "be")
asp10 = asp %>%
  filter(Kons == "t" & Bet == "be")
# Alternatively, use two pipes:
asp10b = asp %>%
  filter(Kons == "t") %>%
  filter(Bet == "be")
# extract all observations from `asp` only
# if the three conditions above are met.
asp11 = asp %>%
  filter(d > 50 & Kons == "t" & Bet == "be")
# Alternatively, use three pipes:
asp11b = asp %>%
  filter(d > 50) %>%
  filter(Kons == "t") %>%
  filter(Bet == "be")
mutate()
Adding a new variable to a data-frame can be done using the mutate() function. For example, the data frame asp has duration values in the variable d:
head(asp)
## d Wort Vpn Kons Bet
## 1 26.180 Fruehlingswetter k01 t un
## 2 23.063 Gestern k01 t un
## 3 26.812 Montag k01 t un
## 4 14.750 Vater k01 t un
## 5 42.380 Tisch k01 t be
## 6 21.560 Mutter k01 t un
Suppose the aim is to add another variable of the logarithm of the durations. The function for obtaining (natural) logarithm values is log()
, thus:
# natural log. of 16
log(16)
## [1] 2.772589
The following creates another variable logduration
with values of the (natural) log. of duration:
asp12 = asp %>%
  mutate(logduration = log(d))
head(asp12)
## d Wort Vpn Kons Bet logduration
## 1 26.180 Fruehlingswetter k01 t un 3.264996
## 2 23.063 Gestern k01 t un 3.138230
## 3 26.812 Montag k01 t un 3.288850
## 4 14.750 Vater k01 t un 2.691243
## 5 42.380 Tisch k01 t be 3.746677
## 6 21.560 Mutter k01 t un 3.070840
So asp12
now is the same as asp
but with an additional variable logduration
.
Adding another variable can also be done conditionally in different ways. Suppose that for the data frame int the task is to add another variable called noise such that any dB value less than 25 dB is quiet, anything from 25 dB up to (but not including) 35 dB is average, and everything from 35 dB upwards is loud:
int2 = int %>%
  mutate(noise =
           case_when(dB < 25 ~ "quiet",
                     dB >= 25 & dB < 35 ~ "average",
                     dB >= 35 ~ "loud")
  )
head(int2)
## Vpn dB Dauer noise
## 1 S1 24.50 162 quiet
## 2 S2 32.54 120 average
## 3 S2 38.02 223 loud
## 4 S2 28.38 131 average
## 5 S1 23.47 67 quiet
## 6 S2 37.82 169 loud
If the conditional is binary, then the ifelse()
function can be used. In the following example, a variable sex
in the data-frame int
is created whose values are M
when the variable Vpn
is speaker S1
and F
for all other speakers (for speaker S2
since there are only two speakers):
int3 = int %>%
  mutate(sex = ifelse(Vpn == "S1", "M", "F"))
head(int3)
## Vpn dB Dauer sex
## 1 S1 24.50 162 M
## 2 S2 32.54 120 F
## 3 S2 38.02 223 F
## 4 S2 28.38 131 F
## 5 S1 23.47 67 M
## 6 S2 37.82 169 F
The above is equivalent to:
int3 = int %>%
  mutate(sex = case_when(Vpn == "S1" ~ "M",
                         TRUE ~ "F")
  )
head(int3)
## Vpn dB Dauer sex
## 1 S1 24.50 162 M
## 2 S2 32.54 120 F
## 3 S2 38.02 223 F
## 4 S2 28.38 131 F
## 5 S1 23.47 67 M
## 6 S2 37.82 169 F
The meaning of TRUE ~ "F"
is everything else becomes “F” – i.e. everything left over after the assignment of “M” becomes “F”.
rename()
The function rename()
can be used to rename the variables of a data-frame. Recall that the variable names for the data-frame asp
are:
asp %>% names()
## [1] "d" "Wort" "Vpn" "Kons" "Bet"
To rename e.g. Wort
, Kons
, and Bet
as Word
, C
, and Stress
respectively:
aspnew = asp %>%
  rename(Word = Wort,
         C = Kons,
         Stress = Bet)
aspnew %>% names()
## [1] "d" "Word" "Vpn" "C" "Stress"
arrange()
, relocate()
The function arrange()
can be used to rearrange the observations of a data-frame in ascending or descending order relative to a particular variable. For example:
# rearrange the observations from the minimum
# to maximum in the variable `Dauer` (duration)
int %>% arrange(Dauer)
## Vpn dB Dauer
## 10 S2 25.60 55
## 5 S1 23.47 67
## 7 S2 30.08 81
## 14 S1 20.88 103
## 9 S1 21.37 116
## 2 S2 32.54 120
## 4 S2 28.38 131
## 13 S1 26.60 144
## 1 S1 24.50 162
## 6 S2 37.82 169
## 8 S1 24.50 192
## 15 S2 26.05 212
## 3 S2 38.02 223
## 12 S1 44.27 232
## 11 S1 40.20 252
# as above but from maximum to minimum
int %>% arrange(desc(Dauer))
## Vpn dB Dauer
## 11 S1 40.20 252
## 12 S1 44.27 232
## 3 S2 38.02 223
## 15 S2 26.05 212
## 8 S1 24.50 192
## 6 S2 37.82 169
## 1 S1 24.50 162
## 13 S1 26.60 144
## 4 S2 28.38 131
## 2 S2 32.54 120
## 9 S1 21.37 116
## 14 S1 20.88 103
## 7 S2 30.08 81
## 5 S1 23.47 67
## 10 S2 25.60 55
Rearranging of rows for multiple variables is also possible:
# Sort alphabetically by Vpn and then sort Dauer within
# each unique Vpn label:
int %>% arrange(Vpn, Dauer)
## Vpn dB Dauer
## 5 S1 23.47 67
## 14 S1 20.88 103
## 9 S1 21.37 116
## 13 S1 26.60 144
## 1 S1 24.50 162
## 8 S1 24.50 192
## 12 S1 44.27 232
## 11 S1 40.20 252
## 10 S2 25.60 55
## 7 S2 30.08 81
## 2 S2 32.54 120
## 4 S2 28.38 131
## 6 S2 37.82 169
## 15 S2 26.05 212
## 3 S2 38.02 223
relocate()
can be used to rearrange the order in which variables occur:
vdata %>% slice(1)
## X Y F1 F2 dur V Tense Cons Rate Subj
## 1 52.99 4.36 313 966 106.9 % - P a bk
# Relocate variable Subj to column 1
vdata %>% relocate(Subj) %>% slice(1)
## Subj X Y F1 F2 dur V Tense Cons Rate
## 1 bk 52.99 4.36 313 966 106.9 % - P a
# Relocate variables Subj and Cons to columns 1-2
vdata %>% relocate(Subj, Cons) %>% slice(1)
## Subj Cons X Y F1 F2 dur V Tense Rate
## 1 bk P 52.99 4.36 313 966 106.9 % - a
# Put all the numeric variables after the Subj variable
vdata %>% relocate(where(is.numeric), .after = Subj) %>% slice(1)
## V Tense Cons Rate Subj X Y F1 F2 dur
## 1 % - P a bk 52.99 4.36 313 966 106.9
# Put all the character variables before the variable dur
vdata %>% relocate(where(is.character), .before = dur) %>% slice(1)
## X Y F1 F2 V Tense Cons Rate Subj dur
## 1 52.99 4.36 313 966 % - P a bk 106.9