library(tidyverse)
4 Data Wrangling I: Creating Variables
To keep it nice and simple, we’ll use yet another university dataset:
<- data.frame(
dat3 studs = c(14954, 47269, 23659, 9415, 38079),
profs = c(250, 553, 438, 150, 636),
prom_recht = c(FALSE, TRUE, TRUE, TRUE, FALSE),
gegr = c(1971, 1870, 1457, 1818, 1995),
uni = c("FH Aachen", "RWTH Aachen", "Uni Freiburg", "Uni Bonn", "FH Bonn-Rhein-Sieg")
)
4.1 Creating Variables
Let’s take a closer look at creating variables in R. There are two basic ways to add variables to a data.frame
:
Base R:
...$newvar <-
{dplyr}
:mutate(new_var= )
4.1.1 Base R: ...$newvar <-
$studs_to_mean <- dat3$studs - mean(dat3$studs)
dat3 dat3
studs profs prom_recht gegr uni studs_to_mean
1 14954 250 FALSE 1971 FH Aachen -11721.2
2 47269 553 TRUE 1870 RWTH Aachen 20593.8
3 23659 438 TRUE 1457 Uni Freiburg -3016.2
4 9415 150 TRUE 1818 Uni Bonn -17260.2
5 38079 636 FALSE 1995 FH Bonn-Rhein-Sieg 11403.8
You can also delete variables using <- NULL
:
$studs_to_mean <- NULL
dat3 dat3
studs profs prom_recht gegr uni
1 14954 250 FALSE 1971 FH Aachen
2 47269 553 TRUE 1870 RWTH Aachen
3 23659 438 TRUE 1457 Uni Freiburg
4 9415 150 TRUE 1818 Uni Bonn
5 38079 636 FALSE 1995 FH Bonn-Rhein-Sieg
4.1.2 {dplyr}: mutate(new_var= )
An alternative way to create variables is using mutate(new_variable = )
from {dplyr}
({tidyverse}
):
%>% mutate(studs_to_mean = studs - mean(studs)) dat3
studs profs prom_recht gegr uni studs_to_mean
1 14954 250 FALSE 1971 FH Aachen -11721.2
2 47269 553 TRUE 1870 RWTH Aachen 20593.8
3 23659 438 TRUE 1457 Uni Freiburg -3016.2
4 9415 150 TRUE 1818 Uni Bonn -17260.2
5 38079 636 FALSE 1995 FH Bonn-Rhein-Sieg 11403.8
You can also create multiple variables within a single mutate()
command:
%>% mutate(
dat3 studs_to_mean = studs - mean(studs),
profs_to_mean = profs - mean(profs)
)
studs profs prom_recht gegr uni studs_to_mean profs_to_mean
1 14954 250 FALSE 1971 FH Aachen -11721.2 -155.4
2 47269 553 TRUE 1870 RWTH Aachen 20593.8 147.6
3 23659 438 TRUE 1457 Uni Freiburg -3016.2 32.6
4 9415 150 TRUE 1818 Uni Bonn -17260.2 -255.4
5 38079 636 FALSE 1995 FH Bonn-Rhein-Sieg 11403.8 230.6
Or variables can be reused within mutate()
:
%>% mutate(
dat3 rel_to_mean = studs - mean(studs),
above_mean = rel_to_mean > 0
)
studs profs prom_recht gegr uni rel_to_mean above_mean
1 14954 250 FALSE 1971 FH Aachen -11721.2 FALSE
2 47269 553 TRUE 1870 RWTH Aachen 20593.8 TRUE
3 23659 438 TRUE 1457 Uni Freiburg -3016.2 FALSE
4 9415 150 TRUE 1818 Uni Bonn -17260.2 FALSE
5 38079 636 FALSE 1995 FH Bonn-Rhein-Sieg 11403.8 TRUE
The original dataset remains unchanged:
dat3
studs profs prom_recht gegr uni
1 14954 250 FALSE 1971 FH Aachen
2 47269 553 TRUE 1870 RWTH Aachen
3 23659 438 TRUE 1457 Uni Freiburg
4 9415 150 TRUE 1818 Uni Bonn
5 38079 636 FALSE 1995 FH Bonn-Rhein-Sieg
To keep the results, store them in an object:
<- dat3 %>% mutate(
dat4 rel_to_mean = studs - mean(studs),
above_mean = rel_to_mean > 0
)
dat4
studs profs prom_recht gegr uni rel_to_mean above_mean
1 14954 250 FALSE 1971 FH Aachen -11721.2 FALSE
2 47269 553 TRUE 1870 RWTH Aachen 20593.8 TRUE
3 23659 438 TRUE 1457 Uni Freiburg -3016.2 FALSE
4 9415 150 TRUE 1818 Uni Bonn -17260.2 FALSE
5 38079 636 FALSE 1995 FH Bonn-Rhein-Sieg 11403.8 TRUE
as.numeric()
You can convert logical variables into numeric dummy variables (0/1) using as.numeric()
:
%>% mutate(
dat3 prom_dummy = as.numeric(prom_recht),
over10k = as.numeric(studs > 10000)
)
studs profs prom_recht gegr uni prom_dummy over10k
1 14954 250 FALSE 1971 FH Aachen 0 1
2 47269 553 TRUE 1870 RWTH Aachen 1 1
3 23659 438 TRUE 1457 Uni Freiburg 1 1
4 9415 150 TRUE 1818 Uni Bonn 1 0
5 38079 636 FALSE 1995 FH Bonn-Rhein-Sieg 0 1
4.2 Grouping with .by=
The true power of mutate()
becomes apparent when combined with other {dplyr}
functions. A common task in data preparation involves grouped values.
We’ll make our example dataset a bit smaller:
<- dat3 %>%
dat5 select(-uni,-gegr) # to ensure everything is visible
Since {dplyr}
version 1.1.1, we can specify a grouping directly in mutate()
using the .by=
argument. This .by=
grouping is applied only to the immediate calculations within mutate()
:
%>%
dat5 mutate(m_studs = mean(studs),
m_profs = mean(profs)) %>%
mutate(m_studs2 = mean(studs),
.by = prom_recht) %>%
mutate(m_profs2 = mean(profs))
studs profs prom_recht m_studs m_profs m_studs2 m_profs2
1 14954 250 FALSE 26675.2 405.4 26516.5 405.4
2 47269 553 TRUE 26675.2 405.4 26781.0 405.4
3 23659 438 TRUE 26675.2 405.4 26781.0 405.4
4 9415 150 TRUE 26675.2 405.4 26781.0 405.4
5 38079 636 FALSE 26675.2 405.4 26516.5 405.4
Using summarise()
instead of mutate()
provides an overview:
%>%
dat5 summarise(m_studs = mean(studs),.by = prom_recht)
prom_recht m_studs
1 FALSE 26516.5
2 TRUE 26781.0
group_by()
Before {dplyr}
1.1.1, grouping a dataset relied on group_by()
. After setting group_by()
along the values of a variable, all subsequent mutate()
calculations are performed only within those groups:
%>%
dat5 mutate(m_studs = mean(studs),
m_profs = mean(profs)) %>%
group_by(prom_recht) %>%
mutate(m_studs2 = mean(studs),
m_profs2 = mean(profs))
# A tibble: 5 × 7
# Groups: prom_recht [2]
studs profs prom_recht m_studs m_profs m_studs2 m_profs2
<dbl> <dbl> <lgl> <dbl> <dbl> <dbl> <dbl>
1 14954 250 FALSE 26675. 405. 26516. 443
2 47269 553 TRUE 26675. 405. 26781 380.
3 23659 438 TRUE 26675. 405. 26781 380.
4 9415 150 TRUE 26675. 405. 26781 380.
5 38079 636 FALSE 26675. 405. 26516. 443
After using group_by()
, it’s good practice to remove the grouping with ungroup()
once it’s no longer needed:
%>%
dat5 mutate(m_studs = mean(studs),
m_profs = mean(profs)) %>%
group_by(prom_recht) %>%
mutate(m_studs2 = mean(studs)) %>%
ungroup() %>%
mutate(m_profs2 = mean(profs))
# A tibble: 5 × 7
studs profs prom_recht m_studs m_profs m_studs2 m_profs2
<dbl> <dbl> <lgl> <dbl> <dbl> <dbl> <dbl>
1 14954 250 FALSE 26675. 405. 26516. 405.
2 47269 553 TRUE 26675. 405. 26781 405.
3 23659 438 TRUE 26675. 405. 26781 405.
4 9415 150 TRUE 26675. 405. 26781 405.
5 38079 636 FALSE 26675. 405. 26516. 405.
4.3 across()
: Processing Multiple Variables
A highly versatile addition to mutate()
and summarise()
is across()
. This allows us to apply a function to multiple columns simultaneously, without repeating code:
%>%
dat3 summarise(studs = mean(studs),
profs = mean(profs))
studs profs
1 26675.2 405.4
Here, across()
offers a much shorter syntax for variable selection, and we can use ?select_helpers
like matches()
:
%>%
dat3 summarise(across(.cols = matches("studs|profs"),.fns = ~mean(.x)))
studs profs
1 26675.2 405.4
This is also compatible with .by=
:
%>%
dat3 summarise(across(matches("studs|profs"), ~mean(.x)), .by= prom_recht)
prom_recht studs profs
1 FALSE 26516.5 443.0000
2 TRUE 26781.0 380.3333
For more examples on how apply multiple functions, include renaming etc. see below
4.3.1 Exercise
4.4 Custom Functions
So far, we only used functions written by others. We can also use own functions to avoid repetition:
To do so, let’s examine three satisfaction variables for respondents in rows 12-16:
variable | Important regarding job |
---|---|
|
To earn a lot of money |
|
A job, that is fun |
|
Good career opportunities |
|
Job security |
|
Job where you can show off your abilities |
-10 bis -1 | 1 | 2 | 3 | 4 |
---|---|---|---|---|
t.n.z./k.A. | Very important | Rather important | Rather not important | Not important at all |
<- haven::read_dta("./orig/PENDDAT_cf_W13.dta")
pend
<-
sat_small %>%
pend filter(welle == 1) %>%
select(matches("PEO0300(a|b|c)")) %>%
slice(12:16) %>%
::zap_labels() %>% haven::zap_label() # remove labels
haven sat_small
# A tibble: 5 × 3
PEO0300a PEO0300b PEO0300c
<dbl> <dbl> <dbl>
1 2 3 2
2 1 1 3
3 1 1 3
4 2 1 1
5 1 1 2
<- sat_small %>% mutate(across(everything(),~as.numeric(.x))) sat_small
Sometimes we want to process multiple variables in the same way. Above, we saw how to handle this with across()
for existing functions. But what if we want to perform a calculation that isn’t as simple as applying mean()
, sd()
, etc.?
%>%
sat_small mutate(dmean_PEO0300a = PEO0300a - mean(PEO0300a,na.rm = T),
dmean_PEO0300c = PEO0300c - mean(PEO0300c,na.rm = T))
# A tibble: 5 × 5
PEO0300a PEO0300b PEO0300c dmean_PEO0300a dmean_PEO0300c
<dbl> <dbl> <dbl> <dbl> <dbl>
1 2 3 2 0.6 -0.200
2 1 1 3 -0.4 0.8
3 1 1 3 -0.4 0.8
4 2 1 1 0.6 -1.2
5 1 1 2 -0.4 -0.200
…and now what about F1450_06
? Typing this out three times would violate the “DRY” principle1, especially considering the PASS CampusFile contains 5 columns of similar satisfaction variables. Copying and pasting is not a practical option.
Custom functions allow us to adhere to the DRY principle in R. We’ll make our calculation steps part of a function()
and apply it to the desired variables. A function takes an input, defined as a placeholder within the ()
. This placeholder is used within the function, and we return the result with return()
. Only one output can be returned:
<- function(x){
dtomean <- x - mean(x,na.rm = T)
d_x return(d_x)
}
How can we now apply our function dtomean()
to the variables from our sat_small
?
In principle, we saw at the beginning that a data.frame
is simply a combined collection of vectors (the variables).
Accordingly, we can now apply our dtomean()
to a variable (a vector) by calling it with data.frame$variablename
:
dtomean(sat_small$PEO0300a)
[1] 0.6 -0.4 -0.4 0.6 -0.4
To apply our function to each variable of a data.frame
, we can use lapply()
- the output will then be a list, with elements named after the variable names:
lapply(sat_small,FUN = dtomean)
$PEO0300a
[1] 0.6 -0.4 -0.4 0.6 -0.4
$PEO0300b
[1] 1.6 -0.4 -0.4 -0.4 -0.4
$PEO0300c
[1] -0.2 0.8 0.8 -1.2 -0.2
<- lapply(sat_small,FUN = dtomean)
res class(res)
[1] "list"
map()
from {purrr}
is an alternative to lapply
:
%>% map(~dtomean(.x)) sat_small
$PEO0300a
[1] 0.6 -0.4 -0.4 0.6 -0.4
$PEO0300b
[1] 1.6 -0.4 -0.4 -0.4 -0.4
$PEO0300c
[1] -0.2 0.8 0.8 -1.2 -0.2
This formula syntax can also be found in across()
- additionally, with .names =
we have the option to modify the variable names for the results:
%>%
sat_small mutate(across(matches("PEO0300"),~dtomean(.x)) )
# A tibble: 5 × 3
PEO0300a PEO0300b PEO0300c
<dbl> <dbl> <dbl>
1 0.6 1.6 -0.200
2 -0.4 -0.4 0.8
3 -0.4 -0.4 0.8
4 0.6 -0.4 -1.2
5 -0.4 -0.4 -0.200
4.4.1 Exercise
4.5 Exercises
4.5.1 Exercise
- Use the
pend_small
dataset:
<- haven::read_dta("./orig/PENDDAT_cf_W13.dta",
pend_small col_select = c("welle","zpsex","PEO0400a","PEO0400b","PEO0400c","PEO0400d")
%>%
) ::zap_labels() %>% # drop labels to have a clean data.frame
havenfilter(welle == 2) %>%
slice(1:10)
- Calculate the mean for the variables PEO0400a by gender (
zpsex
): - Calculate the mean for the variables
PEO0400a
,PEO0400b
,PEO0400c
, andPEO0400d
by gender (zpsex
): - Use
across()
to calculate the means for all four variables.
var | lab |
---|---|
|
Family/job: Woman should be willing to reduce her working hours for family |
|
Family/job: What women really want is a home and children |
|
Family/job: Working mother can have an equally warm relationship with her childr |
|
Family/job: Responsibility of husband: To earn money; responsibility of wife: Ho |
welle zpsex PEO0400a PEO0400b PEO0400c PEO0400d
1 2 2 1 1 4 1
2 2 1 2 1 3 2
3 2 2 1 3 1 4
4 2 2 1 1 4 1
5 2 2 1 1 1 1
6 2 1 1 1 1 1
7 2 1 1 2 1 4
8 2 1 3 2 3 2
9 2 2 3 3 3 3
10 2 1 2 3 2 2
- Standardize the variables
PEO0400a
-PEO0400d
frompend_small
using the following pattern:
%>%
pend_small mutate(std_PEO0400b = (PEO0400b - mean(PEO0400b,na.rm = T))/sd(PEO0400b,na.rm = T))
- Use a function so that you don’t have to repeatedly enter the same steps.
- Additionally, use
across()
to apply the function to the desired variables. - Calculate the standardization separately by gender (
zpsex
) using.by =
.
4.5.2 Exercise
Continue using pend_small
:
pend_small
welle zpsex PEO0400a PEO0400b PEO0400c PEO0400d
1 2 2 1 1 4 1
2 2 1 2 1 3 2
3 2 2 1 3 1 4
4 2 2 1 1 4 1
5 2 2 1 1 1 1
6 2 1 1 1 1 1
7 2 1 1 2 1 4
8 2 1 3 2 3 2
9 2 2 3 3 3 3
10 2 1 2 3 2 2
- Standardize the variables
PEO0400a
-PEO0400d
frompend_small
using the following pattern:
%>%
pend_small mutate(std_PEO0400b = (PEO0400b - mean(PEO0400b,na.rm = T))/sd(PEO0400b,na.rm = T))
- Use a function so that you don’t have to repeatedly enter the same steps.
- Additionally, use
across()
to apply the function to the desired variables.
4.6 Appendix
4.6.1 across()
: Processing Multiple Variables
A highly versatile addition to mutate()
and summarise()
is across()
. This allows us to apply a function to multiple columns simultaneously, without repeating code:
%>%
dat3 summarise(studs = mean(studs),
profs = mean(profs))
studs profs
1 26675.2 405.4
Here, across()
offers a much shorter syntax for variable selection, and we can use ?select_helpers
like matches()
:
%>%
dat3 summarise(across(.cols = matches("studs|profs"),.fns = ~mean(.x)))
studs profs
1 26675.2 405.4
This is also compatible with .by=
:
%>%
dat3 summarise(across(matches("studs|profs"), ~mean(.x)), .by= prom_recht)
prom_recht studs profs
1 FALSE 26516.5 443.0000
2 TRUE 26781.0 380.3333
We can apply multiple functions by placing them in a list()
:
%>%
dat3 summarise(across(matches("studs|profs"), list(mean = ~mean(.x), sd = ~sd(.x))), .by= prom_recht)
prom_recht studs_mean studs_sd profs_mean profs_sd
1 FALSE 26516.5 16351.84 443.0000 272.9432
2 TRUE 26781.0 19119.14 380.3333 207.5966
You can define this list()
in advance and use it later:
<- list(MEAN = ~mean(.x), SD = ~sd(.x))
wert_liste
%>%
dat3 summarise(across(matches("studs|profs"), wert_liste), .by= prom_recht)
prom_recht studs_MEAN studs_SD profs_MEAN profs_SD
1 FALSE 26516.5 16351.84 443.0000 272.9432
2 TRUE 26781.0 19119.14 380.3333 207.5966
The .names()
argument allows us to control the naming of columns. {.fn}
stands for the function being applied, and {.col}
represents the name of the variable being processed.
%>%
dat3 summarise(across(matches("studs|profs"),
wert_liste,.names = "{.fn}_{.col}"),
.by= prom_recht)
prom_recht MEAN_studs SD_studs MEAN_profs SD_profs
1 FALSE 26516.5 16351.84 443.0000 272.9432
2 TRUE 26781.0 19119.14 380.3333 207.5966
All these functions also work with mutate()
:
%>%
dat3 mutate(across(matches("studs|profs"),
wert_liste, .names = "{.col}XX{.fn}"))
studs profs prom_recht gegr uni studsXXMEAN studsXXSD
1 14954 250 FALSE 1971 FH Aachen 26675.2 15799.92
2 47269 553 TRUE 1870 RWTH Aachen 26675.2 15799.92
3 23659 438 TRUE 1457 Uni Freiburg 26675.2 15799.92
4 9415 150 TRUE 1818 Uni Bonn 26675.2 15799.92
5 38079 636 FALSE 1995 FH Bonn-Rhein-Sieg 26675.2 15799.92
profsXXMEAN profsXXSD
1 405.4 203.349
2 405.4 203.349
3 405.4 203.349
4 405.4 203.349
5 405.4 203.349
More examples in the across() documentation
4.6.1.1 Exercise
4.6.2 Helper functions
4.6.2.1 ifelse()
ifelse()
is a great help for all recoding tasks: we formulate a condition and if it is met, the first value is used; if not, the second value is used. Here we check whether studs-mean(studs)
is greater than 0
- if so, above
is used, otherwise below
:
%>% mutate(rel_to_mean = studs-mean(studs),
dat3 ab_mean_lab = ifelse(rel_to_mean > 0,"above","below"))
studs profs prom_recht gegr uni rel_to_mean ab_mean_lab
1 14954 250 FALSE 1971 FH Aachen -11721.2 below
2 47269 553 TRUE 1870 RWTH Aachen 20593.8 above
3 23659 438 TRUE 1457 Uni Freiburg -3016.2 below
4 9415 150 TRUE 1818 Uni Bonn -17260.2 below
5 38079 636 FALSE 1995 FH Bonn-Rhein-Sieg 11403.8 above
This can be helpful to replace negative values with NA
, for example in the PASS data:
<- haven::read_dta("./orig/PENDDAT_cf_W13.dta",
pend_small2 col_select = c("palter","PEO0400a","PEO0400b","PEO0400c","statakt")) %>%
slice(5624:5640)
The basic idea is to use ifelse()
to replace negative values in a variable with NA
:
%>% mutate(PEO0400a = ifelse(PEO0400a<0,NA,PEO0400a)) pend_small2
across()
allows us to apply this ifelse()
-function to replace NA
in PEO0400a
,PEO0400b
, PEO0400c
and statakt
:
%>% mutate(across(c("PEO0400a","PEO0400b","PEO0400c","statakt"), ~ifelse(.x<0,NA,.x))) pend_small2
# A tibble: 17 × 5
palter PEO0400a PEO0400b PEO0400c statakt
<dbl+lbl> <dbl> <dbl> <dbl> <dbl>
1 77 1 3 3 NA
2 78 NA NA NA NA
3 51 2 4 1 NA
4 23 3 3 2 NA
5 17 3 2 1 NA
6 47 3 2 2 NA
7 24 3 4 1 1
8 52 2 3 1 1
9 19 2 3 2 3
10 48 2 3 1 1
11 49 NA NA NA NA
12 47 2 3 1 NA
13 48 2 3 1 1
14 49 NA NA NA 1
15 39 4 3 1 NA
16 37 3 4 1 NA
17 38 3 3 1 1
%>% mutate(across(matches("PEO0400|statakt"), ~ifelse(.x<0,NA,.x))) # even shorter: matches() pend_small2
# A tibble: 17 × 5
palter PEO0400a PEO0400b PEO0400c statakt
<dbl+lbl> <dbl> <dbl> <dbl> <dbl>
1 77 1 3 3 NA
2 78 NA NA NA NA
3 51 2 4 1 NA
4 23 3 3 2 NA
5 17 3 2 1 NA
6 47 3 2 2 NA
7 24 3 4 1 1
8 52 2 3 1 1
9 19 2 3 2 3
10 48 2 3 1 1
11 49 NA NA NA NA
12 47 2 3 1 NA
13 48 2 3 1 1
14 49 NA NA NA 1
15 39 4 3 1 NA
16 37 3 4 1 NA
17 38 3 3 1 1
4.6.2.2 case_when()
case_when()
({dplyr}
) extends the principle ifelse()
, allowing us to specify more than two options.
The syntax is slightly different: first, we specify the condition, then after a ~
the values to be used:
%>% mutate(age = case_when(gegr < 1500 ~ "very old",
dat3 < 1900 ~ "old")) gegr
studs profs prom_recht gegr uni age
1 14954 250 FALSE 1971 FH Aachen <NA>
2 47269 553 TRUE 1870 RWTH Aachen old
3 23659 438 TRUE 1457 Uni Freiburg very old
4 9415 150 TRUE 1818 Uni Bonn old
5 38079 636 FALSE 1995 FH Bonn-Rhein-Sieg <NA>
With TRUE
, we can address all cases that have not met any conditions so far:
%>% mutate(age = case_when(gegr < 1500 ~ "very old",
dat3 < 1900 ~ "old",
gegr TRUE ~ "relatively new"))
studs profs prom_recht gegr uni age
1 14954 250 FALSE 1971 FH Aachen relatively new
2 47269 553 TRUE 1870 RWTH Aachen old
3 23659 438 TRUE 1457 Uni Freiburg very old
4 9415 150 TRUE 1818 Uni Bonn old
5 38079 636 FALSE 1995 FH Bonn-Rhein-Sieg relatively new
This doesn’t have to be limited to one variable:
%>% mutate(age = case_when(gegr < 1500 & prom_recht == T ~ "very old university",
dat3 < 1900 & prom_recht == T ~ "old university",
gegr > 1900 & prom_recht == T ~ "young university",
gegr < 1900 & prom_recht == F ~ "old college",
gegr > 1900 & prom_recht == F ~ "young college")) gegr
studs profs prom_recht gegr uni age
1 14954 250 FALSE 1971 FH Aachen young college
2 47269 553 TRUE 1870 RWTH Aachen old university
3 23659 438 TRUE 1457 Uni Freiburg very old university
4 9415 150 TRUE 1818 Uni Bonn old university
5 38079 636 FALSE 1995 FH Bonn-Rhein-Sieg young college
4.6.2.3 cut()
: creating classes
dat3
studs profs prom_recht gegr uni
1 14954 250 FALSE 1971 FH Aachen
2 47269 553 TRUE 1870 RWTH Aachen
3 23659 438 TRUE 1457 Uni Freiburg
4 9415 150 TRUE 1818 Uni Bonn
5 38079 636 FALSE 1995 FH Bonn-Rhein-Sieg
A common task in data preparation is classifying a continuous variable, such as the number of professors. We want to group profs
in steps of 150. To create these classes, we use cut()
and specify the class boundaries with breaks
. We can use seq()
to generate the breakpoints. In seq()
, we specify the lower and upper limits along with the step size.
cut(dat3$profs,breaks = c(50, 200, 350, 500, 650))
[1] (200,350] (500,650] (350,500] (50,200] (500,650]
Levels: (50,200] (200,350] (350,500] (500,650]
cut(dat3$profs,breaks = seq(50,650,150))
[1] (200,350] (500,650] (350,500] (50,200] (500,650]
Levels: (50,200] (200,350] (350,500] (500,650]
We store these values in a new variable in the dat3
dataset:
$prof_class <- cut(dat3$profs,breaks = seq(50,650,150))
dat3 dat3
studs profs prom_recht gegr uni prof_class
1 14954 250 FALSE 1971 FH Aachen (200,350]
2 47269 553 TRUE 1870 RWTH Aachen (500,650]
3 23659 438 TRUE 1457 Uni Freiburg (350,500]
4 9415 150 TRUE 1818 Uni Bonn (50,200]
5 38079 636 FALSE 1995 FH Bonn-Rhein-Sieg (500,650]
For this new variable, we can request a frequency table using count()
:
%>% count(prof_class) dat3
prof_class n
1 (50,200] 1
2 (200,350] 1
3 (350,500] 1
4 (500,650] 2
The parentheses (
indicate exclusion, while the brackets ]
indicate inclusion. There are 1 universities in the dataset that have more than 200 and up to 350 professors.
For the following examples, we delete the prof_class
variable again:
$prof_class <- NULL dat3
Some useful options for cut()
in the appendix
<- c(1990,1998,2001,2009)
bsp bsp
[1] 1990 1998 2001 2009
cut(bsp,breaks = c(1990,2000,2010))
[1] <NA> (1.99e+03,2e+03] (2e+03,2.01e+03] (2e+03,2.01e+03]
Levels: (1.99e+03,2e+03] (2e+03,2.01e+03]
# Specify the number of digits in the labels
cut(bsp,breaks = c(1990,2000,2010),dig.lab = 4)
[1] <NA> (1990,2000] (2000,2010] (2000,2010]
Levels: (1990,2000] (2000,2010]
# Include the lower boundary
cut(bsp,breaks = c(1990,2000,2010),dig.lab = 4,include.lowest = T)
[1] [1990,2000] [1990,2000] (2000,2010] (2000,2010]
Levels: [1990,2000] (2000,2010]
# Number the categories instead of labels:
cut(bsp,breaks = c(1990,2000,2010),labels = FALSE)
[1] NA 1 2 2
# Specify your own labels:
cut(bsp,breaks = c(1990,2000,2010),labels = c("90s","00s"))
[1] <NA> 90s 00s 00s
Levels: 90s 00s
4.6.3 Renaming variables
To rename variables, use rename(new_name = old_name)
%>% rename(newname = PEO0300a) sat_small
# A tibble: 5 × 3
newname PEO0300b PEO0300c
<dbl> <dbl> <dbl>
1 2 3 2
2 1 1 3
3 1 1 3
4 2 1 1
5 1 1 2
For advanced transformations, it’s worth looking into rename_with()
. This allows us to use Regular Expressions, for example from {stringr}. Here’s just an example:
%>% rename_with(~tolower(.)) sat_small
# A tibble: 5 × 3
peo0300a peo0300b peo0300c
<dbl> <dbl> <dbl>
1 2 3 2
2 1 1 3
3 1 1 3
4 2 1 1
5 1 1 2
%>% rename_with(~str_remove(.x,"PEO0300")) sat_small
# A tibble: 5 × 3
a b c
<dbl> <dbl> <dbl>
1 2 3 2
2 1 1 3
3 1 1 3
4 2 1 1
5 1 1 2
%>% rename_with(~str_replace(.x,"PEO0300","Occupation_")) sat_small
# A tibble: 5 × 3
Occupation_a Occupation_b Occupation_c
<dbl> <dbl> <dbl>
1 2 3 2
2 1 1 3
3 1 1 3
4 2 1 1
5 1 1 2
4.6.4 String Functions for regex
{stringr} provides a series of very useful string functions with regular expressions. You can get an overview from this cheatsheet.
%>% mutate(uni_fh = str_detect(uni,"Uni")) dat3
studs profs prom_recht gegr uni uni_fh
1 14954 250 FALSE 1971 FH Aachen FALSE
2 47269 553 TRUE 1870 RWTH Aachen FALSE
3 23659 438 TRUE 1457 Uni Freiburg TRUE
4 9415 150 TRUE 1818 Uni Bonn TRUE
5 38079 636 FALSE 1995 FH Bonn-Rhein-Sieg FALSE
%>% mutate(bula = case_when(str_detect(uni,"Bremen")~ "HB",
dat3 str_detect(uni,"Oldenb|Vechta")~ "NDS",
str_detect(uni,"Bonn|Aachen")~ "NRW",
str_detect(uni,"Freiburg")~ "BW"
))
studs profs prom_recht gegr uni bula
1 14954 250 FALSE 1971 FH Aachen NRW
2 47269 553 TRUE 1870 RWTH Aachen NRW
3 23659 438 TRUE 1457 Uni Freiburg BW
4 9415 150 TRUE 1818 Uni Bonn NRW
5 38079 636 FALSE 1995 FH Bonn-Rhein-Sieg NRW
%>% mutate(ort = str_remove(uni,"Uni |FH |RWTH ")) dat3
studs profs prom_recht gegr uni ort
1 14954 250 FALSE 1971 FH Aachen Aachen
2 47269 553 TRUE 1870 RWTH Aachen Aachen
3 23659 438 TRUE 1457 Uni Freiburg Freiburg
4 9415 150 TRUE 1818 Uni Bonn Bonn
5 38079 636 FALSE 1995 FH Bonn-Rhein-Sieg Bonn-Rhein-Sieg
Do not repeat yourself, see Wickham et al: “You should consider writing a function whenever you’ve copied and pasted a block of code more than twice (i.e. you now have three copies of the same code).”↩︎