Example dataset
id
|
name
|
phone
|
sex_and_age
|
test_number
|
term_1
|
term_2
|
term_3
|
1
|
Mike
|
134
|
m_12
|
test 1
|
76
|
84
|
87
|
2
|
Linda
|
270
|
f_13
|
test 1
|
88
|
90
|
73
|
3
|
Sam
|
210
|
m_11
|
test 1
|
78
|
74
|
80
|
4
|
Esther
|
617
|
f_12
|
test 1
|
68
|
75
|
74
|
5
|
Mary
|
114
|
f_14
|
test 1
|
65
|
67
|
64
|
1
|
Mike
|
134
|
m_12
|
test 2
|
85
|
80
|
90
|
2
|
Linda
|
270
|
f_13
|
test 2
|
87
|
82
|
94
|
3
|
Sam
|
210
|
m_11
|
test 2
|
80
|
87
|
80
|
4
|
Esther
|
617
|
f_12
|
test 2
|
70
|
75
|
78
|
5
|
Mary
|
114
|
f_14
|
test 2
|
68
|
70
|
63
|
Installing / loading tidyverse
# install.packages("tidyverse")
library(tidyverse)
# OR
# install.packages("dplyr")
library(dplyr)
# install.packages("tidyr")
library(tidyr)
Filter and select
# Use filter to subset to specific rows
filter(data1, test_number == 'test 1')
## # A tibble: 5 × 8
## id name phone sex_and_age test_number term_1 term_2 term_3
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 1 Mike 134 m_12 test 1 76 84 87
## 2 2 Linda 270 f_13 test 1 88 90 73
## 3 3 Sam 210 m_11 test 1 78 74 80
## 4 4 Esther 617 f_12 test 1 68 75 74
## 5 5 Mary 114 f_14 test 1 65 67 64
# Use select to subset to specific columns
select(data1, name, phone)
## # A tibble: 10 × 2
## name phone
## <chr> <chr>
## 1 Mike 134
## 2 Linda 270
## 3 Sam 210
## 4 Esther 617
## 5 Mary 114
## 6 Mike 134
## 7 Linda 270
## 8 Sam 210
## 9 Esther 617
## 10 Mary 114
select(data1, starts_with('term'))
## # A tibble: 10 × 3
## term_1 term_2 term_3
## <dbl> <dbl> <dbl>
## 1 76 84 87
## 2 88 90 73
## 3 78 74 80
## 4 68 75 74
## 5 65 67 64
## 6 85 80 90
## 7 87 82 94
## 8 80 87 80
## 9 70 75 78
## 10 68 70 63
Pipe
# These three are equivalent
filter(data1, test_number == 'test 1')
## # A tibble: 5 × 8
## id name phone sex_and_age test_number term_1 term_2 term_3
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 1 Mike 134 m_12 test 1 76 84 87
## 2 2 Linda 270 f_13 test 1 88 90 73
## 3 3 Sam 210 m_11 test 1 78 74 80
## 4 4 Esther 617 f_12 test 1 68 75 74
## 5 5 Mary 114 f_14 test 1 65 67 64
data1 %>% filter(test_number == 'test 1')
## # A tibble: 5 × 8
## id name phone sex_and_age test_number term_1 term_2 term_3
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 1 Mike 134 m_12 test 1 76 84 87
## 2 2 Linda 270 f_13 test 1 88 90 73
## 3 3 Sam 210 m_11 test 1 78 74 80
## 4 4 Esther 617 f_12 test 1 68 75 74
## 5 5 Mary 114 f_14 test 1 65 67 64
data1 %>% filter(., test_number == 'test 1')
## # A tibble: 5 × 8
## id name phone sex_and_age test_number term_1 term_2 term_3
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 1 Mike 134 m_12 test 1 76 84 87
## 2 2 Linda 270 f_13 test 1 88 90 73
## 3 3 Sam 210 m_11 test 1 78 74 80
## 4 4 Esther 617 f_12 test 1 68 75 74
## 5 5 Mary 114 f_14 test 1 65 67 64
# We can do multiple operations at once with the pipe operator
data1 %>%
filter(test_number == 'test 1') %>%
select(name, starts_with('term'))
## # A tibble: 5 × 4
## name term_1 term_2 term_3
## <chr> <dbl> <dbl> <dbl>
## 1 Mike 76 84 87
## 2 Linda 88 90 73
## 3 Sam 78 74 80
## 4 Esther 68 75 74
## 5 Mary 65 67 64
# Equivalent to doing in two stages
data_tmp <- data1 %>% filter(test_number == 'test 1')
data_tmp %>% select(name, starts_with('term'))
## # A tibble: 5 × 4
## name term_1 term_2 term_3
## <chr> <dbl> <dbl> <dbl>
## 1 Mike 76 84 87
## 2 Linda 88 90 73
## 3 Sam 78 74 80
## 4 Esther 68 75 74
## 5 Mary 65 67 64
Reshaping data (longer)
# Lets restructure the data so that there is a single column with all the results
# And the rows represent a person in a given term for a given test
data_long <- data1 %>%
pivot_longer(starts_with('term'), values_to = 'Results', names_to = 'Term')
data_long
## # A tibble: 30 × 7
## id name phone sex_and_age test_number Term Results
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 1 Mike 134 m_12 test 1 term_1 76
## 2 1 Mike 134 m_12 test 1 term_2 84
## 3 1 Mike 134 m_12 test 1 term_3 87
## 4 2 Linda 270 f_13 test 1 term_1 88
## 5 2 Linda 270 f_13 test 1 term_2 90
## 6 2 Linda 270 f_13 test 1 term_3 73
## 7 3 Sam 210 m_11 test 1 term_1 78
## 8 3 Sam 210 m_11 test 1 term_2 74
## 9 3 Sam 210 m_11 test 1 term_3 80
## 10 4 Esther 617 f_12 test 1 term_1 68
## # ℹ 20 more rows
Reshaping data (wider)
# Lets restructure the data so that there is a separate column for results of test 1 and 2
# And the rows represent a person
data_wider <- data1 %>%
pivot_wider(values_from = starts_with('term'), names_from = 'test_number')
data_wider
## # A tibble: 5 × 10
## id name phone sex_and_age `term_1_test 1` `term_1_test 2` `term_2_test 1`
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 1 Mike 134 m_12 76 85 84
## 2 2 Linda 270 f_13 88 87 90
## 3 3 Sam 210 m_11 78 80 74
## 4 4 Esther 617 f_12 68 70 75
## 5 5 Mary 114 f_14 65 68 67
## # ℹ 3 more variables: `term_2_test 2` <dbl>, `term_3_test 1` <dbl>,
## # `term_3_test 2` <dbl>
Merging / joining
# New dataset
data2 <- tribble(
~id, ~name, ~country,
"1","Mike","UK",
"2","Linda","USA",
"3","Sam","Canada",
"4","Esther","USA",
"5","Mary","Ireland",
)
kbl(data2, format = 'html') %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
id
|
name
|
country
|
1
|
Mike
|
UK
|
2
|
Linda
|
USA
|
3
|
Sam
|
Canada
|
4
|
Esther
|
USA
|
5
|
Mary
|
Ireland
|
# Add this data to the main data, to create a country column
## Using merge
combined_data <- merge(data1, data2, by = c("id", "name"))
## Using left_join
combined_data <- data1 %>%
left_join(data2, by = c("id", "name"))
combined_data
## # A tibble: 10 × 9
## id name phone sex_and_age test_number term_1 term_2 term_3 country
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <chr>
## 1 1 Mike 134 m_12 test 1 76 84 87 UK
## 2 2 Linda 270 f_13 test 1 88 90 73 USA
## 3 3 Sam 210 m_11 test 1 78 74 80 Canada
## 4 4 Esther 617 f_12 test 1 68 75 74 USA
## 5 5 Mary 114 f_14 test 1 65 67 64 Ireland
## 6 1 Mike 134 m_12 test 2 85 80 90 UK
## 7 2 Linda 270 f_13 test 2 87 82 94 USA
## 8 3 Sam 210 m_11 test 2 80 87 80 Canada
## 9 4 Esther 617 f_12 test 2 70 75 78 USA
## 10 5 Mary 114 f_14 test 2 68 70 63 Ireland
Split out sex and age
sex_age <- str_split_fixed(data1$sex_and_age, pattern = "_", n = 2) %>%
data.frame()
colnames(sex_age) <- c("Sex", "Age")
data3 <- data.frame(data1, sex_age) %>%
select(-sex_and_age)
data3
## id name phone test_number term_1 term_2 term_3 Sex Age
## 1 1 Mike 134 test 1 76 84 87 m 12
## 2 2 Linda 270 test 1 88 90 73 f 13
## 3 3 Sam 210 test 1 78 74 80 m 11
## 4 4 Esther 617 test 1 68 75 74 f 12
## 5 5 Mary 114 test 1 65 67 64 f 14
## 6 1 Mike 134 test 2 85 80 90 m 12
## 7 2 Linda 270 test 2 87 82 94 f 13
## 8 3 Sam 210 test 2 80 87 80 m 11
## 9 4 Esther 617 test 2 70 75 78 f 12
## 10 5 Mary 114 test 2 68 70 63 f 14