Example dataset

id name phone sex_and_age test_number term_1 term_2 term_3
1 Mike 134 m_12 test 1 76 84 87
2 Linda 270 f_13 test 1 88 90 73
3 Sam 210 m_11 test 1 78 74 80
4 Esther 617 f_12 test 1 68 75 74
5 Mary 114 f_14 test 1 65 67 64
1 Mike 134 m_12 test 2 85 80 90
2 Linda 270 f_13 test 2 87 82 94
3 Sam 210 m_11 test 2 80 87 80
4 Esther 617 f_12 test 2 70 75 78
5 Mary 114 f_14 test 2 68 70 63

Installing / loading tidyverse

# install.packages("tidyverse")
library(tidyverse)

# OR

# install.packages("dplyr")
library(dplyr)

# install.packages("tidyr")
library(tidyr)

Filter and select

# Use filter to subset to specific rows

filter(data1, test_number == 'test 1')
## # A tibble: 5 × 8
##   id    name   phone sex_and_age test_number term_1 term_2 term_3
##   <chr> <chr>  <chr> <chr>       <chr>        <dbl>  <dbl>  <dbl>
## 1 1     Mike   134   m_12        test 1          76     84     87
## 2 2     Linda  270   f_13        test 1          88     90     73
## 3 3     Sam    210   m_11        test 1          78     74     80
## 4 4     Esther 617   f_12        test 1          68     75     74
## 5 5     Mary   114   f_14        test 1          65     67     64
# Use select to subset to specific columns

select(data1, name, phone)
## # A tibble: 10 × 2
##    name   phone
##    <chr>  <chr>
##  1 Mike   134  
##  2 Linda  270  
##  3 Sam    210  
##  4 Esther 617  
##  5 Mary   114  
##  6 Mike   134  
##  7 Linda  270  
##  8 Sam    210  
##  9 Esther 617  
## 10 Mary   114
select(data1, starts_with('term'))
## # A tibble: 10 × 3
##    term_1 term_2 term_3
##     <dbl>  <dbl>  <dbl>
##  1     76     84     87
##  2     88     90     73
##  3     78     74     80
##  4     68     75     74
##  5     65     67     64
##  6     85     80     90
##  7     87     82     94
##  8     80     87     80
##  9     70     75     78
## 10     68     70     63

Pipe

# These three are equivalent

filter(data1, test_number == 'test 1')
## # A tibble: 5 × 8
##   id    name   phone sex_and_age test_number term_1 term_2 term_3
##   <chr> <chr>  <chr> <chr>       <chr>        <dbl>  <dbl>  <dbl>
## 1 1     Mike   134   m_12        test 1          76     84     87
## 2 2     Linda  270   f_13        test 1          88     90     73
## 3 3     Sam    210   m_11        test 1          78     74     80
## 4 4     Esther 617   f_12        test 1          68     75     74
## 5 5     Mary   114   f_14        test 1          65     67     64
data1 %>% filter(test_number == 'test 1')
## # A tibble: 5 × 8
##   id    name   phone sex_and_age test_number term_1 term_2 term_3
##   <chr> <chr>  <chr> <chr>       <chr>        <dbl>  <dbl>  <dbl>
## 1 1     Mike   134   m_12        test 1          76     84     87
## 2 2     Linda  270   f_13        test 1          88     90     73
## 3 3     Sam    210   m_11        test 1          78     74     80
## 4 4     Esther 617   f_12        test 1          68     75     74
## 5 5     Mary   114   f_14        test 1          65     67     64
data1 %>% filter(., test_number == 'test 1')
## # A tibble: 5 × 8
##   id    name   phone sex_and_age test_number term_1 term_2 term_3
##   <chr> <chr>  <chr> <chr>       <chr>        <dbl>  <dbl>  <dbl>
## 1 1     Mike   134   m_12        test 1          76     84     87
## 2 2     Linda  270   f_13        test 1          88     90     73
## 3 3     Sam    210   m_11        test 1          78     74     80
## 4 4     Esther 617   f_12        test 1          68     75     74
## 5 5     Mary   114   f_14        test 1          65     67     64
# We can do multiple operations at once with the pipe operator

data1 %>%
  filter(test_number == 'test 1') %>%
  select(name, starts_with('term'))
## # A tibble: 5 × 4
##   name   term_1 term_2 term_3
##   <chr>   <dbl>  <dbl>  <dbl>
## 1 Mike       76     84     87
## 2 Linda      88     90     73
## 3 Sam        78     74     80
## 4 Esther     68     75     74
## 5 Mary       65     67     64
# Equivalent to doing in two stages

data_tmp <- data1 %>% filter(test_number == 'test 1')

data_tmp %>% select(name, starts_with('term'))
## # A tibble: 5 × 4
##   name   term_1 term_2 term_3
##   <chr>   <dbl>  <dbl>  <dbl>
## 1 Mike       76     84     87
## 2 Linda      88     90     73
## 3 Sam        78     74     80
## 4 Esther     68     75     74
## 5 Mary       65     67     64

Reshaping data (longer)

# Lets restructure the data so that there is a single column with all the results
# And the rows represent a person in a given term for a given test

data_long <- data1 %>%
  pivot_longer(starts_with('term'), values_to = 'Results', names_to = 'Term')

data_long
## # A tibble: 30 × 7
##    id    name   phone sex_and_age test_number Term   Results
##    <chr> <chr>  <chr> <chr>       <chr>       <chr>    <dbl>
##  1 1     Mike   134   m_12        test 1      term_1      76
##  2 1     Mike   134   m_12        test 1      term_2      84
##  3 1     Mike   134   m_12        test 1      term_3      87
##  4 2     Linda  270   f_13        test 1      term_1      88
##  5 2     Linda  270   f_13        test 1      term_2      90
##  6 2     Linda  270   f_13        test 1      term_3      73
##  7 3     Sam    210   m_11        test 1      term_1      78
##  8 3     Sam    210   m_11        test 1      term_2      74
##  9 3     Sam    210   m_11        test 1      term_3      80
## 10 4     Esther 617   f_12        test 1      term_1      68
## # ℹ 20 more rows

Reshaping data (wider)

# Lets restructure the data so that there is a separate column for results of test 1 and 2
# And the rows represent a person

data_wider <- data1 %>%
  pivot_wider(values_from = starts_with('term'), names_from = 'test_number')

data_wider
## # A tibble: 5 × 10
##   id    name   phone sex_and_age `term_1_test 1` `term_1_test 2` `term_2_test 1`
##   <chr> <chr>  <chr> <chr>                 <dbl>           <dbl>           <dbl>
## 1 1     Mike   134   m_12                     76              85              84
## 2 2     Linda  270   f_13                     88              87              90
## 3 3     Sam    210   m_11                     78              80              74
## 4 4     Esther 617   f_12                     68              70              75
## 5 5     Mary   114   f_14                     65              68              67
## # ℹ 3 more variables: `term_2_test 2` <dbl>, `term_3_test 1` <dbl>,
## #   `term_3_test 2` <dbl>

Merging / joining

# New dataset

data2 <- tribble(
  ~id, ~name, ~country,
  "1","Mike","UK",
  "2","Linda","USA",
  "3","Sam","Canada",
  "4","Esther","USA",
  "5","Mary","Ireland",
)

kbl(data2, format = 'html') %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
id name country
1 Mike UK
2 Linda USA
3 Sam Canada
4 Esther USA
5 Mary Ireland
# Add this data to the main data, to create a country column

## Using merge

combined_data <- merge(data1, data2, by = c("id", "name"))

## Using left_join

combined_data <- data1 %>%
  left_join(data2, by = c("id", "name"))

combined_data
## # A tibble: 10 × 9
##    id    name   phone sex_and_age test_number term_1 term_2 term_3 country
##    <chr> <chr>  <chr> <chr>       <chr>        <dbl>  <dbl>  <dbl> <chr>  
##  1 1     Mike   134   m_12        test 1          76     84     87 UK     
##  2 2     Linda  270   f_13        test 1          88     90     73 USA    
##  3 3     Sam    210   m_11        test 1          78     74     80 Canada 
##  4 4     Esther 617   f_12        test 1          68     75     74 USA    
##  5 5     Mary   114   f_14        test 1          65     67     64 Ireland
##  6 1     Mike   134   m_12        test 2          85     80     90 UK     
##  7 2     Linda  270   f_13        test 2          87     82     94 USA    
##  8 3     Sam    210   m_11        test 2          80     87     80 Canada 
##  9 4     Esther 617   f_12        test 2          70     75     78 USA    
## 10 5     Mary   114   f_14        test 2          68     70     63 Ireland

Split out sex and age

sex_age <- str_split_fixed(data1$sex_and_age, pattern = "_", n = 2) %>%
  data.frame()

colnames(sex_age) <- c("Sex", "Age")

data3 <- data.frame(data1, sex_age) %>%
  select(-sex_and_age)

data3
##    id   name phone test_number term_1 term_2 term_3 Sex Age
## 1   1   Mike   134      test 1     76     84     87   m  12
## 2   2  Linda   270      test 1     88     90     73   f  13
## 3   3    Sam   210      test 1     78     74     80   m  11
## 4   4 Esther   617      test 1     68     75     74   f  12
## 5   5   Mary   114      test 1     65     67     64   f  14
## 6   1   Mike   134      test 2     85     80     90   m  12
## 7   2  Linda   270      test 2     87     82     94   f  13
## 8   3    Sam   210      test 2     80     87     80   m  11
## 9   4 Esther   617      test 2     70     75     78   f  12
## 10  5   Mary   114      test 2     68     70     63   f  14