Load packages and data

library(tidyverse) 
# Import the visitor numbers from the CSV file in the data/ folder.
visitors <- read_csv(file = "data/UK-visitor-numbers.csv")

Wrangling Data

Question 1

How many tourist attractions are there in the data set?

# Number of rows in the data set, returned as a 1 x 1 tibble with column `n`.
visitors %>% summarise(n = n())
## # A tibble: 1 × 1
##       n
##   <int>
## 1   348

Exercise a.

Create a frequency table of the number of tourist attractions in the data set by region.

# Frequency table: one row per region with the number of rows in that region.
visitors %>%
  group_by(region) %>%
  tally()
## # A tibble: 12 × 2
##    region                       n
##    <chr>                    <int>
##  1 East Midlands                9
##  2 East of England             26
##  3 London                      39
##  4 North East                  25
##  5 North West                  21
##  6 Northern Ireland             9
##  7 Scotland                    97
##  8 South East                  34
##  9 South West                  30
## 10 Wales                        2
## 11 West Midlands               35
## 12 Yorkshire and the Humber    21

Exercise b.

Create a frequency table by admission and setting.

# Two-way frequency table of admission type by setting.
# `.groups = "drop"` returns an ungrouped tibble, as count() would.
visitors %>%
  group_by(admission, setting) %>%
  summarise(n = n(), .groups = "drop")
## # A tibble: 9 × 3
##   admission setting     n
##   <chr>     <chr>   <int>
## 1 Charged   I          21
## 2 Charged   M          12
## 3 Charged   O           5
## 4 Free      I          77
## 5 Free      M           8
## 6 Free      O          43
## 7 Members   I          23
## 8 Members   M         106
## 9 Members   O          53

Question 2

What are the variable data types?

# Data type of a single column, extracted by name.
class(visitors[["n_2022"]])
## [1] "numeric"
# Data type of every column at once. across(everything(), ...) is the current
# replacement for the superseded scoped verb summarise_all().
visitors %>% summarise(across(everything(), class))
## # A tibble: 1 × 6
##   attraction n_2021  n_2022  admission setting   region   
##   <chr>      <chr>   <chr>   <chr>     <chr>     <chr>    
## 1 character  numeric numeric character character character

Question 3

Which attraction had the largest number of visitors in 2022?

# Sort the attractions from most to fewest visitors in 2022; the answer is the
# first row of the printed tibble.
arrange(visitors, desc(n_2022))
## # A tibble: 348 × 6
##    attraction                             n_2021 n_2022 admission setting region
##    <chr>                                   <dbl>  <dbl> <chr>     <chr>   <chr> 
##  1 The Crown Estate, Windsor Great Park   5.40e6 5.64e6 Free      O       South…
##  2 Natural History Museum (South Kensing… 1.57e6 4.65e6 Free      I       London
##  3 The British Museum                     1.33e6 4.10e6 Free      I       London
##  4 Tate Modern                            1.16e6 3.88e6 Free      I       London
##  5 Southbank Centre                       7.87e5 2.95e6 Free      I       London
##  6 The National Gallery                   7.09e5 2.73e6 Free      I       London
##  7 V&A South Kensington                   8.58e5 2.37e6 Free      I       London
##  8 Somerset House                         9.85e5 2.35e6 Free      M       London
##  9 Science Museum                         9.56e5 2.33e6 Free      I       London
## 10 Tower of London                        5.26e5 2.02e6 Members   M       London
## # ℹ 338 more rows

Exercise c.

What are the top 10 most visited attractions in 2021?

# Order by 2021 visitor numbers (largest first) and keep the first ten rows.
visitors %>%
  arrange(desc(n_2021)) %>%
  slice_head(n = 10)
## # A tibble: 10 × 6
##    attraction                             n_2021 n_2022 admission setting region
##    <chr>                                   <dbl>  <dbl> <chr>     <chr>   <chr> 
##  1 The Crown Estate, Windsor Great Park   5.40e6 5.64e6 Free      O       South…
##  2 Royal Botanic Gardens Kew              1.96e6 1.96e6 Members   M       London
##  3 Natural History Museum (South Kensing… 1.57e6 4.65e6 Free      I       London
##  4 RHS Garden Wisley                      1.41e6 1.49e6 Members   O       South…
##  5 The British Museum                     1.33e6 4.10e6 Free      I       London
##  6 Tate Modern                            1.16e6 3.88e6 Free      I       London
##  7 Somerset House                         9.85e5 2.35e6 Free      M       London
##  8 Science Museum                         9.56e5 2.33e6 Free      I       London
##  9 Jeskyns Community Woodland             8.79e5 4.47e5 Free      O       South…
## 10 V&A South Kensington                   8.58e5 2.37e6 Free      I       London

Question 4

What is the admission charge for the "National Museum of Scotland"?

# Keep only the row whose attraction name matches exactly; the admission
# column of that row answers the question.
visitors %>% filter(attraction %in% "National Museum of Scotland")
## # A tibble: 1 × 6
##   attraction                  n_2021  n_2022 admission setting region  
##   <chr>                        <dbl>   <dbl> <chr>     <chr>   <chr>   
## 1 National Museum of Scotland 660741 1973751 Free      I       Scotland

Exercise d.

Which attraction had exactly 565,772 visitors in 2022?

# n_2022 is stored as a double, so compare with near() (equality within a
# small numerical tolerance) rather than exact `==` on floating-point values.
visitors %>% filter(near(n_2022, 565772))
## # A tibble: 1 × 6
##   attraction                        n_2021 n_2022 admission setting region    
##   <chr>                              <dbl>  <dbl> <chr>     <chr>   <chr>     
## 1 Knowsley Safari and Knowsley Hall     NA 565772 Members   M       North West

Exercise e.

How many attractions had more than 1 million visitors in 2022?

# Keep the attractions with strictly more than one million visitors in 2022,
# then report how many rows remain.
visitors %>%
  filter(n_2022 > 1e6) %>%
  summarise(n = n())
## # A tibble: 1 × 1
##       n
##   <int>
## 1    22

Question 5

How many "O"utside attractions in the "Yorkshire and the Humber" region give "Members" free admission and had more than 100,000 visitors in 2022?

# Count the outdoor ("O") attractions in Yorkshire and the Humber with
# "Members" admission whose 2022 visitor numbers strictly exceed 100,000.
# "More than 100,000" excludes exactly 100,000, so the comparison is `>`
# rather than `>=`.
visitors %>%
  filter(
    setting == "O",
    region == "Yorkshire and the Humber",
    admission == "Members",
    n_2022 > 100000
  ) %>%
  count()
## # A tibble: 1 × 1
##       n
##   <int>
## 1     3

Exercise f.

How many attractions had between 50,000 and 100,000 visitors in 2022?

# Count the attractions whose 2022 visitor numbers lie strictly between
# 50,000 and 100,000 (both end points excluded).
visitors %>%
  filter(n_2022 > 50000 & n_2022 < 100000) %>%
  tally()
## # A tibble: 1 × 1
##       n
##   <int>
## 1    50

Exercise g.

How many regions have more than 50 tourist attractions in the data set? (Hint: You will need to tabulate the data before filtering.)

# Tabulate the number of attractions per region first, then keep only the
# regions whose count exceeds 50.
visitors %>%
  group_by(region) %>%
  summarise(n = n()) %>%
  filter(n > 50)
## # A tibble: 1 × 2
##   region       n
##   <chr>    <int>
## 1 Scotland    97

Summarising Data

Question 6

What are the mean and median visitor numbers in 2022 across all attractions?

# Mean and median of the 2022 visitor numbers over all attractions.
# `.names` builds the output column names mean_2022 and med_2022.
visitors %>%
  summarise(
    across(n_2022, list(mean = mean, med = median), .names = "{.fn}_2022")
  )
## # A tibble: 1 × 2
##   mean_2022 med_2022
##       <dbl>    <dbl>
## 1   351942.  184640.

Exercise h.

Perform the same calculation for the 2021 admissions data.

# Same summary for 2021. na.rm is deliberately left at its default (FALSE),
# so any missing value in n_2021 makes both statistics NA.
summarise(
  visitors,
  mean_2021 = mean(n_2021),
  med_2021 = median(n_2021)
)
## # A tibble: 1 × 2
##   mean_2021 med_2021
##       <dbl>    <dbl>
## 1        NA       NA

All values in the output are NA!

Exercise i.

What does NA stand for, and why are you getting this as your answer to the previous question?

NA stands for Not Available. This means there is a missing value in the data set. There can be many reasons for why data might be missing – perhaps the venue does not collect this data or perhaps they failed to submit their data to the Association of Leading Visitor Attractions. Alternatively, it might be possible that the attraction only started in 2022 or perhaps it remained closed in 2021 due to COVID-19 risks. It is not possible to determine the correct reason, and manually specifying a value of 0 would not be appropriate as it would imply that the attraction was open but nobody visited!

The mean and median statistics require a full data set in order to be calculated. If there exists at least one NA then the only appropriate mean and median calculation would also be NA.

Exercise j.

Look at the help pages for the mean() and median() commands to see what the input argument na.rm does. Edit your code from exercise h so that it computes the summary statistics where data is available.

To open the manual page for the mean() command either:

  • Go to the ‘help’ tab (bottom right panel) and type mean in the search box, or
  • Run ?mean on the console (bottom left panel).

The input argument na.rm is a logical input that takes the value TRUE or FALSE. If the value is TRUE then any entries that are NA are removed before calculating the statistic.

# 2021 summary using only the attractions with data available:
# na.rm = TRUE removes the NA entries before each statistic is calculated.
with(
  visitors,
  tibble(
    mean_2021 = mean(n_2021, na.rm = TRUE),
    med_2021 = median(n_2021, na.rm = TRUE)
  )
)
## # A tibble: 1 × 2
##   mean_2021 med_2021
##       <dbl>    <dbl>
## 1   232431.   129829

Question 7

Which setting (inside, outside or mixed) has the largest mean visitor numbers in 2022?

# Mean and median 2022 visitor numbers within each setting (I, M, O).
# `.names` builds the output column names mean_2022 and med_2022.
visitors %>%
  group_by(setting) %>%
  summarise(
    across(n_2022, list(mean = mean, med = median), .names = "{.fn}_2022")
  )
## # A tibble: 3 × 3
##   setting mean_2022 med_2022
##   <chr>       <dbl>    <dbl>
## 1 I         488522.  204823 
## 2 M         284800.  198374.
## 3 O         272077.  127708

Exercise k.

Observe in question 7 that the mean statistics across all settings are much larger than the corresponding median statistics. Discuss in your group what this suggests about the data.

A reminder of the definitions:

  • The mean is the average value of the data and it is calculated by adding together all of the numbers and dividing by how many numbers there are.
  • The median is the mid-point in the data where 50% of the data are larger than the median statistic, and 50% of the data are lower.

The mean statistic is a very well understood summary of data by a wide range of people, however it has one important limitation - it is highly sensitive to very extreme values compared to the other values of the data. For example:

# Six small values followed by one extreme value of 7 billion.
x <- c(1:6, 7e9)
mean(x)      # about 10^9: dragged upwards by the extreme value
## [1] 1e+09
median(x)    # 4: the middle value ignores how extreme the largest one is
## [1] 4

Clearly the last value of 7 billion is very extreme compared to the other values, which therefore pulls the mean statistic towards a larger value of about 1 billion. However, the median statistic does not care if the last value is 7 or 7 billion.

In terms of the visitor numbers, a larger mean statistic compared to the median statistic implies that the range of the upper half of the data set is wider than the range of the lower half. In other words, the shape of the visitor data is positively skewed.

Question 8

What is the interquartile range (the width of the middle 50% of the data set, between the lower and upper quartiles) of the 2022 visitor numbers for each of the four nations of the UK?

# Add a `nation` column. Northern Ireland, Scotland and Wales each appear as
# their own region, so their region name is reused; every other region is
# labelled as England.
visitors_with_nations <- visitors %>%
  mutate(
    nation = if_else(
      region %in% c("Northern Ireland", "Scotland", "Wales"),
      region,
      "England"
    )
  )

# Interquartile range of the 2022 visitor numbers within each nation,
# written out as the upper quartile minus the lower quartile.
visitors_with_nations %>%
  group_by(nation) %>%
  summarise(
    IQR_2022 = diff(quantile(n_2022, c(0.25, 0.75), names = FALSE))
  )
## # A tibble: 4 × 2
##   nation           IQR_2022
##   <chr>               <dbl>
## 1 England           350362.
## 2 Northern Ireland  311046 
## 3 Scotland          127986 
## 4 Wales             103368.

Exercise l.

How many tourist attractions are there in each of the 4 nations? From this, discuss in your group how reliable you think the inter-quartile estimates are.

# Number of attractions in each nation, i.e. the sample size behind each
# interquartile range estimate.
visitors_with_nations %>%
  group_by(nation) %>%
  tally()
## # A tibble: 4 × 2
##   nation               n
##   <chr>            <int>
## 1 England            240
## 2 Northern Ireland     9
## 3 Scotland            97
## 4 Wales                2

There are only 2 attractions in Wales. This is a very low sample size, meaning that the inter-quartile range is not a very accurate estimate.

Challenging Exercises

Exercise m.

Within each of the 4 nations, what is the proportion of tourist attractions that have free admission for all visitors?

# Percentage of attractions with free admission within each nation.
# Summarising a logical condition (instead of counting and then filtering on
# admission == "Free") keeps nations that have no free attractions in the
# result with a value of 0, rather than silently dropping them.
visitors_with_nations %>%
  group_by(nation) %>%
  summarise(
    n_free = sum(admission == "Free"),      # number of free attractions
    pct = 100 * mean(admission == "Free")   # share of the nation's attractions
  ) %>%
  arrange(desc(pct))
## # A tibble: 4 × 3
##   nation           n_free   pct
##   <chr>             <int> <dbl>
## 1 England              93  38.8
## 2 Scotland             34  35.1
## 3 Northern Ireland      1  11.1
## 4 Wales                 0   0

Exercise n.

Calculate the percentage change in visitor admissions from 2021 to 2022. Of the tourist attractions in Scotland, sort into increasing numerical order the types of admission charges based on the mean percentage change in visitor numbers.

# Mean percentage change in visitor numbers from 2021 to 2022 for Scottish
# attractions, by admission type, sorted from smallest to largest.
# The relative change is multiplied by 100 so that it really is a percentage
# (e.g. 4.65 means +4.65%). Attractions with a missing 2021 or 2022 figure
# give an NA change and are excluded from the mean by na.rm = TRUE.
visitors_with_nations %>%
  filter(nation == "Scotland") %>%
  mutate(pct_change = 100 * (n_2022 - n_2021) / n_2021) %>%
  group_by(admission) %>%
  summarise(mean_pct_change = mean(pct_change, na.rm = TRUE)) %>%
  arrange(mean_pct_change)
## # A tibble: 3 × 2
##   admission mean_pct_change
##   <chr>               <dbl>
## 1 Charged              4.65
## 2 Members            138.
## 3 Free               303.