# Load the tidyverse (provides read_csv(), the dplyr verbs and the %>% pipe),
# then read the visitor-number data into a tibble.
library(tidyverse)
visitors <- read_csv("data/UK-visitor-numbers.csv")
How many tourist attractions are there in the data set?
# With no grouping columns, count() returns the total number of rows.
count(visitors)
## # A tibble: 1 × 1
##       n
##   <int>
## 1   348Create a frequency table of the number of tourist attractions in
the data set by region.
# Frequency table: one row per region with its number of attractions.
count(visitors, region)
## # A tibble: 12 × 2
##    region                       n
##    <chr>                    <int>
##  1 East Midlands                9
##  2 East of England             26
##  3 London                      39
##  4 North East                  25
##  5 North West                  21
##  6 Northern Ireland             9
##  7 Scotland                    97
##  8 South East                  34
##  9 South West                  30
## 10 Wales                        2
## 11 West Midlands               35
## 12 Yorkshire and the Humber    21Create a frequency table by admission and
setting.
# Two-way frequency table: one row per admission/setting combination.
count(visitors, admission, setting)
## # A tibble: 9 × 3
##   admission setting     n
##   <chr>     <chr>   <int>
## 1 Charged   I          21
## 2 Charged   M          12
## 3 Charged   O           5
## 4 Free      I          77
## 5 Free      M           8
## 6 Free      O          43
## 7 Members   I          23
## 8 Members   M         106
## 9 Members   O          53What are the variable data types?
# Class of a single column.
class(visitors$n_2022)
## [1] "numeric"
# Class of every column at once; across() replaces the superseded
# summarise_all().
visitors %>% summarise(across(everything(), class))
## # A tibble: 1 × 6
##   attraction n_2021  n_2022  admission setting   region   
##   <chr>      <chr>   <chr>   <chr>     <chr>     <chr>    
## 1 character  numeric numeric character character character
Which attraction had the greatest number of visitors in 2022?
# Sort so the attraction with the most 2022 visitors is in the first row.
arrange(visitors, desc(n_2022))
## # A tibble: 348 × 6
##    attraction                             n_2021 n_2022 admission setting region
##    <chr>                                   <dbl>  <dbl> <chr>     <chr>   <chr> 
##  1 The Crown Estate, Windsor Great Park   5.40e6 5.64e6 Free      O       South…
##  2 Natural History Museum (South Kensing… 1.57e6 4.65e6 Free      I       London
##  3 The British Museum                     1.33e6 4.10e6 Free      I       London
##  4 Tate Modern                            1.16e6 3.88e6 Free      I       London
##  5 Southbank Centre                       7.87e5 2.95e6 Free      I       London
##  6 The National Gallery                   7.09e5 2.73e6 Free      I       London
##  7 V&A South Kensington                   8.58e5 2.37e6 Free      I       London
##  8 Somerset House                         9.85e5 2.35e6 Free      M       London
##  9 Science Museum                         9.56e5 2.33e6 Free      I       London
## 10 Tower of London                        5.26e5 2.02e6 Members   M       London
## # ℹ 338 more rowsWhat are the top 10 most visited attractions in 2021?
# Order by 2021 visitors, largest first, and keep the first ten rows.
visitors %>%
  arrange(desc(n_2021)) %>%
  slice_head(n = 10)
## # A tibble: 10 × 6
##    attraction                             n_2021 n_2022 admission setting region
##    <chr>                                   <dbl>  <dbl> <chr>     <chr>   <chr> 
##  1 The Crown Estate, Windsor Great Park   5.40e6 5.64e6 Free      O       South…
##  2 Royal Botanic Gardens Kew              1.96e6 1.96e6 Members   M       London
##  3 Natural History Museum (South Kensing… 1.57e6 4.65e6 Free      I       London
##  4 RHS Garden Wisley                      1.41e6 1.49e6 Members   O       South…
##  5 The British Museum                     1.33e6 4.10e6 Free      I       London
##  6 Tate Modern                            1.16e6 3.88e6 Free      I       London
##  7 Somerset House                         9.85e5 2.35e6 Free      M       London
##  8 Science Museum                         9.56e5 2.33e6 Free      I       London
##  9 Jeskyns Community Woodland             8.79e5 4.47e5 Free      O       South…
## 10 V&A South Kensington                   8.58e5 2.37e6 Free      I       LondonWhat is the admission charge for the
"National Museum of Scotland"?
# Keep only the row whose attraction name matches exactly.
filter(visitors, attraction == "National Museum of Scotland")
## # A tibble: 1 × 6
##   attraction                  n_2021  n_2022 admission setting region  
##   <chr>                        <dbl>   <dbl> <chr>     <chr>   <chr>   
## 1 National Museum of Scotland 660741 1973751 Free      I       ScotlandWhich attraction had exactly 565,772 visitors in 2022?
# Keep only rows whose 2022 visitor count equals 565,772.
filter(visitors, n_2022 == 565772)
## # A tibble: 1 × 6
##   attraction                        n_2021 n_2022 admission setting region    
##   <chr>                              <dbl>  <dbl> <chr>     <chr>   <chr>     
## 1 Knowsley Safari and Knowsley Hall     NA 565772 Members   M       North West
How many attractions had more than 1 million visitors in 2022?
# Keep attractions with over one million visitors in 2022, then count the
# remaining rows.
count(filter(visitors, n_2022 > 1e6))
## # A tibble: 1 × 1
##       n
##   <int>
## 1    22
How many "O"utside attractions are there in the
"Yorkshire and the Humber" region that give
"Members" free admission and had more than 100,000
visitors in 2022?
# Every condition passed to filter() must hold for a row to be kept.
# The question asks for strictly more than 100,000 visitors, so the
# comparison is `>` rather than `>=`.
visitors %>%
  filter(
    setting == "O",
    region == "Yorkshire and the Humber",
    admission == "Members",
    n_2022 > 100000
  ) %>%
  count()
## # A tibble: 1 × 1
##       n
##   <int>
## 1     3How many attractions had between 50,000 and 100,000 visitors in 2022?
# Strictly between the two bounds: both end points are excluded.
visitors %>%
  filter(n_2022 > 50000 & n_2022 < 100000) %>%
  count()
## # A tibble: 1 × 1
##       n
##   <int>
## 1    50
How many regions have more than 50 tourist attractions in the data set? (Hint: You will need to tabulate the data before filtering.)
# Tabulate attractions per region first, then keep the regions whose
# count n exceeds 50.
filter(count(visitors, region), n > 50)
## # A tibble: 1 × 2
##   region       n
##   <chr>    <int>
## 1 Scotland    97What are the mean and median visitor numbers in 2022 across all attractions?
# Mean and median of the 2022 visitor numbers over all attractions.
summarise(
  visitors,
  mean_2022 = mean(n_2022),
  med_2022 = median(n_2022)
)
## # A tibble: 1 × 2
##   mean_2022 med_2022
##       <dbl>    <dbl>
## 1   351942.  184640.Perform the same calculation for the 2021 admissions data.
# Same summary for 2021. No na.rm is given, so any missing value in n_2021
# makes both statistics NA.
summarise(
  visitors,
  mean_2021 = mean(n_2021),
  med_2021 = median(n_2021)
)
## # A tibble: 1 × 2
##   mean_2021 med_2021
##       <dbl>    <dbl>
## 1        NA       NA
All values in the output are NA!
What does NA stand for, and why are you getting this
as your answer to the previous question?
NA stands for Not Available. This means
there is a missing value in the data set. There can be many reasons for
why data might be missing – perhaps the venue does not collect this data
or perhaps they failed to submit their data to the Association of
Leading Visitor Attractions. Alternatively, it might be possible that
the attraction only started in 2022 or perhaps it remained closed in
2021 due to COVID-19 risks. It is not possible to determine the correct
reason, and manually specifying a value of 0 would not be
appropriate as it would imply that the attraction was open but nobody
visited!
The mean and median statistics require a full data set in order to be
calculated. If there exists at least one NA then the only
appropriate mean and median calculation would also be
NA.
Look at the help pages for the mean() and
median() commands to see what the input argument
na.rm does. Edit your code from exercise h so that it
computes the summary statistics where data is available.
To open the manual page for the mean() command
either:
type mean in the Help search box, or
type ?mean on the console (bottom left panel).
The input argument na.rm is a logical input that takes
the value TRUE or FALSE. If the value is
TRUE then any entries that are NA are removed
before calculating the statistic.
# na.rm = TRUE drops the missing 2021 values before each statistic is
# computed.
summarise(
  visitors,
  mean_2021 = mean(n_2021, na.rm = TRUE),
  med_2021 = median(n_2021, na.rm = TRUE)
)
## # A tibble: 1 × 2
##   mean_2021 med_2021
##       <dbl>    <dbl>
## 1   232431.   129829Which setting (inside, outside or mixed) has the largest mean visitor numbers in 2022?
# Mean and median 2022 visitor numbers computed separately for each setting.
summarise(
  group_by(visitors, setting),
  mean_2022 = mean(n_2022),
  med_2022 = median(n_2022)
)
## # A tibble: 3 × 3
##   setting mean_2022 med_2022
##   <chr>       <dbl>    <dbl>
## 1 I         488522.  204823 
## 2 M         284800.  198374.
## 3 O         272077.  127708Observe in question 6 that the mean statistics across all settings are much larger than the corresponding median statistics. Discuss in your group what this suggests about the data.
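A reminder of the definitions: the mean is the sum of all the values divided by the number of values, and the median is the middle value once the data are sorted, so that half of the values lie below it and half above.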
The mean statistic is a very well understood summary of data by a wide range of people, however it has one important limitation - it is highly sensitive to very extreme values compared to the other values of the data. For example:
# Six small values followed by one extreme value of 7 billion.
x <- c(1:6, 7e9)
mean(x)      # about 10^9
## [1] 1e+09
median(x)    # 4
## [1] 4
Clearly the last value of 7 billion is very extreme compared to the other values, which therefore pulls the mean statistic towards a larger value of about 1 billion. However, the median statistic does not care if the last value is 7 or 7 billion.
In terms of the visitor numbers, a larger mean statistic compared to the median statistic implies that the upper half of the data set is spread over a wider range than the lower half. In other words, the shape of the visitor data is positively skewed.
What is the interquartile range (the width of the middle 50% of the data set, between the lower and upper quartiles) for each of the four nations of the UK?
# Add a nation column: the three regions that are themselves nations keep
# their name, and every other region is labelled "England".
visitors_with_nations <- visitors %>%
  mutate(
    nation = if_else(
      region %in% c("Northern Ireland", "Scotland", "Wales"),
      region,
      "England"
    )
  )
# Interquartile range of the 2022 visitor numbers within each nation.
visitors_with_nations %>%
  group_by(nation) %>%
  summarise(IQR_2022 = IQR(n_2022))
## # A tibble: 4 × 2
##   nation           IQR_2022
##   <chr>               <dbl>
## 1 England           350362.
## 2 Northern Ireland  311046 
## 3 Scotland          127986 
## 4 Wales             103368.How many tourist attractions are there in each of the 4 nations? From this, discuss in your group how reliable you think the inter-quartile estimates are.
# Number of attractions in each nation.
count(visitors_with_nations, nation)
## # A tibble: 4 × 2
##   nation               n
##   <chr>            <int>
## 1 England            240
## 2 Northern Ireland     9
## 3 Scotland            97
## 4 Wales                2There are only 2 attractions in Wales. This is a very low sample size, meaning that the inter-quartile range is not a very accurate estimate.
Within each of the 4 nations, what is the proportion of tourist attractions that have free admission for all visitors?
# Count attractions per nation and admission type, express each count as a
# percentage of its nation's total, then keep the "Free" rows, largest first.
# A nation with no "Free" attractions has no such row and so does not appear
# in the result.
visitors_with_nations %>%
  count(nation, admission) %>%
  group_by(nation) %>%
  mutate(pct = 100 * n / sum(n)) %>%
  filter(admission == "Free") %>%
  arrange(desc(pct))
## # A tibble: 3 × 4
## # Groups:   nation [3]
##   nation           admission     n   pct
##   <chr>            <chr>     <int> <dbl>
## 1 England          Free         93  38.8
## 2 Scotland         Free         34  35.1
## 3 Northern Ireland Free          1  11.1Calculate the percentage change in visitor admissions from 2021 to 2022. Of the tourist attractions in Scotland, sort into increasing numerical order the types of admission charges based on the mean percentage change in visitor numbers.
# Relative change from 2021 to 2022 for the Scottish attractions, averaged
# by admission type and sorted from smallest to largest.
# pct_change is a proportion (1 means +100%), not a value out of 100.
# Rows with a missing 2021 figure give NA and are dropped by na.rm = TRUE.
visitors_with_nations %>%
  filter(nation == "Scotland") %>%
  mutate(pct_change = (n_2022 - n_2021) / n_2021) %>%
  group_by(admission) %>%
  summarise(mean_pct_change = mean(pct_change, na.rm = TRUE)) %>%
  arrange(mean_pct_change)
## # A tibble: 3 × 2
##   admission mean_pct_change
##   <chr>               <dbl>
## 1 Charged            0.0465
## 2 Members            1.38  
## 3 Free               3.03