Packages and Data

library(tidyverse) 
library(ggplot2)
diamonds <- ggplot2::diamonds %>% rename(
    depth_pct = depth,
    length_mm = x,
    width_mm = y,
    depth_mm = z,
    table_pct = table
  )

Exercise 1

ggplot(data = diamonds,
       mapping = aes(x = width_mm,
                     y = depth_mm)) +
  geom_point()

There are some diamonds with a dimension lengths of 0mm and some with at least one dimension greater than 20mm. These should be filtered out of the data set.

diamonds <- diamonds %>% filter(
  length_mm != 0, length_mm <= 20,
  width_mm != 0, width_mm <= 20,
  depth_mm != 0, depth_mm <= 20
)

The remaining EDA is non exhaustive, you can find other analysis in this RPubs article by David Curtis.

diamonds %>% count(cut)
## # A tibble: 5 × 2
##   cut           n
##   <ord>     <int>
## 1 Fair       1609
## 2 Good       4902
## 3 Very Good 12080
## 4 Premium   13779
## 5 Ideal     21547
diamonds %>% count(clarity)
## # A tibble: 8 × 2
##   clarity     n
##   <ord>   <int>
## 1 I1        738
## 2 SI2      9184
## 3 SI1     13063
## 4 VS2     12254
## 5 VS1      8168
## 6 VVS2     5066
## 7 VVS1     3654
## 8 IF       1790
ggplot(diamonds) + geom_histogram(aes(x = carat))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(diamonds) + geom_point(aes(x = carat,y=price,color=cut),alpha = 0.4)

Exercise 2

diamonds <- diamonds %>%
  mutate(table_mm = table_pct * width_mm / 100)

# Overall table_mm arithmetic mean?

diamonds %>% 
  summarise(
    avg_table = mean(table_mm, na.rm = TRUE)
  )
## # A tibble: 1 × 1
##   avg_table
##       <dbl>
## 1      3.30
# table_mm arithmetic mean within each clarity category

diamonds %>% 
  group_by(clarity) %>%
  summarise(
    avg_table = mean(table_mm, na.rm = TRUE)
  )
## # A tibble: 8 × 2
##   clarity avg_table
##   <ord>       <dbl>
## 1 I1           3.91
## 2 SI2          3.71
## 3 SI1          3.40
## 4 VS2          3.25
## 5 VS1          3.20
## 6 VVS2         2.98
## 7 VVS1         2.83
## 8 IF           2.82

Diamonds with a better clarity category have, on average, a smaller table length.

diamonds %>% 
  summarise(across(where(is.numeric),
                   function(x) mean(x, na.rm = TRUE),
                   .names = "avg_{.col}"))
## # A tibble: 1 × 8
##   avg_carat avg_depth_pct avg_table_pct avg_price avg_length_mm avg_width_mm
##       <dbl>         <dbl>         <dbl>     <dbl>         <dbl>        <dbl>
## 1     0.798          61.7          57.5     3931.          5.73         5.73
## # ℹ 2 more variables: avg_depth_mm <dbl>, avg_table_mm <dbl>

Exercise 3

# Write gmean() function

gmean <- function(x){
  xbar_g <- exp(mean(log(x)))
  return(xbar_g)
}

Exercise 4

test_data <- list(
  test1 = c(2.1, 3.8, 4.2),
  test2 = c(1, 10, 100, 1000),
  test3 = c(0, 1, 4),
  test4 = c(0.38,  0.94, -1.56),
  test5 = c(TRUE, TRUE),
  test6 = c("6", "7", "8")
)


# Create for loop to test gmean() on the above examples
for(i in seq_along(test_data)){
  print(gmean(test_data[[i]]))
}
## [1] 3.224166
## [1] 31.62278
## [1] 0
## Warning in log(x): NaNs produced
## [1] NaN
## [1] 1
## Error in log(x): non-numeric argument to mathematical function

Exercise 5

# Copy gmean() from Ex3 and edit it to check the input

gmean <- function(x){
  
  if(!is.numeric(x)){
    warning("Vector 'x' is not numeric")
    return(NaN)
  }
    
  if(any(x <= 0)){
    warning("Vector 'x' must have strictly positive numbers.")
    return(NaN)
  }
  
  xbar_g <- exp(mean(log(x)))
  return(xbar_g)
}



# Copy your code from Ex 4 to test your new command


for(i in seq_along(test_data)){
  print(gmean(test_data[[i]]))
}
## [1] 3.224166
## [1] 31.62278
## Warning in gmean(test_data[[i]]): Vector 'x' must have strictly positive
## numbers.
## [1] NaN
## Warning in gmean(test_data[[i]]): Vector 'x' must have strictly positive
## numbers.
## [1] NaN
## Warning in gmean(test_data[[i]]): Vector 'x' is not numeric
## [1] NaN
## Warning in gmean(test_data[[i]]): Vector 'x' is not numeric
## [1] NaN

Exercise 6

# replicate the for loop from Ex 4 using the map (or a similar) function
map(test_data, gmean)
## Warning in .f(.x[[i]], ...): Vector 'x' must have strictly positive numbers.
## Warning in .f(.x[[i]], ...): Vector 'x' must have strictly positive numbers.
## Warning in .f(.x[[i]], ...): Vector 'x' is not numeric
## Warning in .f(.x[[i]], ...): Vector 'x' is not numeric
## $test1
## [1] 3.224166
## 
## $test2
## [1] 31.62278
## 
## $test3
## [1] NaN
## 
## $test4
## [1] NaN
## 
## $test5
## [1] NaN
## 
## $test6
## [1] NaN
map_dbl(test_data, gmean)
## Warning in .f(.x[[i]], ...): Vector 'x' must have strictly positive numbers.
## Warning in .f(.x[[i]], ...): Vector 'x' must have strictly positive numbers.
## Warning in .f(.x[[i]], ...): Vector 'x' is not numeric
## Warning in .f(.x[[i]], ...): Vector 'x' is not numeric
##     test1     test2     test3     test4     test5     test6 
##  3.224166 31.622777       NaN       NaN       NaN       NaN

Exercise 7

# Overall table_mm arithmetic mean, median, and geometic mean?
diamonds %>% 
  summarise(
    avg_table = mean(table_mm),
    med_table = median(table_mm),
    geom_table = gmean(table_mm)
  )
## # A tibble: 1 × 3
##   avg_table med_table geom_table
##       <dbl>     <dbl>      <dbl>
## 1      3.30      3.25       3.23
# Arithmetic mean, median, and geometric mean of table_mm within each clarity category
diamonds %>% 
  group_by(clarity) %>%
  summarise(
    avg_table = mean(table_mm),
    med_table = median(table_mm),
    geom_table = gmean(table_mm)
  )
## # A tibble: 8 × 4
##   clarity avg_table med_table geom_table
##   <ord>       <dbl>     <dbl>      <dbl>
## 1 I1           3.91      3.86       3.86
## 2 SI2          3.71      3.72       3.65
## 3 SI1          3.40      3.41       3.34
## 4 VS2          3.25      3.16       3.19
## 5 VS1          3.20      3.08       3.14
## 6 VVS2         2.98      2.82       2.93
## 7 VVS1         2.83      2.66       2.79
## 8 IF           2.82      2.59       2.78

Diamonds with a better clarity have a smaller geometric mean. Also, the geometric means are typically smaller than the arithmetic mean for this data.

Exercise 8 (optional)

# Create gmean2() that computes the geometric mean 
#   that uses the original definition

gmean2 <- function(x){
  
  if(!is.numeric(x)){
    warning("Vector 'x' is not numeric")
    return(NaN)
  }
    
  if(any(x <= 0)){
    warning("Vector 'x' must have strictly positive numbers.")
    return(NaN)
  }
  
  xbar_g <- prod(x)^(1/length(x))
  return(xbar_g)
}

gmean(test_data$test1)
## [1] 3.224166
gmean2(test_data$test1)
## [1] 3.224166
diamonds %>% 
  summarise(
    gmean_table = gmean(table_mm),
    gmean2_table = gmean2(table_mm)
  )
## # A tibble: 1 × 2
##   gmean_table gmean2_table
##         <dbl>        <dbl>
## 1        3.23          Inf

The command gmean2() when applied to diamonds does not work as it is multiplying 53917 numbers that are all greater 1. Consequently, the product is exceptionally big - bigger than what R can represent as a floating point number. Consequently, the product is then represented by Inf. By contrast, through transforming the definition of the geometric mean using the exponential & natural logarithm functions obtains a definition that is more stable for computers to compute.

Exercise 9 (optional)

# Create a function that computes the harmonic mean

hmean <- function(x){
  
  if(!is.numeric(x)){
    warning("Vector 'x' is not numeric")
    return(NaN)
  }
    
  if(any(x <= 0)){
    warning("Vector 'x' must have strictly positive numbers.")
    return(NaN)
  }
  
  xbar_h <- 1 / mean(1 / x)
  return(xbar_h)
}

diamonds %>% 
  summarise(
    avg_table = hmean(table_mm)
  )
## # A tibble: 1 × 1
##   avg_table
##       <dbl>
## 1      3.17
diamonds %>% 
  group_by(clarity) %>%
  summarise(
    avg_table = hmean(table_mm)
  )
## # A tibble: 8 × 2
##   clarity avg_table
##   <ord>       <dbl>
## 1 I1           3.81
## 2 SI2          3.59
## 3 SI1          3.28
## 4 VS2          3.13
## 5 VS1          3.08
## 6 VVS2         2.89
## 7 VVS1         2.76
## 8 IF           2.74

Exercise 10 (optional)

\[ \begin{align} \bar{x}_g &= \left(\prod_{i=1}^n x_i\right)^{1/n}\\ & = \exp\left\{ \ln\left[\left(\prod_{i=1}^n x_i\right)^{1/n}\right]\right\} \quad\quad[\exp(\ln a) = a \text{ for }a>0]\\ & = \exp\left\{ \frac{1}{n}\ln\left(\prod_{i=1}^n x_i\right)\right\}\quad\quad[\text{power rule: }\ln(a^b) = b \ln a]\\ & = \exp\left\{ \frac{1}{n}\sum_{i=1}^n\ln x_i \right\}\quad\quad[\text{product rule: }\ln(ab) = \ln a + \ln b]\\ \end{align} \]