library(tidyverse)
library(ggplot2)
diamonds <- ggplot2::diamonds %>% rename(
depth_pct = depth,
length_mm = x,
width_mm = y,
depth_mm = z,
table_pct = table
)
ggplot(data = diamonds,
mapping = aes(x = width_mm,
y = depth_mm)) +
geom_point()
There are some diamonds with a dimension lengths of 0mm and some with at least one dimension greater than 20mm. These should be filtered out of the data set.
diamonds <- diamonds %>% filter(
length_mm != 0, length_mm <= 20,
width_mm != 0, width_mm <= 20,
depth_mm != 0, depth_mm <= 20
)
The remaining EDA is non exhaustive, you can find other analysis in this RPubs article by David Curtis.
diamonds %>% count(cut)
## # A tibble: 5 × 2
## cut n
## <ord> <int>
## 1 Fair 1609
## 2 Good 4902
## 3 Very Good 12080
## 4 Premium 13779
## 5 Ideal 21547
diamonds %>% count(clarity)
## # A tibble: 8 × 2
## clarity n
## <ord> <int>
## 1 I1 738
## 2 SI2 9184
## 3 SI1 13063
## 4 VS2 12254
## 5 VS1 8168
## 6 VVS2 5066
## 7 VVS1 3654
## 8 IF 1790
ggplot(diamonds) + geom_histogram(aes(x = carat))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(diamonds) + geom_point(aes(x = carat,y=price,color=cut),alpha = 0.4)
diamonds <- diamonds %>%
mutate(table_mm = table_pct * width_mm / 100)
# Overall table_mm arithmetic mean?
diamonds %>%
summarise(
avg_table = mean(table_mm, na.rm = TRUE)
)
## # A tibble: 1 × 1
## avg_table
## <dbl>
## 1 3.30
# table_mm arithmetic mean within each clarity category
diamonds %>%
group_by(clarity) %>%
summarise(
avg_table = mean(table_mm, na.rm = TRUE)
)
## # A tibble: 8 × 2
## clarity avg_table
## <ord> <dbl>
## 1 I1 3.91
## 2 SI2 3.71
## 3 SI1 3.40
## 4 VS2 3.25
## 5 VS1 3.20
## 6 VVS2 2.98
## 7 VVS1 2.83
## 8 IF 2.82
Diamonds with a better clarity category have, on average, a smaller table length.
diamonds %>%
summarise(across(where(is.numeric),
function(x) mean(x, na.rm = TRUE),
.names = "avg_{.col}"))
## # A tibble: 1 × 8
## avg_carat avg_depth_pct avg_table_pct avg_price avg_length_mm avg_width_mm
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.798 61.7 57.5 3931. 5.73 5.73
## # ℹ 2 more variables: avg_depth_mm <dbl>, avg_table_mm <dbl>
# Write gmean() function
gmean <- function(x){
xbar_g <- exp(mean(log(x)))
return(xbar_g)
}
test_data <- list(
test1 = c(2.1, 3.8, 4.2),
test2 = c(1, 10, 100, 1000),
test3 = c(0, 1, 4),
test4 = c(0.38, 0.94, -1.56),
test5 = c(TRUE, TRUE),
test6 = c("6", "7", "8")
)
# Create for loop to test gmean() on the above examples
for(i in seq_along(test_data)){
print(gmean(test_data[[i]]))
}
## [1] 3.224166
## [1] 31.62278
## [1] 0
## Warning in log(x): NaNs produced
## [1] NaN
## [1] 1
## Error in log(x): non-numeric argument to mathematical function
# Copy gmean() from Ex3 and edit it to check the input
gmean <- function(x){
if(!is.numeric(x)){
warning("Vector 'x' is not numeric")
return(NaN)
}
if(any(x <= 0)){
warning("Vector 'x' must have strictly positive numbers.")
return(NaN)
}
xbar_g <- exp(mean(log(x)))
return(xbar_g)
}
# Copy your code from Ex 4 to test your new command
for(i in seq_along(test_data)){
print(gmean(test_data[[i]]))
}
## [1] 3.224166
## [1] 31.62278
## Warning in gmean(test_data[[i]]): Vector 'x' must have strictly positive
## numbers.
## [1] NaN
## Warning in gmean(test_data[[i]]): Vector 'x' must have strictly positive
## numbers.
## [1] NaN
## Warning in gmean(test_data[[i]]): Vector 'x' is not numeric
## [1] NaN
## Warning in gmean(test_data[[i]]): Vector 'x' is not numeric
## [1] NaN
# replicate the for loop from Ex 4 using the map (or a similar) function
map(test_data, gmean)
## Warning in .f(.x[[i]], ...): Vector 'x' must have strictly positive numbers.
## Warning in .f(.x[[i]], ...): Vector 'x' must have strictly positive numbers.
## Warning in .f(.x[[i]], ...): Vector 'x' is not numeric
## Warning in .f(.x[[i]], ...): Vector 'x' is not numeric
## $test1
## [1] 3.224166
##
## $test2
## [1] 31.62278
##
## $test3
## [1] NaN
##
## $test4
## [1] NaN
##
## $test5
## [1] NaN
##
## $test6
## [1] NaN
map_dbl(test_data, gmean)
## Warning in .f(.x[[i]], ...): Vector 'x' must have strictly positive numbers.
## Warning in .f(.x[[i]], ...): Vector 'x' must have strictly positive numbers.
## Warning in .f(.x[[i]], ...): Vector 'x' is not numeric
## Warning in .f(.x[[i]], ...): Vector 'x' is not numeric
## test1 test2 test3 test4 test5 test6
## 3.224166 31.622777 NaN NaN NaN NaN
# Overall table_mm arithmetic mean, median, and geometic mean?
diamonds %>%
summarise(
avg_table = mean(table_mm),
med_table = median(table_mm),
geom_table = gmean(table_mm)
)
## # A tibble: 1 × 3
## avg_table med_table geom_table
## <dbl> <dbl> <dbl>
## 1 3.30 3.25 3.23
# Arithmetic mean, median, and geometric mean of table_mm within each clarity category
diamonds %>%
group_by(clarity) %>%
summarise(
avg_table = mean(table_mm),
med_table = median(table_mm),
geom_table = gmean(table_mm)
)
## # A tibble: 8 × 4
## clarity avg_table med_table geom_table
## <ord> <dbl> <dbl> <dbl>
## 1 I1 3.91 3.86 3.86
## 2 SI2 3.71 3.72 3.65
## 3 SI1 3.40 3.41 3.34
## 4 VS2 3.25 3.16 3.19
## 5 VS1 3.20 3.08 3.14
## 6 VVS2 2.98 2.82 2.93
## 7 VVS1 2.83 2.66 2.79
## 8 IF 2.82 2.59 2.78
Diamonds with a better clarity have a smaller geometric mean. Also, the geometric means are typically smaller than the arithmetic mean for this data.
# Create gmean2() that computes the geometric mean
# that uses the original definition
gmean2 <- function(x){
if(!is.numeric(x)){
warning("Vector 'x' is not numeric")
return(NaN)
}
if(any(x <= 0)){
warning("Vector 'x' must have strictly positive numbers.")
return(NaN)
}
xbar_g <- prod(x)^(1/length(x))
return(xbar_g)
}
gmean(test_data$test1)
## [1] 3.224166
gmean2(test_data$test1)
## [1] 3.224166
diamonds %>%
summarise(
gmean_table = gmean(table_mm),
gmean2_table = gmean2(table_mm)
)
## # A tibble: 1 × 2
## gmean_table gmean2_table
## <dbl> <dbl>
## 1 3.23 Inf
The command gmean2()
when applied to
diamonds
does not work as it is multiplying 53917 numbers
that are all greater 1. Consequently, the product is exceptionally big -
bigger than what R can represent as a floating point number.
Consequently, the product is then represented by Inf
. By
contrast, through transforming the definition of the geometric mean
using the exponential & natural logarithm functions obtains a
definition that is more stable for computers to compute.
# Create a function that computes the harmonic mean
hmean <- function(x){
if(!is.numeric(x)){
warning("Vector 'x' is not numeric")
return(NaN)
}
if(any(x <= 0)){
warning("Vector 'x' must have strictly positive numbers.")
return(NaN)
}
xbar_h <- 1 / mean(1 / x)
return(xbar_h)
}
diamonds %>%
summarise(
avg_table = hmean(table_mm)
)
## # A tibble: 1 × 1
## avg_table
## <dbl>
## 1 3.17
diamonds %>%
group_by(clarity) %>%
summarise(
avg_table = hmean(table_mm)
)
## # A tibble: 8 × 2
## clarity avg_table
## <ord> <dbl>
## 1 I1 3.81
## 2 SI2 3.59
## 3 SI1 3.28
## 4 VS2 3.13
## 5 VS1 3.08
## 6 VVS2 2.89
## 7 VVS1 2.76
## 8 IF 2.74
\[ \begin{align} \bar{x}_g &= \left(\prod_{i=1}^n x_i\right)^{1/n}\\ & = \exp\left\{ \ln\left[\left(\prod_{i=1}^n x_i\right)^{1/n}\right]\right\} \quad\quad[\exp(\ln a) = a \text{ for }a>0]\\ & = \exp\left\{ \frac{1}{n}\ln\left(\prod_{i=1}^n x_i\right)\right\}\quad\quad[\text{power rule: }\ln(a^b) = b \ln a]\\ & = \exp\left\{ \frac{1}{n}\sum_{i=1}^n\ln x_i \right\}\quad\quad[\text{product rule: }\ln(ab) = \ln a + \ln b]\\ \end{align} \]