Today we will discuss Exploratory Data Analysis (EDA).

This is the process of exploring your data using visualization, transformation, and modeling (we will discuss modeling in more detail later).

library(tidyverse)

Let's take a look at the diamonds data set and the variable carat.

# Print the data set to get a first look at its columns.
diamonds

# Histogram of carat using bins that are 0.5 wide.
diamonds %>%
  ggplot(mapping = aes(x = carat)) +
  geom_histogram(binwidth = 0.5)

# Tabulate how many diamonds fall into each 0.5-wide carat bin.
count(diamonds, cut_width(carat, 0.5))

Looking at the smaller diamonds.

# Keep only the diamonds under 3 carats.
smaller <- diamonds %>%
  filter(carat < 3)

# Plot the filtered subset (not the full data set) so the histogram
# actually shows the smaller diamonds, using a finer 0.1 bin width.
smaller %>%
  ggplot(mapping = aes(x = carat)) +
  geom_histogram(binwidth = 0.1)

Look at carat by cut.

# Overlay one frequency polygon of carat per cut (0.1-wide bins),
# using colour to tell the cuts apart.
ggplot(data = smaller, mapping = aes(x = carat)) +
  geom_freqpoly(mapping = aes(colour = cut), binwidth = 0.1)

Looking for typical values.

# Very narrow bins (0.01) show which carat values occur most often.
ggplot(data = smaller) +
  geom_histogram(mapping = aes(x = carat), binwidth = 0.01)

Looking for unusual values. Let's look at the y variable.

# Histogram of y over the full data set, in 0.5-wide bins.
ggplot(data = diamonds, mapping = aes(x = y)) +
  geom_histogram(binwidth = 0.5)

Are there outliers?

# Same histogram of y, but zoom the count axis to 0-50 so that bins
# holding only a few observations become visible.
ggplot(data = diamonds) +
  geom_histogram(mapping = aes(x = y), binwidth = 0.5) +
  coord_cartesian(ylim = c(0, 50))

Let's find the outliers.

# Collect the rows whose y lies outside the range 3 to 20, sort them by y,
# and keep just the price and the three dimension columns for inspection.
unusual <- diamonds %>%
  filter(!between(y, 3, 20)) %>%
  arrange(y) %>%
  select(price, x, y, z)
unusual

Remove outliers.

# Drop every row whose y falls outside the range 3 to 20 (inclusive).
diamonds2 <- filter(diamonds, y >= 3, y <= 20)

It is better to keep the rows and convert the unusual values to NA, which means "not available".

# Keep all rows; y values inside the range 3 to 20 are retained and
# everything outside that range is replaced by NA.
diamonds2 <- mutate(
  diamonds,
  y = ifelse(between(y, 3, 20), y, NA)
)

Scatterplots.

# Scatterplot of y against x; na.rm is left at its default here.
ggplot(data = diamonds2) +
  geom_point(mapping = aes(x = x, y = y))

# Same scatterplot, with na.rm = TRUE passed to geom_point().
diamonds2 %>%
  ggplot(mapping = aes(x = x, y = y)) +
  geom_point(na.rm = TRUE)

A categorical variable: cut.

# Bar chart with one bar per cut.
ggplot(data = diamonds) +
  geom_bar(mapping = aes(x = cut))

A continuous variable: price.

# Frequency polygons of price, one per cut, in 500-wide bins.
# The y aesthetic is mapped to the density computed by the stat rather than
# the raw count. after_stat() replaces the deprecated `..density..` dot-dot
# notation, which triggers a deprecation warning in current ggplot2 releases.
diamonds %>%
  ggplot(mapping = aes(x = price, y = after_stat(density))) +
  geom_freqpoly(mapping = aes(colour = cut), binwidth = 500)

Putting them together in one plot.

# One boxplot of price for each cut.
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_boxplot()

The same kind of plot for a different data set: mpg.

# One boxplot of hwy for each class in mpg.
ggplot(data = mpg, mapping = aes(x = class, y = hwy)) +
  geom_boxplot()

Re-order the classes by their median hwy.

# Boxplots of hwy per class, with the classes ordered by their median hwy.
ggplot(data = mpg) +
  geom_boxplot(
    mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy)
  )

Flip the coordinates.

# Median-ordered boxplots of hwy per class, with the axes swapped by
# coord_flip().
ggplot(
  data = mpg,
  mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy)
) +
  geom_boxplot() +
  coord_flip()

LS0tCnRpdGxlOiAiRXhwbG9yYXRvcnlEYXRhQW5hbHlzaXMiCmF1dGhvcjogIlByb2YuIEVyaWMgQS4gU3Vlc3MiCmRhdGU6ICJTZXB0ZW1iZXIgMTEsIDIwMTkiCm91dHB1dDoKICBodG1sX25vdGVib29rOiBkZWZhdWx0CiAgaHRtbF9kb2N1bWVudDoKICAgIGRmX3ByaW50OiBwYWdlZAogIHBkZl9kb2N1bWVudDogZGVmYXVsdAogIHdvcmRfZG9jdW1lbnQ6IGRlZmF1bHQKLS0tCgpUb2RheSB3ZSB3aWxsIGRpc2N1c3MgRXhwbG9yYXRvcnkgRGF0YSBBbmFseXNpcyAoRURBKS4KClRoaXMgaXMgdGhlIHByb2Nlc3Mgb2YgZXhwbG9yaW5nIHlvdXIgZGF0YSB1c2luZyB2aXN1YWxpemF0aW9uIGFuZCB0cmFuc2Zvcm1hdGlvbnMgYW5kIG1vZGVsaW5nICh3aWxsIGRpc2N1c3MgbW9kZWxpbmcgbW9yZSBsYXRlcikuCgoKYGBge3IgbWVzc2FnZT1GQUxTRX0KbGlicmFyeSh0aWR5dmVyc2UpCmBgYAoKTGV0cyB0YWtlIGEgbG9vayBhdCB0aGUgKmRpYW1vbmRzKiBkYXRhIHNldCBhbmQgdGhlIHZhcmlhYmxlIGNhcmF0LgoKYGBge3J9CmRpYW1vbmRzCmBgYAoKCmBgYHtyfQpnZ3Bsb3QoZGF0YSA9IGRpYW1vbmRzKSArCiAgZ2VvbV9oaXN0b2dyYW0obWFwcGluZyA9IGFlcyh4ID0gY2FyYXQpLCBiaW53aWR0aCA9IDAuNSkKYGBgCgoKYGBge3J9CmRpYW1vbmRzICU+JSAKICBjb3VudChjdXRfd2lkdGgoY2FyYXQsIDAuNSkpCmBgYAoKTG9va2luZyBhdCB0aGUgc21hbGxlciBkaWFtb25kcy4KCmBgYHtyfQpzbWFsbGVyIDwtIGRpYW1vbmRzICU+JSAKICBmaWx0ZXIoY2FyYXQgPCAzKQogIApkaWFtb25kcyAlPiUgZ2dwbG90KG1hcHBpbmcgPSBhZXMoeCA9IGNhcmF0KSkgKwogIGdlb21faGlzdG9ncmFtKGJpbndpZHRoID0gMC4xKQpgYGAKCkxvb2sgYXQgY2FyYXQgYnkgY3V0LgoKCmBgYHtyfQpzbWFsbGVyICU+JSBnZ3Bsb3QobWFwcGluZyA9IGFlcyh4ID0gY2FyYXQsIGNvbG91ciA9IGN1dCkpICsKICBnZW9tX2ZyZXFwb2x5KGJpbndpZHRoID0gMC4xKQpgYGAKCkxvb2tpbmcgZm9yICp0eXBpY2FsIHZhbHVlcyouCgpgYGB7cn0Kc21hbGxlciAlPiUgZ2dwbG90KG1hcHBpbmcgPSBhZXMoeCA9IGNhcmF0KSkgKwogIGdlb21faGlzdG9ncmFtKGJpbndpZHRoID0gMC4wMSkKYGBgCgoKTG9va2luZyBmb3IgKnVudXN1YWwgdmFsdWVzKi4gIExldHMgbG9vayBhdCB0aGUgKnkqIHZhcmlhYmxlLgoKYGBge3J9CmRpYW1vbmRzICU+JSBnZ3Bsb3QobWFwcGluZyA9IGFlcyh4ID0geSkpICsgCiAgZ2VvbV9oaXN0b2dyYW0oYmlud2lkdGggPSAwLjUpCmBgYAoKQXJlIHRoZXJlIG91dGxpZXJzPwoKYGBge3J9CmRpYW1vbmRzICU+JSBnZ3Bsb3QobWFwcGluZyA9IGFlcyh4ID0geSkpICsgCiAgZ2VvbV9oaXN0b2dyYW0oYmlud2lkdGggPSAwLjUpICsKICBjb29yZF9jYXJ0ZXNpYW4oeWxpbSA9IGMoMCwgNTApKQpgYGAKCkxldHMgZmluZCB0aGUgb3V0bGllcnMuCgpgYGB7cn0KdW51c3VhbCA8LSBkaWFtb25kcyAlPiUgCiAgZmlsdGVyKHkgPCAz
IHwgeSA+IDIwKSAlPiUgCiAgc2VsZWN0KHByaWNlLCB4LCB5LCB6KSAlPiUKICBhcnJhbmdlKHkpCnVudXN1YWwKYGBgCgpSZW1vdmUgb3V0bGllcnMuCgpgYGB7cn0KZGlhbW9uZHMyIDwtIGRpYW1vbmRzICU+JSAKICBmaWx0ZXIoYmV0d2Vlbih5LCAzLCAyMCkpCmBgYAoKQmV0dGVyIHRvIGNvbnZlcnQgdGhlbSB0byAqKk5BKiosIHdoaWNoIG1lYW5zIG5vdCBhdmFpbGFibGUuCgpgYGB7cn0KZGlhbW9uZHMyIDwtIGRpYW1vbmRzICU+JSAKICBtdXRhdGUoeSA9IGlmZWxzZSh5IDwgMyB8IHkgPiAyMCwgTkEsIHkpKQpgYGAKClNjYXR0ZXJwbG90cy4KCmBgYHtyfQpkaWFtb25kczIgJT4lIGdncGxvdChtYXBwaW5nID0gYWVzKHggPSB4LCB5ID0geSkpICsgCiAgZ2VvbV9wb2ludCgpCmBgYAoKCmBgYHtyfQpnZ3Bsb3QoZGF0YSA9IGRpYW1vbmRzMiwgbWFwcGluZyA9IGFlcyh4ID0geCwgeSA9IHkpKSArIAogIGdlb21fcG9pbnQobmEucm0gPSBUUlVFKQpgYGAKCkNhdGVnb3JpY2FsIHZhcmlhYmxlLiAgY3V0CgpgYGB7cn0KZGlhbW9uZHMgJT4lIGdncGxvdChtYXBwaW5nID0gYWVzKHggPSBjdXQpKSArIAogIGdlb21fYmFyKCkKYGBgCgoKQ29udGludW91cyB2YXJpYWJsZS4gcHJpY2UKCmBgYHtyfQpkaWFtb25kcyAlPiUgZ2dwbG90KG1hcHBpbmcgPSBhZXMoeCA9IHByaWNlLCB5ID0gLi5kZW5zaXR5Li4pKSArIAogIGdlb21fZnJlcXBvbHkobWFwcGluZyA9IGFlcyhjb2xvdXIgPSBjdXQpLCBiaW53aWR0aCA9IDUwMCkKYGBgCgpQdXR0aW5nIHRoZW0gdG9nZXRoZXIgaW4gb25lIHBsb3QuCgpgYGB7cn0KZGlhbW9uZHMgJT4lIGdncGxvdChtYXBwaW5nID0gYWVzKHggPSBjdXQsIHkgPSBwcmljZSkpICsKICBnZW9tX2JveHBsb3QoKQpgYGAKCkZvciBhIGRpZmZlcmVudCBkYXRhIHNldC4gIG1wZwoKYGBge3J9Cm1wZyAlPiUgZ2dwbG90KG1hcHBpbmcgPSBhZXMoeCA9IGNsYXNzLCB5ID0gaHd5KSkgKwogIGdlb21fYm94cGxvdCgpCmBgYAoKUmUtb3JkZXIuCgpgYGB7cn0KbXBnICU+JSBnZ3Bsb3QobWFwcGluZyA9IGFlcyh4ID0gcmVvcmRlcihjbGFzcywgaHd5LCBGVU4gPSBtZWRpYW4pLCB5ID0gaHd5KSkgKwogIGdlb21fYm94cGxvdCgpCmBgYAoKRmxpcC4KCmBgYHtyfQptcGcgJT4lIGdncGxvdChtYXBwaW5nID0gYWVzKHggPSByZW9yZGVyKGNsYXNzLCBod3ksIEZVTiA9IG1lZGlhbiksIHkgPSBod3kpKSArCiAgZ2VvbV9ib3hwbG90KCkgKwogIGNvb3JkX2ZsaXAoKQpgYGAKCg==