Today we will discuss Exploratory Data Analysis (EDA).
This is the process of exploring your data using visualization, transformation, and modeling (we will discuss modeling in more detail later).
library(tidyverse)
Let's take a look at the diamonds data set and the variable carat.
# Print the data set to get a first look at its columns.
diamonds

# Distribution of carat in bins that are 0.5 carat wide.
diamonds %>%
  ggplot(aes(x = carat)) +
  geom_histogram(binwidth = 0.5)

# The same binning as a table: number of diamonds per 0.5-carat interval.
count(diamonds, cut_width(carat, 0.5))
Looking at the smaller diamonds.
# Keep only the diamonds below 3 carats.
smaller <- diamonds %>%
  filter(carat < 3)

# Histogram of carat with finer (0.1) bins. Plot `smaller`, not `diamonds`,
# so the figure actually reflects the filter applied above.
smaller %>%
  ggplot(mapping = aes(x = carat)) +
  geom_histogram(binwidth = 0.1)
Look at carat by cut.
# One frequency polygon per cut (mapped to colour), using 0.1-carat bins.
ggplot(data = smaller, mapping = aes(x = carat, colour = cut)) +
  geom_freqpoly(binwidth = 0.1)
Looking for typical values.
# Very narrow bins (0.01 carat) to show which carat values occur most often.
ggplot(data = smaller, mapping = aes(x = carat)) +
  geom_histogram(binwidth = 0.01)
Looking for unusual values. Let's look at the y variable.
# Histogram of the y variable for all diamonds, in bins of width 0.5.
ggplot(data = diamonds, mapping = aes(x = y)) +
  geom_histogram(binwidth = 0.5)
Are there outliers?
# Same histogram, zoomed in on counts between 0 and 50 so that rarely
# occupied bins become visible. coord_cartesian() only changes the visible
# range; the data behind the plot are left as they are.
ggplot(data = diamonds, mapping = aes(x = y)) +
  geom_histogram(binwidth = 0.5) +
  coord_cartesian(ylim = c(0, 50))
Let's find the outliers.
# Rows whose y lies outside [3, 20], reduced to price and the three
# dimension columns and sorted by y.
unusual <- diamonds %>%
  filter(!between(y, 3, 20)) %>%
  select(price, x, y, z) %>%
  arrange(y)

# Show the suspicious rows.
unusual
Remove outliers.
# Option 1: drop every row whose y is outside the range 3 to 20 (inclusive).
diamonds2 <- filter(diamonds, y >= 3 & y <= 20)
It is better to convert them to NA, which means "not available", instead of dropping the whole row.
# Option 2: keep all rows, but blank out implausible y values.
# y is kept when it lies in [3, 20] and replaced with NA otherwise.
diamonds2 <- mutate(
  diamonds,
  y = ifelse(between(y, 3, 20), y, NA)
)
Scatterplots.
# Scatterplot of y against x. na.rm is left at its default here, so
# ggplot2 warns about the rows it drops because y is NA.
ggplot(data = diamonds2, mapping = aes(x = x, y = y)) +
  geom_point()
# Same scatterplot; na.rm = TRUE drops the missing values silently.
diamonds2 %>%
  ggplot(aes(x = x, y = y)) +
  geom_point(na.rm = TRUE)
A categorical variable: cut.
# Bar chart: number of diamonds in each cut category.
ggplot(data = diamonds, mapping = aes(x = cut)) +
  geom_bar()
A continuous variable: price.
# Price distribution per cut, drawn as density rather than raw count so that
# cuts with very different numbers of diamonds can be compared.
# after_stat(density) replaces the `..density..` notation, which is
# deprecated since ggplot2 3.4.0.
diamonds %>%
  ggplot(mapping = aes(x = price, y = after_stat(density))) +
  geom_freqpoly(mapping = aes(colour = cut), binwidth = 500)
Putting them together in one plot.
# Categorical and continuous variable in one display: a boxplot of price
# for each cut.
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_boxplot()
The same kind of plot for a different data set: mpg.
# Boxplot of highway mileage (hwy) for each vehicle class.
ggplot(data = mpg, mapping = aes(x = class, y = hwy)) +
  geom_boxplot()
Re-order.
# Same boxplot, with the classes ordered by their median hwy instead of
# their default order.
ggplot(
  data = mpg,
  mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy)
) +
  geom_boxplot()
Flip.
# Reordered boxplot with the axes swapped, so the class names run along the
# vertical axis.
ggplot(
  data = mpg,
  mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy)
) +
  geom_boxplot() +
  coord_flip()
LS0tCnRpdGxlOiAiRXhwbG9yYXRvcnlEYXRhQW5hbHlzaXMiCmF1dGhvcjogIlByb2YuIEVyaWMgQS4gU3Vlc3MiCmRhdGU6ICJTZXB0ZW1iZXIgMTEsIDIwMTkiCm91dHB1dDoKICBodG1sX25vdGVib29rOiBkZWZhdWx0CiAgaHRtbF9kb2N1bWVudDoKICAgIGRmX3ByaW50OiBwYWdlZAogIHBkZl9kb2N1bWVudDogZGVmYXVsdAogIHdvcmRfZG9jdW1lbnQ6IGRlZmF1bHQKLS0tCgpUb2RheSB3ZSB3aWxsIGRpc2N1c3MgRXhwbG9yYXRvcnkgRGF0YSBBbmFseXNpcyAoRURBKS4KClRoaXMgaXMgdGhlIHByb2Nlc3Mgb2YgZXhwbG9yaW5nIHlvdXIgZGF0YSB1c2luZyB2aXN1YWxpemF0aW9uIGFuZCB0cmFuc2Zvcm1hdGlvbnMgYW5kIG1vZGVsaW5nICh3aWxsIGRpc2N1c3MgbW9kZWxpbmcgbW9yZSBsYXRlcikuCgoKYGBge3IgbWVzc2FnZT1GQUxTRX0KbGlicmFyeSh0aWR5dmVyc2UpCmBgYAoKTGV0cyB0YWtlIGEgbG9vayBhdCB0aGUgKmRpYW1vbmRzKiBkYXRhIHNldCBhbmQgdGhlIHZhcmlhYmxlIGNhcmF0LgoKYGBge3J9CmRpYW1vbmRzCmBgYAoKCmBgYHtyfQpnZ3Bsb3QoZGF0YSA9IGRpYW1vbmRzKSArCiAgZ2VvbV9oaXN0b2dyYW0obWFwcGluZyA9IGFlcyh4ID0gY2FyYXQpLCBiaW53aWR0aCA9IDAuNSkKYGBgCgoKYGBge3J9CmRpYW1vbmRzICU+JSAKICBjb3VudChjdXRfd2lkdGgoY2FyYXQsIDAuNSkpCmBgYAoKTG9va2luZyBhdCB0aGUgc21hbGxlciBkaWFtb25kcy4KCmBgYHtyfQpzbWFsbGVyIDwtIGRpYW1vbmRzICU+JSAKICBmaWx0ZXIoY2FyYXQgPCAzKQogIApkaWFtb25kcyAlPiUgZ2dwbG90KG1hcHBpbmcgPSBhZXMoeCA9IGNhcmF0KSkgKwogIGdlb21faGlzdG9ncmFtKGJpbndpZHRoID0gMC4xKQpgYGAKCkxvb2sgYXQgY2FyYXQgYnkgY3V0LgoKCmBgYHtyfQpzbWFsbGVyICU+JSBnZ3Bsb3QobWFwcGluZyA9IGFlcyh4ID0gY2FyYXQsIGNvbG91ciA9IGN1dCkpICsKICBnZW9tX2ZyZXFwb2x5KGJpbndpZHRoID0gMC4xKQpgYGAKCkxvb2tpbmcgZm9yICp0eXBpY2FsIHZhbHVlcyouCgpgYGB7cn0Kc21hbGxlciAlPiUgZ2dwbG90KG1hcHBpbmcgPSBhZXMoeCA9IGNhcmF0KSkgKwogIGdlb21faGlzdG9ncmFtKGJpbndpZHRoID0gMC4wMSkKYGBgCgoKTG9va2luZyBmb3IgKnVudXN1YWwgdmFsdWVzKi4gIExldHMgbG9vayBhdCB0aGUgKnkqIHZhcmlhYmxlLgoKYGBge3J9CmRpYW1vbmRzICU+JSBnZ3Bsb3QobWFwcGluZyA9IGFlcyh4ID0geSkpICsgCiAgZ2VvbV9oaXN0b2dyYW0oYmlud2lkdGggPSAwLjUpCmBgYAoKQXJlIHRoZXJlIG91dGxpZXJzPwoKYGBge3J9CmRpYW1vbmRzICU+JSBnZ3Bsb3QobWFwcGluZyA9IGFlcyh4ID0geSkpICsgCiAgZ2VvbV9oaXN0b2dyYW0oYmlud2lkdGggPSAwLjUpICsKICBjb29yZF9jYXJ0ZXNpYW4oeWxpbSA9IGMoMCwgNTApKQpgYGAKCkxldHMgZmluZCB0aGUgb3V0bGllcnMuCgpgYGB7cn0KdW51c3VhbCA8LSBkaWFtb25kcyAlPiUgCiAgZmlsdGVyKHkgPCAz
IHwgeSA+IDIwKSAlPiUgCiAgc2VsZWN0KHByaWNlLCB4LCB5LCB6KSAlPiUKICBhcnJhbmdlKHkpCnVudXN1YWwKYGBgCgpSZW1vdmUgb3V0bGllcnMuCgpgYGB7cn0KZGlhbW9uZHMyIDwtIGRpYW1vbmRzICU+JSAKICBmaWx0ZXIoYmV0d2Vlbih5LCAzLCAyMCkpCmBgYAoKQmV0dGVyIHRvIGNvbnZlcnQgdGhlbSB0byAqKk5BKiosIHdoaWNoIG1lYW5zIG5vdCBhdmFpbGFibGUuCgpgYGB7cn0KZGlhbW9uZHMyIDwtIGRpYW1vbmRzICU+JSAKICBtdXRhdGUoeSA9IGlmZWxzZSh5IDwgMyB8IHkgPiAyMCwgTkEsIHkpKQpgYGAKClNjYXR0ZXJwbG90cy4KCmBgYHtyfQpkaWFtb25kczIgJT4lIGdncGxvdChtYXBwaW5nID0gYWVzKHggPSB4LCB5ID0geSkpICsgCiAgZ2VvbV9wb2ludCgpCmBgYAoKCmBgYHtyfQpnZ3Bsb3QoZGF0YSA9IGRpYW1vbmRzMiwgbWFwcGluZyA9IGFlcyh4ID0geCwgeSA9IHkpKSArIAogIGdlb21fcG9pbnQobmEucm0gPSBUUlVFKQpgYGAKCkNhdGVnb3JpY2FsIHZhcmlhYmxlLiAgY3V0CgpgYGB7cn0KZGlhbW9uZHMgJT4lIGdncGxvdChtYXBwaW5nID0gYWVzKHggPSBjdXQpKSArIAogIGdlb21fYmFyKCkKYGBgCgoKQ29udGludW91cyB2YXJpYWJsZS4gcHJpY2UKCmBgYHtyfQpkaWFtb25kcyAlPiUgZ2dwbG90KG1hcHBpbmcgPSBhZXMoeCA9IHByaWNlLCB5ID0gLi5kZW5zaXR5Li4pKSArIAogIGdlb21fZnJlcXBvbHkobWFwcGluZyA9IGFlcyhjb2xvdXIgPSBjdXQpLCBiaW53aWR0aCA9IDUwMCkKYGBgCgpQdXR0aW5nIHRoZW0gdG9nZXRoZXIgaW4gb25lIHBsb3QuCgpgYGB7cn0KZGlhbW9uZHMgJT4lIGdncGxvdChtYXBwaW5nID0gYWVzKHggPSBjdXQsIHkgPSBwcmljZSkpICsKICBnZW9tX2JveHBsb3QoKQpgYGAKCkZvciBhIGRpZmZlcmVudCBkYXRhIHNldC4gIG1wZwoKYGBge3J9Cm1wZyAlPiUgZ2dwbG90KG1hcHBpbmcgPSBhZXMoeCA9IGNsYXNzLCB5ID0gaHd5KSkgKwogIGdlb21fYm94cGxvdCgpCmBgYAoKUmUtb3JkZXIuCgpgYGB7cn0KbXBnICU+JSBnZ3Bsb3QobWFwcGluZyA9IGFlcyh4ID0gcmVvcmRlcihjbGFzcywgaHd5LCBGVU4gPSBtZWRpYW4pLCB5ID0gaHd5KSkgKwogIGdlb21fYm94cGxvdCgpCmBgYAoKRmxpcC4KCmBgYHtyfQptcGcgJT4lIGdncGxvdChtYXBwaW5nID0gYWVzKHggPSByZW9yZGVyKGNsYXNzLCBod3ksIEZVTiA9IG1lZGlhbiksIHkgPSBod3kpKSArCiAgZ2VvbV9ib3hwbG90KCkgKwogIGNvb3JkX2ZsaXAoKQpgYGAKCg==