library(tidyverse)
library(mdsr)
library(mosaicData)
Stat. 651: ggplot2
ggplot2 examples
CIACounties
Make the base plot g and then add different layers on to it.
head(CIACountries)
country pop area oil_prod gdp educ roadways net_users
1 Afghanistan 32564342 652230 0 1900 NA 0.06462444 >5%
2 Albania 3029278 28748 20510 11900 3.3 0.62613051 >35%
3 Algeria 39542166 2381741 1420000 14500 4.3 0.04771929 >15%
4 American Samoa 54343 199 0 13000 NA 1.21105528 <NA>
5 Andorra 85580 468 NA 37200 NA 0.68376068 >60%
6 Angola 19625353 1246700 1742000 7300 3.5 0.04125211 >15%
# base plot g
<- CIACountries %>% ggplot(aes(y = gdp, x = educ))
g
+ geom_point() g
Warning: Removed 64 rows containing missing values (geom_point).
+ geom_point(size = 3) g
Warning: Removed 64 rows containing missing values (geom_point).
+ geom_point(aes(color = net_users), size = 3) g
Warning: Removed 64 rows containing missing values (geom_point).
# no geom_point used for the next picture
+ geom_text( aes(label = country, color = net_users), size = 3 ) g
Warning: Removed 64 rows containing missing values (geom_text).
+ geom_point( aes(color = net_users, size = roadways) ) g
Warning: Removed 66 rows containing missing values (geom_point).
Change the scales
+ geom_point(aes(color = net_users, size = roadways)) +
g coord_trans( y = "log10")
Warning: Removed 66 rows containing missing values (geom_point).
+ geom_point(aes(color = net_users, size = roadways)) +
g scale_y_continuous(name = "Gross Domestic Product", trans = "log10")
Warning: Removed 66 rows containing missing values (geom_point).
Faceting
+ geom_point(alpha = 0.9, aes(size = roadways)) +
g coord_trans(y = "log10") +
facet_wrap( ~ net_users, nrow = 1) +
theme(legend.position = "top")
Warning: Removed 66 rows containing missing values (geom_point).
+ geom_point(alpha = 0.9, aes(size = roadways)) +
g coord_trans(y = "log10") +
scale_y_continuous(name = "Gross Domestic Product", trans = "log10") +
facet_wrap( ~ net_users, nrow = 1) +
theme(legend.position = "top")
Warning: Removed 66 rows containing missing values (geom_point).
Export the data and try in Tableau
getwd()
[1] "/home/esuess/classes/2022-2023/Fall 2022/Stat651/website/presentations/02_ggplot2"
write_csv(CIACountries, "CIACountries.csv")
MedicareCharges
Check out the MEPS website for more real data.
# head(MedicareCharges) # This now causes an error, remove the grouping.
? MedicareCharges
<- ungroup(MedicareCharges)
MedicareCharges
head(MedicareCharges)
# A tibble: 6 × 4
drg stateProvider num_charges mean_charge
<chr> <fct> <int> <dbl>
1 039 AK 1 34805.
2 039 AL 23 32044.
3 039 AR 16 27463.
4 039 AZ 24 33443.
5 039 CA 67 56095.
6 039 CO 10 35252.
<- MedicareCharges %>% filter(stateProvider == "NJ")
NJCharges NJCharges
# A tibble: 100 × 4
drg stateProvider num_charges mean_charge
<chr> <fct> <int> <dbl>
1 039 NJ 31 35104.
2 057 NJ 55 45692.
3 064 NJ 55 87042.
4 065 NJ 59 59576.
5 066 NJ 56 45819.
6 069 NJ 61 41917.
7 074 NJ 41 42993.
8 101 NJ 58 42314.
9 149 NJ 50 34916.
10 176 NJ 36 58941.
# … with 90 more rows
<- NJCharges %>% ggplot(aes(y = mean_charge, x = reorder(drg, mean_charge))) +
p geom_bar(fill = "grey", stat = "identity")
p
<- p + ylab("Statewide Average Charges ($)") +
p xlab("Medical Procedure (DRG)")
p
<- p + theme(axis.text.x = element_text(angle = 90, hjust = 1))
p p
Now add the overall data to the plot to compare with NJ.
<- p + geom_point(data = MedicareCharges, size = 1, alpha = 0.3)
p p
SAT
Here is the link to the College Board SAT website.
<- SAT_2010 %>% ggplot(aes(x = math))
g
+ geom_histogram() g
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
+ geom_histogram(binwidth = 10) g
+ geom_density() g
ggplot( data = head(SAT_2010, 10), aes( y = math, x = reorder(state, math) ) ) +
geom_bar(stat = "identity")
Scatterplot with tend lines
<- SAT_2010 %>% ggplot(aes(x = expenditure, y = math)) +
g geom_point()
g
<- g + geom_smooth(method="lm", se = 0) +
g xlab("Average expenditure per student ($100)") +
ylab("Average score on math SAT")
g
`geom_smooth()` using formula 'y ~ x'
Add the trend line within groups representing rate of taking the test.
<- SAT_2010 %>%
SAT_2010 mutate(SAT_rate = cut(sat_pct, breaks = c(0, 30, 60, 100), labels = c("low", "medium", "high") ))
<- g %+% SAT_2010
g g
`geom_smooth()` using formula 'y ~ x'
+ aes(color = SAT_rate) g
`geom_smooth()` using formula 'y ~ x'
+facet_wrap( ~ SAT_rate) g
`geom_smooth()` using formula 'y ~ x'
HELPPrct
Here is the link to the NSDUH website.
%>% ggplot(aes(x = homeless)) +
HELPrct geom_bar(aes(fill = substance), position = "fill") +
coord_flip()
NHANES
Here is the link to the NHANES website.
library(NHANES)
head(NHANES)
# A tibble: 6 × 76
ID SurveyYr Gender Age AgeDecade AgeMonths Race1 Race3 Education
<int> <fct> <fct> <int> <fct> <int> <fct> <fct> <fct>
1 51624 2009_10 male 34 " 30-39" 409 White <NA> High School
2 51624 2009_10 male 34 " 30-39" 409 White <NA> High School
3 51624 2009_10 male 34 " 30-39" 409 White <NA> High School
4 51625 2009_10 male 4 " 0-9" 49 Other <NA> <NA>
5 51630 2009_10 female 49 " 40-49" 596 White <NA> Some College
6 51638 2009_10 male 9 " 0-9" 115 White <NA> <NA>
# … with 67 more variables: MaritalStatus <fct>, HHIncome <fct>,
# HHIncomeMid <int>, Poverty <dbl>, HomeRooms <int>, HomeOwn <fct>,
# Work <fct>, Weight <dbl>, Length <dbl>, HeadCirc <dbl>, Height <dbl>,
# BMI <dbl>, BMICatUnder20yrs <fct>, BMI_WHO <fct>, Pulse <int>,
# BPSysAve <int>, BPDiaAve <int>, BPSys1 <int>, BPDia1 <int>, BPSys2 <int>,
# BPDia2 <int>, BPSys3 <int>, BPDia3 <int>, Testosterone <dbl>,
# DirectChol <dbl>, TotChol <dbl>, UrineVol1 <int>, UrineFlow1 <dbl>, …
Take a sample first and then make the plot.
sample_n(NHANES, size = 1000) %>% ggplot(aes(x = Age, y = Height, color = Gender)) +
geom_point() +
geom_smooth() +
xlab("Age (years)") +
ylab("Height (cm)")
`geom_smooth()` using method = 'loess' and formula 'y ~ x'
Warning: Removed 31 rows containing non-finite values (stat_smooth).
Warning: Removed 31 rows containing missing values (geom_point).
Here is an alternative plot using all the data. This is hexbin plot.
%>% ggplot(aes(x = Age, y = Height, color = Gender)) +
NHANES geom_hex() +
geom_smooth() +
xlab("Age (years)") +
ylab("Height (cm)")
Warning: Removed 353 rows containing non-finite values (stat_binhex).
`geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Warning: Removed 353 rows containing non-finite values (stat_smooth).
library(mosaic)
Registered S3 method overwritten by 'mosaic':
method from
fortify.SpatialPolygonsDataFrame ggplot2
The 'mosaic' package masks several functions from core packages in order to add
additional features. The original behavior of these functions should not be affected by this.
Attaching package: 'mosaic'
The following object is masked from 'package:Matrix':
mean
The following objects are masked from 'package:dplyr':
count, do, tally
The following object is masked from 'package:purrr':
cross
The following object is masked from 'package:ggplot2':
stat
The following objects are masked from 'package:stats':
binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
quantile, sd, t.test, var
The following objects are masked from 'package:base':
max, mean, min, prod, range, sample, sum
head(NHANES)
# A tibble: 6 × 76
ID SurveyYr Gender Age AgeDecade AgeMonths Race1 Race3 Education
<int> <fct> <fct> <int> <fct> <int> <fct> <fct> <fct>
1 51624 2009_10 male 34 " 30-39" 409 White <NA> High School
2 51624 2009_10 male 34 " 30-39" 409 White <NA> High School
3 51624 2009_10 male 34 " 30-39" 409 White <NA> High School
4 51625 2009_10 male 4 " 0-9" 49 Other <NA> <NA>
5 51630 2009_10 female 49 " 40-49" 596 White <NA> Some College
6 51638 2009_10 male 9 " 0-9" 115 White <NA> <NA>
# … with 67 more variables: MaritalStatus <fct>, HHIncome <fct>,
# HHIncomeMid <int>, Poverty <dbl>, HomeRooms <int>, HomeOwn <fct>,
# Work <fct>, Weight <dbl>, Length <dbl>, HeadCirc <dbl>, Height <dbl>,
# BMI <dbl>, BMICatUnder20yrs <fct>, BMI_WHO <fct>, Pulse <int>,
# BPSysAve <int>, BPDiaAve <int>, BPSys1 <int>, BPDia1 <int>, BPSys2 <int>,
# BPDia2 <int>, BPSys3 <int>, BPDia3 <int>, Testosterone <dbl>,
# DirectChol <dbl>, TotChol <dbl>, UrineVol1 <int>, UrineFlow1 <dbl>, …
<- NHANES %>% select(AgeDecade, BMI_WHO)
NHANES2 head(NHANES2)
# A tibble: 6 × 2
AgeDecade BMI_WHO
<fct> <fct>
1 " 30-39" 30.0_plus
2 " 30-39" 30.0_plus
3 " 30-39" 30.0_plus
4 " 0-9" 12.0_18.5
5 " 40-49" 30.0_plus
6 " 0-9" 12.0_18.5
<- table(NHANES2)
NHANES2_table NHANES2_table
BMI_WHO
AgeDecade 12.0_18.5 18.5_to_24.9 25.0_to_29.9 30.0_plus
0-9 873 193 28 7
10-19 280 664 244 172
20-29 49 526 349 418
30-39 10 394 433 495
40-49 26 371 475 506
50-59 15 314 487 477
60-69 8 199 321 373
70+ 6 142 207 218
mosaicplot(NHANES2_table, color = TRUE)
Weather
library(macleish)
Loading required package: etl
head(whately_2015)
# A tibble: 6 × 8
when temperature wind_speed wind_dir rel_humidity pressure
<dttm> <dbl> <dbl> <dbl> <dbl> <int>
1 2015-01-01 00:00:00 -9.32 1.40 225. 54.6 985
2 2015-01-01 00:10:00 -9.46 1.51 248. 55.4 985
3 2015-01-01 00:20:00 -9.44 1.62 258. 56.2 985
4 2015-01-01 00:30:00 -9.3 1.14 244. 56.4 985
5 2015-01-01 00:40:00 -9.32 1.22 238. 56.9 984
6 2015-01-01 00:50:00 -9.34 1.09 242. 57.2 984
# … with 2 more variables: solar_radiation <dbl>, rainfall <dbl>
%>% ggplot(aes(x = when, y=temperature)) +
whately_2015 geom_line(color = "darkgrey") +
geom_smooth() +
xlab(NULL) +
ylab("Tempurature (degrees Fahrenheit)")
`geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Here is the link to the choroplethr website.
library(choroplethr)
Loading required package: acs
Loading required package: XML
Attaching package: 'acs'
The following object is masked from 'package:dplyr':
combine
The following object is masked from 'package:base':
apply
library(choroplethrMaps)
data(df_pop_state)
state_choropleth(df_pop_state,
title = "US 2012 State Population Estimates",
legend = "Population")
Warning in private$zoom == "alaska" || private$zoom == "hawaii": 'length(x) = 51
> 1' in coercion to 'logical(1)'
Warning in private$zoom == "alaska" || private$zoom == "hawaii": 'length(x) = 51
> 1' in coercion to 'logical(1)'
data(df_pop_county)
county_choropleth(df_pop_county,
title = "US 2012 County Population Estimates",
legend = "Population")
Warning in private$zoom == "alaska" || private$zoom == "hawaii": 'length(x) = 51
> 1' in coercion to 'logical(1)'
Warning in private$zoom == "alaska" || private$zoom == "hawaii": 'length(x) = 51
> 1' in coercion to 'logical(1)'
data(df_pop_country)
country_choropleth(df_pop_country, "2012 World Bank Populate Estimates")
Warning in self$bind(): The following regions were missing and are being set to
NA: namibia, western sahara, taiwan, antarctica, kosovo
Networks
Check out ggnetwork
Example 4.
library(ggnetwork)
library(network)
'network' 1.17.1 (2021-06-12), part of the Statnet Project
* 'news(package="network")' for changes since last version
* 'citation("network")' for citation information
* 'https://statnet.org' for help, support, and other information
library(sna)
Loading required package: statnet.common
Attaching package: 'statnet.common'
The following objects are masked from 'package:base':
attr, order
sna: Tools for Social Network Analysis
Version 2.6 created on 2020-10-5.
copyright (c) 2005, Carter T. Butts, University of California-Irvine
For citation information, type citation("sna").
Type help(package="sna") to get started.
<- network(rgraph(10, tprob = 0.2), directed = FALSE)
n
%v% "family" <- sample(letters[1:3], 10, replace = TRUE)
n %v% "importance" <- sample(1:3, 10, replace = TRUE)
n
<- network.edgecount(n)
e set.edge.attribute(n, "type", sample(letters[24:26], e, replace = TRUE))
set.edge.attribute(n, "day", sample(1:3, e, replace = TRUE))
ggnetwork(n, layout = "fruchtermanreingold", cell.jitter = 0.75)
x y family importance vertex.names xend yend day
1 0.0000000 0.3591656 a 2 7 0.1790696 0.5625896 2
3 0.3402737 0.6676026 b 3 5 0.3990675 0.2671296 3
4 0.3436387 0.4217009 c 1 8 0.6730268 0.3301077 3
5 0.3436387 0.4217009 c 1 8 0.3990675 0.2671296 2
6 0.3436387 0.4217009 c 1 8 0.0000000 0.3591656 1
7 0.3436387 0.4217009 c 1 8 0.3402737 0.6676026 1
8 0.3436387 0.4217009 c 1 8 0.5380735 0.6840422 2
9 0.3990675 0.2671296 a 1 4 0.1790696 0.5625896 1
10 0.3990675 0.2671296 a 1 4 0.6730268 0.3301077 1
11 0.5380735 0.6840422 a 1 6 0.1790696 0.5625896 1
12 0.5380735 0.6840422 a 1 6 0.3402737 0.6676026 1
13 0.5380735 0.6840422 a 1 6 0.6730268 0.3301077 1
14 0.5988985 0.0000000 a 3 9 0.3990675 0.2671296 2
15 0.5988985 0.0000000 a 3 9 0.6730268 0.3301077 1
16 0.6730268 0.3301077 c 2 3 1.0000000 0.2853717 3
17 0.7075406 1.0000000 b 1 10 0.5380735 0.6840422 2
18 1.0000000 0.2853717 a 3 1 1.0000000 0.2853717 NA
2 0.1790696 0.5625896 c 2 2 0.1790696 0.5625896 NA
31 0.6730268 0.3301077 c 2 3 0.6730268 0.3301077 NA
41 0.3990675 0.2671296 a 1 4 0.3990675 0.2671296 NA
51 0.3402737 0.6676026 b 3 5 0.3402737 0.6676026 NA
61 0.5380735 0.6840422 a 1 6 0.5380735 0.6840422 NA
71 0.0000000 0.3591656 a 2 7 0.0000000 0.3591656 NA
81 0.3436387 0.4217009 c 1 8 0.3436387 0.4217009 NA
91 0.5988985 0.0000000 a 3 9 0.5988985 0.0000000 NA
101 0.7075406 1.0000000 b 1 10 0.7075406 1.0000000 NA
type
1 y
3 x
4 z
5 x
6 x
7 z
8 y
9 x
10 z
11 z
12 y
13 x
14 x
15 x
16 z
17 y
18 <NA>
2 <NA>
31 <NA>
41 <NA>
51 <NA>
61 <NA>
71 <NA>
81 <NA>
91 <NA>
101 <NA>
ggnetwork(n, layout = "target", niter = 100)
x y family importance vertex.names xend yend
1 0.00000000 0.46498651 a 3 9 0.63884925 0.44298698
2 0.00000000 0.46498651 a 3 9 0.60812206 0.41357881
4 0.24558221 1.00000000 b 1 10 0.59395584 0.47774812
5 0.59395584 0.47774812 a 1 6 0.95382151 0.28021317
6 0.59395584 0.47774812 a 1 6 0.95933695 0.60247559
7 0.59395584 0.47774812 a 1 6 0.60812206 0.41357881
8 0.60812206 0.41357881 c 2 3 0.05122724 0.00000000
9 0.62449835 0.47004012 c 1 8 0.60812206 0.41357881
10 0.62449835 0.47004012 c 1 8 0.63884925 0.44298698
11 0.62449835 0.47004012 c 1 8 1.00000000 0.09447686
12 0.62449835 0.47004012 c 1 8 0.95933695 0.60247559
13 0.62449835 0.47004012 c 1 8 0.59395584 0.47774812
14 0.63884925 0.44298698 a 1 4 0.95382151 0.28021317
15 0.63884925 0.44298698 a 1 4 0.60812206 0.41357881
17 0.95933695 0.60247559 b 3 5 0.63884925 0.44298698
18 1.00000000 0.09447686 a 2 7 0.95382151 0.28021317
16 0.05122724 0.00000000 a 3 1 0.05122724 0.00000000
21 0.95382151 0.28021317 c 2 2 0.95382151 0.28021317
3 0.60812206 0.41357881 c 2 3 0.60812206 0.41357881
41 0.63884925 0.44298698 a 1 4 0.63884925 0.44298698
51 0.95933695 0.60247559 b 3 5 0.95933695 0.60247559
61 0.59395584 0.47774812 a 1 6 0.59395584 0.47774812
71 1.00000000 0.09447686 a 2 7 1.00000000 0.09447686
81 0.62449835 0.47004012 c 1 8 0.62449835 0.47004012
91 0.00000000 0.46498651 a 3 9 0.00000000 0.46498651
101 0.24558221 1.00000000 b 1 10 0.24558221 1.00000000
day type
1 2 x
2 1 x
4 2 y
5 1 z
6 1 y
7 1 x
8 3 z
9 3 z
10 2 x
11 1 x
12 1 z
13 2 y
14 1 x
15 1 z
17 3 x
18 2 y
16 NA <NA>
21 NA <NA>
3 NA <NA>
41 NA <NA>
51 NA <NA>
61 NA <NA>
71 NA <NA>
81 NA <NA>
91 NA <NA>
101 NA <NA>
head(ggnetwork(n))
x y family importance vertex.names xend yend day
2 0.3269990 0.7049237 c 2 3 0.0000000 0.7662902 3
3 0.4028999 1.0000000 a 3 9 0.5338945 0.6220794 2
4 0.4028999 1.0000000 a 3 9 0.3269990 0.7049237 1
5 0.4592103 0.0000000 a 2 7 0.6584518 0.2469472 2
6 0.4720168 0.3626698 c 1 8 0.3269990 0.7049237 3
7 0.4720168 0.3626698 c 1 8 0.5338945 0.6220794 2
type
2 z
3 x
4 x
5 y
6 z
7 x
tail(ggnetwork(n))
x y family importance vertex.names xend yend day
51 0.1104049 0.5085459 b 3 5 0.1104049 0.5085459 NA
61 0.3178598 0.3809046 a 1 6 0.3178598 0.3809046 NA
71 0.0000000 1.0000000 a 2 7 0.0000000 1.0000000 NA
81 0.2818331 0.7101530 c 1 8 0.2818331 0.7101530 NA
91 0.7790771 0.9340538 a 3 9 0.7790771 0.9340538 NA
101 0.3136755 0.0000000 b 1 10 0.3136755 0.0000000 NA
type
51 <NA>
61 <NA>
71 <NA>
81 <NA>
91 <NA>
101 <NA>
ggplot(n)
ggplot(ggnetwork(n))
ggplot(n, aes(x = x, y = y, xend = xend, yend = yend)) +
geom_edges(aes(linetype = type), color = "grey50") +
theme_blank()
ggplot(n, aes(x = x, y = y, xend = xend, yend = yend)) +
geom_edges(aes(linetype = type), color = "grey50", curvature = 0.1) +
theme_blank()
ggplot(n, aes(x = x, y = y, xend = xend, yend = yend)) +
geom_edges(aes(linetype = type), color = "grey50") +
geom_nodes(aes(color = family, size = importance)) +
theme_blank()
ggplot(n, aes(x = x, y = y, xend = xend, yend = yend)) +
geom_edges(color = "black") +
geom_nodes(color = "black", size = 8) +
geom_nodetext(aes(color = family, label = LETTERS[ vertex.names ]),
fontface = "bold") +
theme_blank()
Review Table 3.3 on page 47 for the different kinds of plots that can be made for different kinds of x, y variables.
Continue with the Extended example: Historical baby names on page 48.