library(tidyverse)
library(mdsr)
library(mosaicData)
Stat. 651: ggplot2
ggplot2 examples
CIACounties
Make the base plot g and then add different layers on to it.
head(CIACountries)
country pop area oil_prod gdp educ roadways net_users
1 Afghanistan 32564342 652230 0 1900 NA 0.06462444 >5%
2 Albania 3029278 28748 20510 11900 3.3 0.62613051 >35%
3 Algeria 39542166 2381741 1420000 14500 4.3 0.04771929 >15%
4 American Samoa 54343 199 0 13000 NA 1.21105528 <NA>
5 Andorra 85580 468 NA 37200 NA 0.68376068 >60%
6 Angola 19625353 1246700 1742000 7300 3.5 0.04125211 >15%
# base plot g
<- CIACountries |> ggplot(aes(y = gdp, x = educ))
g
+ geom_point() g
Warning: Removed 64 rows containing missing values (`geom_point()`).
+ geom_point(size = 3) g
Warning: Removed 64 rows containing missing values (`geom_point()`).
+ geom_point(aes(color = net_users), size = 3) g
Warning: Removed 64 rows containing missing values (`geom_point()`).
# no geom_point used for the next picture
+ geom_text( aes(label = country, color = net_users), size = 3 ) g
Warning: Removed 64 rows containing missing values (`geom_text()`).
+ geom_point( aes(color = net_users, size = roadways) ) g
Warning: Removed 66 rows containing missing values (`geom_point()`).
Change the scales
+ geom_point(aes(color = net_users, size = roadways)) +
g coord_trans( y = "log10")
Warning: Removed 66 rows containing missing values (`geom_point()`).
+
g geom_point(aes(color = net_users, size = roadways)) +
scale_y_continuous(
name = "Gross Domestic Product",
trans = "log10",
labels = scales::comma
)
Warning: Removed 66 rows containing missing values (`geom_point()`).
Faceting
+ geom_point(alpha = 0.9, aes(size = roadways)) +
g coord_trans(y = "log10") +
facet_wrap( ~ net_users, nrow = 1) +
theme(legend.position = "top")
Warning: Removed 66 rows containing missing values (`geom_point()`).
+ geom_point(alpha = 0.9, aes(size = roadways)) +
g coord_trans(y = "log10") +
scale_y_continuous(name = "Gross Domestic Product", trans = "log10") +
facet_wrap( ~ net_users, nrow = 1) +
theme(legend.position = "top")
Warning: Removed 66 rows containing missing values (`geom_point()`).
Export the data and try in Tableau
getwd()
[1] "/home/esuess/classes/2023-2024/Fall 2023/Stat651/website/_presentations/02_ggplot2"
write_csv(CIACountries, "CIACountries.csv")
MedicareCharges
Check out the MEPS website for more real data.
# head(MedicareCharges) # This now causes an error, remove the grouping.
? MedicareCharges
<- MedicareCharges |>
ChargesNJ filter(stateProvider == "NJ")
<- ggplot(
p data = ChargesNJ,
aes(x = reorder(drg, mean_charge), y = mean_charge)
+
) geom_col(fill = "gray") +
ylab("Statewide Average Charges ($)") +
xlab("Medical Procedure (DRG)") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, size = rel(0.5)))
p
Now add the overall data to the plot to compare with NJ.
<- p + geom_point(data = MedicareCharges, size = 1, alpha = 0.3)
p p
SAT
Here is the link to the College Board SAT website.
<- SAT_2010 |> ggplot(aes(x = math))
g
+ geom_histogram() g
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
+ geom_histogram(binwidth = 10) + labs(x = "Average math SAT score") g
+ geom_density(adjust = 0.3) g
ggplot(
data = head(SAT_2010, 10),
aes(x = reorder(state, math), y = math)
+
) geom_col() +
labs(x = "State", y = "Average math SAT score")
HELPPrct
Here is the link to the NSDUH website.
ggplot(data = mosaicData::HELPrct, aes(x = homeless)) +
geom_bar(aes(fill = substance), position = "fill") +
scale_fill_brewer(palette = "Spectral") +
coord_flip()
Scatterplot with tend lines
<- ggplot(
g data = SAT_2010,
aes(x = expenditure, y = math)
+
) geom_point()
g
<- g + geom_smooth(method="lm", se = 0) +
g xlab("Average expenditure per student ($100)") +
ylab("Average score on math SAT")
g
`geom_smooth()` using formula = 'y ~ x'
Add the trend line within groups representing rate of taking the test.
<- SAT_2010|>
SAT_2010 mutate(
SAT_rate = cut(
sat_pct, breaks = c(0, 30, 60, 100),
labels = c("low", "medium", "high")
)
)
<- g %+% SAT_2010
g g
`geom_smooth()` using formula = 'y ~ x'
+ aes(color = SAT_rate) g
`geom_smooth()` using formula = 'y ~ x'
+ facet_wrap( ~ SAT_rate) g
`geom_smooth()` using formula = 'y ~ x'
NHANES
Here is the link to the NHANES website.
library(NHANES)
head(NHANES)
# A tibble: 6 × 76
ID SurveyYr Gender Age AgeDecade AgeMonths Race1 Race3 Education
<int> <fct> <fct> <int> <fct> <int> <fct> <fct> <fct>
1 51624 2009_10 male 34 " 30-39" 409 White <NA> High School
2 51624 2009_10 male 34 " 30-39" 409 White <NA> High School
3 51624 2009_10 male 34 " 30-39" 409 White <NA> High School
4 51625 2009_10 male 4 " 0-9" 49 Other <NA> <NA>
5 51630 2009_10 female 49 " 40-49" 596 White <NA> Some College
6 51638 2009_10 male 9 " 0-9" 115 White <NA> <NA>
# ℹ 67 more variables: MaritalStatus <fct>, HHIncome <fct>, HHIncomeMid <int>,
# Poverty <dbl>, HomeRooms <int>, HomeOwn <fct>, Work <fct>, Weight <dbl>,
# Length <dbl>, HeadCirc <dbl>, Height <dbl>, BMI <dbl>,
# BMICatUnder20yrs <fct>, BMI_WHO <fct>, Pulse <int>, BPSysAve <int>,
# BPDiaAve <int>, BPSys1 <int>, BPDia1 <int>, BPSys2 <int>, BPDia2 <int>,
# BPSys3 <int>, BPDia3 <int>, Testosterone <dbl>, DirectChol <dbl>,
# TotChol <dbl>, UrineVol1 <int>, UrineFlow1 <dbl>, UrineVol2 <int>, …
Take a sample first and then make the plot.
library(NHANES)
ggplot(
data = slice_sample(NHANES, n = 1000),
aes(x = Age, y = Height, color = fct_relevel(Gender, "male"))
+
) geom_point() +
geom_smooth() +
xlab("Age (years)") +
ylab("Height (cm)") +
labs(color = "Gender")
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Warning: Removed 34 rows containing non-finite values (`stat_smooth()`).
Warning: Removed 34 rows containing missing values (`geom_point()`).
Here is an alternative plot using all the data. This is hexbin plot.
|> ggplot(aes(x = Age, y = Height, color = Gender)) +
NHANES geom_hex() +
geom_smooth() +
xlab("Age (years)") +
ylab("Height (cm)")
Warning: Removed 353 rows containing non-finite values (`stat_binhex()`).
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 353 rows containing non-finite values (`stat_smooth()`).
library(mosaic)
Registered S3 method overwritten by 'mosaic':
method from
fortify.SpatialPolygonsDataFrame ggplot2
The 'mosaic' package masks several functions from core packages in order to add
additional features. The original behavior of these functions should not be affected by this.
Attaching package: 'mosaic'
The following object is masked from 'package:Matrix':
mean
The following objects are masked from 'package:dplyr':
count, do, tally
The following object is masked from 'package:purrr':
cross
The following object is masked from 'package:ggplot2':
stat
The following objects are masked from 'package:stats':
binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
quantile, sd, t.test, var
The following objects are masked from 'package:base':
max, mean, min, prod, range, sample, sum
head(NHANES)
# A tibble: 6 × 76
ID SurveyYr Gender Age AgeDecade AgeMonths Race1 Race3 Education
<int> <fct> <fct> <int> <fct> <int> <fct> <fct> <fct>
1 51624 2009_10 male 34 " 30-39" 409 White <NA> High School
2 51624 2009_10 male 34 " 30-39" 409 White <NA> High School
3 51624 2009_10 male 34 " 30-39" 409 White <NA> High School
4 51625 2009_10 male 4 " 0-9" 49 Other <NA> <NA>
5 51630 2009_10 female 49 " 40-49" 596 White <NA> Some College
6 51638 2009_10 male 9 " 0-9" 115 White <NA> <NA>
# ℹ 67 more variables: MaritalStatus <fct>, HHIncome <fct>, HHIncomeMid <int>,
# Poverty <dbl>, HomeRooms <int>, HomeOwn <fct>, Work <fct>, Weight <dbl>,
# Length <dbl>, HeadCirc <dbl>, Height <dbl>, BMI <dbl>,
# BMICatUnder20yrs <fct>, BMI_WHO <fct>, Pulse <int>, BPSysAve <int>,
# BPDiaAve <int>, BPSys1 <int>, BPDia1 <int>, BPSys2 <int>, BPDia2 <int>,
# BPSys3 <int>, BPDia3 <int>, Testosterone <dbl>, DirectChol <dbl>,
# TotChol <dbl>, UrineVol1 <int>, UrineFlow1 <dbl>, UrineVol2 <int>, …
<- NHANES |> select(AgeDecade, BMI_WHO)
NHANES2 head(NHANES2)
# A tibble: 6 × 2
AgeDecade BMI_WHO
<fct> <fct>
1 " 30-39" 30.0_plus
2 " 30-39" 30.0_plus
3 " 30-39" 30.0_plus
4 " 0-9" 12.0_18.5
5 " 40-49" 30.0_plus
6 " 0-9" 12.0_18.5
<- table(NHANES2)
NHANES2_table NHANES2_table
BMI_WHO
AgeDecade 12.0_18.5 18.5_to_24.9 25.0_to_29.9 30.0_plus
0-9 873 193 28 7
10-19 280 664 244 172
20-29 49 526 349 418
30-39 10 394 433 495
40-49 26 371 475 506
50-59 15 314 487 477
60-69 8 199 321 373
70+ 6 142 207 218
mosaicplot(NHANES2_table, color = TRUE)
Weather
library(macleish)
Loading required package: etl
head(whately_2015)
# A tibble: 6 × 8
when temperature wind_speed wind_dir rel_humidity pressure
<dttm> <dbl> <dbl> <dbl> <dbl> <int>
1 2015-01-01 00:00:00 -9.32 1.40 225. 54.6 985
2 2015-01-01 00:10:00 -9.46 1.51 248. 55.4 985
3 2015-01-01 00:20:00 -9.44 1.62 258. 56.2 985
4 2015-01-01 00:30:00 -9.3 1.14 244. 56.4 985
5 2015-01-01 00:40:00 -9.32 1.22 238. 56.9 984
6 2015-01-01 00:50:00 -9.34 1.09 242. 57.2 984
# ℹ 2 more variables: solar_radiation <dbl>, rainfall <dbl>
|> ggplot(aes(x = when, y=temperature)) +
whately_2015 geom_line(color = "darkgrey") +
geom_smooth() +
xlab(NULL) +
ylab("Tempurature (degrees Fahrenheit)")
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Here is the link to the choroplethr website.
library(choroplethr)
Loading required package: acs
Loading required package: XML
Attaching package: 'acs'
The following object is masked from 'package:dplyr':
combine
The following object is masked from 'package:base':
apply
The legacy packages maptools, rgdal, and rgeos, underpinning the sp package,
which was just loaded, were retired in October 2023.
Please refer to R-spatial evolution reports for details, especially
https://r-spatial.org/r/2023/05/15/evolution4.html.
It may be desirable to make the sf package available;
package maintainers should consider adding sf to Suggests:.
library(choroplethrMaps)
data(df_pop_state)
state_choropleth(df_pop_state,
title = "US 2012 State Population Estimates",
legend = "Population")
data(df_pop_county)
county_choropleth(df_pop_county,
title = "US 2012 County Population Estimates",
legend = "Population")
data(df_pop_country)
country_choropleth(df_pop_country, "2012 World Bank Populate Estimates")
Warning in self$bind(): The following regions were missing and are being set to
NA: namibia, western sahara, taiwan, antarctica, kosovo
Review Table 3.3 on page 47 for the different kinds of plots that can be made for different kinds of x, y variables.
Continue with the Extended example: Historical baby names on page 48.