Stat. 651: ggplot2

Author

Prof. Eric A. Suess

ggplot2 examples

library(tidyverse)
library(mdsr)
library(mosaicData)

CIACounties

Make the base plot g and then add different layers on to it.

head(CIACountries)
         country      pop    area oil_prod   gdp educ   roadways net_users
1    Afghanistan 32564342  652230        0  1900   NA 0.06462444       >5%
2        Albania  3029278   28748    20510 11900  3.3 0.62613051      >35%
3        Algeria 39542166 2381741  1420000 14500  4.3 0.04771929      >15%
4 American Samoa    54343     199        0 13000   NA 1.21105528      <NA>
5        Andorra    85580     468       NA 37200   NA 0.68376068      >60%
6         Angola 19625353 1246700  1742000  7300  3.5 0.04125211      >15%
# base plot g

g <- CIACountries |>  ggplot(aes(y = gdp, x = educ)) 

g + geom_point()
Warning: Removed 64 rows containing missing values (`geom_point()`).

g + geom_point(size = 3)
Warning: Removed 64 rows containing missing values (`geom_point()`).

g + geom_point(aes(color = net_users), size = 3)
Warning: Removed 64 rows containing missing values (`geom_point()`).

# no geom_point used for the next picture

g + geom_text( aes(label = country, color = net_users), size = 3 )
Warning: Removed 64 rows containing missing values (`geom_text()`).

g + geom_point( aes(color = net_users, size = roadways) )
Warning: Removed 66 rows containing missing values (`geom_point()`).

Change the scales

g + geom_point(aes(color = net_users, size = roadways)) +
  coord_trans( y = "log10")
Warning: Removed 66 rows containing missing values (`geom_point()`).

g + 
  geom_point(aes(color = net_users, size = roadways)) + 
  scale_y_continuous(
    name = "Gross Domestic Product", 
    trans = "log10",
    labels = scales::comma
  )
Warning: Removed 66 rows containing missing values (`geom_point()`).

Faceting

g + geom_point(alpha = 0.9, aes(size = roadways)) +
  coord_trans(y = "log10") +
  facet_wrap( ~ net_users, nrow = 1) +
  theme(legend.position = "top")
Warning: Removed 66 rows containing missing values (`geom_point()`).

g + geom_point(alpha = 0.9, aes(size = roadways)) +
  coord_trans(y = "log10") +
  scale_y_continuous(name = "Gross Domestic Product", trans = "log10") +
  facet_wrap( ~ net_users, nrow = 1) +
  theme(legend.position = "top")
Warning: Removed 66 rows containing missing values (`geom_point()`).

Export the data and try in Tableau

getwd()
[1] "/home/esuess/classes/2023-2024/Fall 2023/Stat651/website/_presentations/02_ggplot2"
write_csv(CIACountries, "CIACountries.csv")

MedicareCharges

Check out the MEPS website for more real data.

# head(MedicareCharges) # This now causes an error, remove the grouping.

? MedicareCharges

ChargesNJ <- MedicareCharges |> 
  filter(stateProvider == "NJ")
p <- ggplot(
  data = ChargesNJ, 
  aes(x = reorder(drg, mean_charge), y = mean_charge)
) +
  geom_col(fill = "gray") +
  ylab("Statewide Average Charges ($)") + 
  xlab("Medical Procedure (DRG)") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = rel(0.5)))
p

Now add the overall data to the plot to compare with NJ.

p <- p + geom_point(data = MedicareCharges, size = 1, alpha = 0.3) 
p

SAT

Here is the link to the College Board SAT website.

g <- SAT_2010 |> ggplot(aes(x = math))

g + geom_histogram()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

g + geom_histogram(binwidth = 10) + labs(x = "Average math SAT score")

g + geom_density(adjust = 0.3)

ggplot(
  data = head(SAT_2010, 10), 
  aes(x = reorder(state, math), y = math)
) +
  geom_col() +
  labs(x = "State", y = "Average math SAT score")

HELPPrct

Here is the link to the NSDUH website.

ggplot(data = mosaicData::HELPrct, aes(x = homeless)) + 
  geom_bar(aes(fill = substance), position = "fill") +
  scale_fill_brewer(palette = "Spectral") + 
  coord_flip()

Scatterplot with tend lines

g <- ggplot(
  data = SAT_2010, 
  aes(x = expenditure, y = math)
) + 
  geom_point()
g

g <- g + geom_smooth(method="lm", se = 0) +
  xlab("Average expenditure per student ($100)") +
  ylab("Average score on math SAT")
g
`geom_smooth()` using formula = 'y ~ x'

Add the trend line within groups representing rate of taking the test.

SAT_2010 <- SAT_2010|> 
  mutate(
    SAT_rate = cut(
      sat_pct, 
      breaks = c(0, 30, 60, 100), 
      labels = c("low", "medium", "high")
    )
  )

g <- g %+% SAT_2010
g
`geom_smooth()` using formula = 'y ~ x'

g + aes(color = SAT_rate)
`geom_smooth()` using formula = 'y ~ x'

g + facet_wrap( ~ SAT_rate)
`geom_smooth()` using formula = 'y ~ x'

NHANES

Here is the link to the NHANES website.

library(NHANES)

head(NHANES)
# A tibble: 6 × 76
     ID SurveyYr Gender   Age AgeDecade AgeMonths Race1 Race3 Education   
  <int> <fct>    <fct>  <int> <fct>         <int> <fct> <fct> <fct>       
1 51624 2009_10  male      34 " 30-39"        409 White <NA>  High School 
2 51624 2009_10  male      34 " 30-39"        409 White <NA>  High School 
3 51624 2009_10  male      34 " 30-39"        409 White <NA>  High School 
4 51625 2009_10  male       4 " 0-9"           49 Other <NA>  <NA>        
5 51630 2009_10  female    49 " 40-49"        596 White <NA>  Some College
6 51638 2009_10  male       9 " 0-9"          115 White <NA>  <NA>        
# ℹ 67 more variables: MaritalStatus <fct>, HHIncome <fct>, HHIncomeMid <int>,
#   Poverty <dbl>, HomeRooms <int>, HomeOwn <fct>, Work <fct>, Weight <dbl>,
#   Length <dbl>, HeadCirc <dbl>, Height <dbl>, BMI <dbl>,
#   BMICatUnder20yrs <fct>, BMI_WHO <fct>, Pulse <int>, BPSysAve <int>,
#   BPDiaAve <int>, BPSys1 <int>, BPDia1 <int>, BPSys2 <int>, BPDia2 <int>,
#   BPSys3 <int>, BPDia3 <int>, Testosterone <dbl>, DirectChol <dbl>,
#   TotChol <dbl>, UrineVol1 <int>, UrineFlow1 <dbl>, UrineVol2 <int>, …

Take a sample first and then make the plot.

library(NHANES)
ggplot(
  data = slice_sample(NHANES, n = 1000), 
  aes(x = Age, y = Height, color = fct_relevel(Gender, "male"))
) + 
  geom_point() + 
  geom_smooth() + 
  xlab("Age (years)") + 
  ylab("Height (cm)") +
  labs(color = "Gender")
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Warning: Removed 34 rows containing non-finite values (`stat_smooth()`).
Warning: Removed 34 rows containing missing values (`geom_point()`).

Here is an alternative plot using all the data. This is hexbin plot.

NHANES |> ggplot(aes(x = Age, y = Height, color = Gender)) +
  geom_hex() +
  geom_smooth() +
  xlab("Age (years)") +
  ylab("Height (cm)")
Warning: Removed 353 rows containing non-finite values (`stat_binhex()`).
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 353 rows containing non-finite values (`stat_smooth()`).

library(mosaic)
Registered S3 method overwritten by 'mosaic':
  method                           from   
  fortify.SpatialPolygonsDataFrame ggplot2

The 'mosaic' package masks several functions from core packages in order to add 
additional features.  The original behavior of these functions should not be affected by this.

Attaching package: 'mosaic'
The following object is masked from 'package:Matrix':

    mean
The following objects are masked from 'package:dplyr':

    count, do, tally
The following object is masked from 'package:purrr':

    cross
The following object is masked from 'package:ggplot2':

    stat
The following objects are masked from 'package:stats':

    binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
    quantile, sd, t.test, var
The following objects are masked from 'package:base':

    max, mean, min, prod, range, sample, sum
head(NHANES)
# A tibble: 6 × 76
     ID SurveyYr Gender   Age AgeDecade AgeMonths Race1 Race3 Education   
  <int> <fct>    <fct>  <int> <fct>         <int> <fct> <fct> <fct>       
1 51624 2009_10  male      34 " 30-39"        409 White <NA>  High School 
2 51624 2009_10  male      34 " 30-39"        409 White <NA>  High School 
3 51624 2009_10  male      34 " 30-39"        409 White <NA>  High School 
4 51625 2009_10  male       4 " 0-9"           49 Other <NA>  <NA>        
5 51630 2009_10  female    49 " 40-49"        596 White <NA>  Some College
6 51638 2009_10  male       9 " 0-9"          115 White <NA>  <NA>        
# ℹ 67 more variables: MaritalStatus <fct>, HHIncome <fct>, HHIncomeMid <int>,
#   Poverty <dbl>, HomeRooms <int>, HomeOwn <fct>, Work <fct>, Weight <dbl>,
#   Length <dbl>, HeadCirc <dbl>, Height <dbl>, BMI <dbl>,
#   BMICatUnder20yrs <fct>, BMI_WHO <fct>, Pulse <int>, BPSysAve <int>,
#   BPDiaAve <int>, BPSys1 <int>, BPDia1 <int>, BPSys2 <int>, BPDia2 <int>,
#   BPSys3 <int>, BPDia3 <int>, Testosterone <dbl>, DirectChol <dbl>,
#   TotChol <dbl>, UrineVol1 <int>, UrineFlow1 <dbl>, UrineVol2 <int>, …
NHANES2 <- NHANES |> select(AgeDecade, BMI_WHO) 
head(NHANES2)
# A tibble: 6 × 2
  AgeDecade BMI_WHO  
  <fct>     <fct>    
1 " 30-39"  30.0_plus
2 " 30-39"  30.0_plus
3 " 30-39"  30.0_plus
4 " 0-9"    12.0_18.5
5 " 40-49"  30.0_plus
6 " 0-9"    12.0_18.5
NHANES2_table <- table(NHANES2)
NHANES2_table
         BMI_WHO
AgeDecade 12.0_18.5 18.5_to_24.9 25.0_to_29.9 30.0_plus
    0-9         873          193           28         7
    10-19       280          664          244       172
    20-29        49          526          349       418
    30-39        10          394          433       495
    40-49        26          371          475       506
    50-59        15          314          487       477
    60-69         8          199          321       373
    70+           6          142          207       218
mosaicplot(NHANES2_table, color = TRUE)

Weather

library(macleish)
Loading required package: etl
head(whately_2015)
# A tibble: 6 × 8
  when                temperature wind_speed wind_dir rel_humidity pressure
  <dttm>                    <dbl>      <dbl>    <dbl>        <dbl>    <int>
1 2015-01-01 00:00:00       -9.32       1.40     225.         54.6      985
2 2015-01-01 00:10:00       -9.46       1.51     248.         55.4      985
3 2015-01-01 00:20:00       -9.44       1.62     258.         56.2      985
4 2015-01-01 00:30:00       -9.3        1.14     244.         56.4      985
5 2015-01-01 00:40:00       -9.32       1.22     238.         56.9      984
6 2015-01-01 00:50:00       -9.34       1.09     242.         57.2      984
# ℹ 2 more variables: solar_radiation <dbl>, rainfall <dbl>
whately_2015 |> ggplot(aes(x = when, y=temperature)) +
  geom_line(color = "darkgrey") +
  geom_smooth() +
  xlab(NULL) +
  ylab("Tempurature (degrees Fahrenheit)")
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Here is the link to the choroplethr website.

library(choroplethr)
Loading required package: acs
Loading required package: XML

Attaching package: 'acs'
The following object is masked from 'package:dplyr':

    combine
The following object is masked from 'package:base':

    apply
The legacy packages maptools, rgdal, and rgeos, underpinning the sp package,
which was just loaded, were retired in October 2023.
Please refer to R-spatial evolution reports for details, especially
https://r-spatial.org/r/2023/05/15/evolution4.html.
It may be desirable to make the sf package available;
package maintainers should consider adding sf to Suggests:.
library(choroplethrMaps)

    
data(df_pop_state)
state_choropleth(df_pop_state, 
                 title  = "US 2012 State Population Estimates", 
                 legend = "Population")

data(df_pop_county)
county_choropleth(df_pop_county, 
                  title  = "US 2012 County Population Estimates", 
                  legend = "Population")

data(df_pop_country)
country_choropleth(df_pop_country, "2012 World Bank Populate Estimates")
Warning in self$bind(): The following regions were missing and are being set to
NA: namibia, western sahara, taiwan, antarctica, kosovo

Review Table 3.3 on page 47 for the different kinds of plots that can be made for different kinds of x, y variables.

Continue with the Extended example: Historical baby names on page 48.