library(pacman)
p_load(tidyverse)
Network Science
Chapter 20: Network Science
This is some of the code from Chapter 20. I was able to get the PageRank code to work.
library(mdsr)
library(igraph)
Attaching package: 'igraph'
The following objects are masked from 'package:lubridate':
%--%, union
The following objects are masked from 'package:dplyr':
as_data_frame, groups, union
The following objects are masked from 'package:purrr':
compose, simplify
The following object is masked from 'package:tidyr':
crossing
The following object is masked from 'package:tibble':
as_data_frame
The following objects are masked from 'package:stats':
decompose, spectrum
The following object is masked from 'package:fs':
path
The following object is masked from 'package:base':
union
<- 100
n <- log(n)/n
p_star <- function(n, p, ...) {
plot_er <- erdos.renyi.game(n, p)
g plot(g, main = paste("p =", round(p, 4)), vertex.frame.color = "white", vertex.size = 3, vertex.label = NA, ...)
}
plot_er(n, p = 0.8 * p_star)
plot_er(n, p = 1.2 * p_star)
<- 1000
n <- log(n)/n
p_star <- rep(seq(from = 0, to = 2 * p_star, by = 0.001), each = 100)
ps <- function(n, p, ...) {
er_connected c(n = n, p = p, connected = is.connected(erdos.renyi.game(n, p)))
}<- as.data.frame(t(sapply(ps, er_connected, n = n)))
sims
ggplot(data = sims, aes(x = p, y = connected)) +
geom_vline(xintercept = p_star, color = "darkgray") +
geom_text(x = p_star, y = 0.9, label = "Threshold value", hjust="right") + labs(x = "Probability of edge existing",
y = "Probability that random graph is
connected") + geom_count() +
geom_smooth()
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
<- erdos.renyi.game(n, p = log(n)/n)
g1 <- barabasi.game(n, m = 3, directed = FALSE)
g2
summary(g1)
IGRAPH 083eeb1 U--- 1000 3557 -- Erdos-Renyi (gnp) graph
+ attr: name (g/c), type (g/c), loops (g/l), p (g/n)
summary(g2)
IGRAPH cab8ec5 U--- 1000 2994 -- Barabasi graph
+ attr: name (g/c), power (g/n), m (g/n), zero.appeal (g/n), algorithm
| (g/c)
<- data.frame(type = rep(c("Erdos-Renyi", "Barabasi-Albert"), each = n), degree = c(degree(g1), degree(g2)))
d ggplot(data = d, aes(x = degree, color = type)) +
geom_density(size = 2) +
scale_x_continuous(limits = c(0, 25))
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
Warning: Removed 16 rows containing non-finite values (`stat_density()`).
Extended example: 1996 men’s college backetball
To get the example to run I downloaded the data from the kaggle website for the March Machine Learning Mania 2015. You need to have an account and your needs to log in to download the data. Note that the data file that is needed to run the example has a different name than the name in the book.
library(mdsr)
<- readr::read_csv("data/all/teams.csv") teams
Rows: 364 Columns: 2
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): team_name
dbl (1): team_id
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
<- readr::read_csv("data/all/regular_season_compact_results.csv") %>%
games filter(season == 1996)
Rows: 134566 Columns: 8
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): wloc
dbl (7): season, daynum, wteam, wscore, lteam, lscore, numot
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dim(games)
[1] 4122 8
<- games %>%
E mutate(score_ratio = wscore/lscore) %>%
select(lteam, wteam, score_ratio)
<- teams %>%
V filter(team_id %in% unique(c(E$lteam, E$wteam)))
library(igraph)
<- graph_from_data_frame(E, directed = TRUE, vertices = V)
g summary(g)
IGRAPH 2d3e1c2 DN-- 305 4122 --
+ attr: name (v/c), team_name (v/c), score_ratio (e/n)
<- set_vertex_attr(g, "pagerank", value = page_rank(g)$vector)
g as_data_frame(g, what = "vertices") %>%
arrange(desc(pagerank)) %>%
head(20)
name team_name pagerank
1203 1203 G Washington 0.021863566
1269 1269 Massachusetts 0.020504852
1207 1207 Georgetown 0.016415082
1234 1234 Iowa 0.014343587
1163 1163 Connecticut 0.014084209
1437 1437 Villanova 0.013089525
1246 1246 Kentucky 0.012735610
1345 1345 Purdue 0.011456116
1280 1280 Mississippi St 0.011368140
1210 1210 Georgia Tech 0.010578926
1112 1112 Arizona 0.010255324
1448 1448 Wake Forest 0.010081101
1242 1242 Kansas 0.009917330
1336 1336 Penn St 0.009752762
1185 1185 E Michigan 0.009707138
1393 1393 Syracuse 0.009563255
1266 1266 Marquette 0.009441184
1314 1314 North Carolina 0.009415234
1153 1153 Cincinnati 0.009396704
1396 1396 Temple 0.008595410
<- E %>%
wins group_by(wteam) %>%
summarise(N = n())
<- E %>%
losses group_by(lteam) %>%
summarise(N = n())
%>% full_join(losses, by = c("wteam" = "lteam")) %>%
wins left_join(teams, by = c("wteam" = "team_id")) %>%
rename(wins = N.x, losses = N.y) %>%
mutate(win_pct = wins / (wins + losses)) %>%
arrange(desc(win_pct)) %>%
head(20)
# A tibble: 20 × 5
wteam wins losses team_name win_pct
<dbl> <int> <int> <chr> <dbl>
1 1269 31 1 Massachusetts 0.969
2 1403 28 1 Texas Tech 0.966
3 1163 30 2 Connecticut 0.938
4 1246 28 2 Kentucky 0.933
5 1180 25 3 Drexel 0.893
6 1453 24 3 WI Green Bay 0.889
7 1158 22 3 Col Charleston 0.88
8 1307 26 4 New Mexico 0.867
9 1153 25 4 Cincinnati 0.862
10 1242 25 4 Kansas 0.862
11 1172 22 4 Davidson 0.846
12 1345 25 5 Purdue 0.833
13 1448 23 5 Wake Forest 0.821
14 1185 22 5 E Michigan 0.815
15 1439 22 5 Virginia Tech 0.815
16 1437 25 6 Villanova 0.806
17 1112 24 6 Arizona 0.8
18 1428 23 6 Utah 0.793
19 1265 22 6 Marist 0.786
20 1114 21 6 Ark Little Rock 0.778
%>% filter(wteam == 1269 & lteam == 1246) E
# A tibble: 1 × 3
lteam wteam score_ratio
<dbl> <dbl> <dbl>
1 1246 1269 1.12
%>% filter(lteam %in% c(1203, 1269) & wteam %in% c(1203, 1269)) E
# A tibble: 2 × 3
lteam wteam score_ratio
<dbl> <dbl> <dbl>
1 1269 1203 1.13
2 1203 1269 1.14
<- c("Massachusetts", "Temple", "G Washington", "Rhode Island", "St Bonaventure", "St Joseph's PA", "Virginia Tech", "Xavier", "Dayton", "Duquesne", "La Salle", "Fordhham") A_10
<- V(g)[ team_name %in% A_10 ]
a10
<- induced_subgraph(g, vids = a10)
a
<- set_vertex_attr(a, "pagerank", value = page_rank(a)$vector)
a summary(a)
IGRAPH 03a9a19 DN-- 11 90 --
+ attr: name (v/c), team_name (v/c), pagerank (v/n), score_ratio (e/n)
library(ggnetwork)
library(intergraph)
<- ggnetwork(a) a_df
Warning in format_fortify(model = model, nodes = nodes, weights = "none", :
duplicated edges detected
ggplot(a_df, aes(x, y, xend = xend, yend = yend)) +
geom_edges(aes(alpha = score_ratio), color = "lightgray", arrow = arrow(length = unit(0.2, "cm")), curvature = 0.2) +
geom_nodes(aes(size = pagerank, color = pagerank), alpha = 0.6) + geom_nodetext(aes(label = team_name)) +
scale_alpha_continuous(range = c(0.4, 1)) +
scale_size_continuous(range = c(1, 10)) +
guides(color = guide_legend("PageRank"), size=guide_legend("PageRank")) +
theme_blank()
<- t(as_adjacency_matrix(a, sparse = FALSE, attr = "score_ratio"))
P <- scale(P, center = FALSE, scale = colSums(P))
P round(P, 2)
1173 1182 1203 1247 1269 1348 1382 1386 1396 1439 1462
1173 0.00 0.09 0.00 0.10 0 0.14 0.11 0.00 0.00 0.00 0.16
1182 0.11 0.00 0.00 0.11 0 0.00 0.00 0.00 0.00 0.00 0.00
1203 0.14 0.12 0.00 0.10 1 0.14 0.11 0.17 0.33 0.27 0.16
1247 0.00 0.09 0.25 0.00 0 0.00 0.12 0.00 0.00 0.00 0.00
1269 0.14 0.09 0.26 0.12 0 0.14 0.12 0.16 0.41 0.25 0.15
1348 0.00 0.10 0.00 0.11 0 0.00 0.13 0.16 0.26 0.21 0.18
1382 0.13 0.08 0.00 0.00 0 0.14 0.00 0.00 0.00 0.00 0.00
1386 0.13 0.09 0.24 0.10 0 0.14 0.10 0.00 0.00 0.00 0.00
1396 0.14 0.15 0.00 0.13 0 0.15 0.10 0.16 0.00 0.27 0.19
1439 0.11 0.10 0.25 0.12 0 0.14 0.11 0.17 0.00 0.00 0.15
1462 0.11 0.09 0.00 0.11 0 0.00 0.12 0.18 0.00 0.00 0.00
attr(,"scaled:scale")
1173 1182 1203 1247 1269 1348 1382 1386
9.580291 12.193997 4.385632 10.705995 1.131579 7.620903 10.466131 6.573151
1396 1439 1462
4.110908 5.112315 6.894009
<- rep(1, vcount(a)) / vcount(a)
v0 v0
[1] 0.09090909 0.09090909 0.09090909 0.09090909 0.09090909 0.09090909
[7] 0.09090909 0.09090909 0.09090909 0.09090909 0.09090909
<- v0
v for (i in 1:20) {
<- P %*% v
v
}
as.vector(v)
[1] 0.02620462 0.01073061 0.28880610 0.07414360 0.18416968 0.07731754
[7] 0.01477331 0.09291498 0.08133820 0.11927412 0.03032723
page_rank(a)$vect
1173 1182 1203 1247 1269 1348 1382
0.03683789 0.02134844 0.25160534 0.07006017 0.18793707 0.07779895 0.02541142
1386 1396 1439 1462
0.08807760 0.09042641 0.11095533 0.03954137
page_rank(a, damping = 1)$vec
Warning in page_rank(a, damping = 1): At core/centrality/prpack.cpp:82 :
Damping factor is 1. Damping values close to 1 may lead to numerical
instability when using PRPACK.
1173 1182 1203 1247 1269 1348
0.023832141 0.008035839 0.288239781 0.073484127 0.204591471 0.072857810
1382 1386 1396 1439 1462
0.011052515 0.091039461 0.084180455 0.115619832 0.027066568
<- v0
w <- 0.85
d for (i in 1:20) {
<- d * P %*% w + (1 - d) * v0
w
}as.vector(w)
[1] 0.04033680 0.02382045 0.25181394 0.07127669 0.16836965 0.08332106
[7] 0.02933482 0.08934652 0.08765190 0.11231744 0.04241073
page_rank(a, damping = 0.85)$vector
1173 1182 1203 1247 1269 1348 1382
0.03683789 0.02134844 0.25160534 0.07006017 0.18793707 0.07779895 0.02541142
1386 1396 1439 1462
0.08807760 0.09042641 0.11095533 0.03954137