This is R code from Modern Data Science with R, Chapter 15, "Text as data".
Section 15.2, "Analyzing textual data", contains an example in which research papers related to Data Science are downloaded from arXiv (using the aRxiv package) and summarized.
library(tidyverse)
library(mdsr)
library(aRxiv)
# Search arXiv for the exact phrase "Data Science", keeping at most 200 hits.
DataSciencePapers <- arxiv_search(
  query = '"Data Science"',
  limit = 200
)
retrieved batch 1
retrieved batch 2
# Peek at the first few search results.
DataSciencePapers %>% head()
library(lubridate)
# Parse the submission and update timestamps with lubridate's ymd_hms(),
# then print a column-by-column overview of the result.
DataSciencePapers <- mutate(
  DataSciencePapers,
  submitted = ymd_hms(submitted),
  updated = ymd_hms(updated)
)
DataSciencePapers %>% glimpse()
Observations: 200
Variables: 15
$ id <chr> "astro-ph/0701361v1", "0901.2805v1", "0901.3118v2", "0909.3895v1", "1...
$ submitted <dttm> 2007-01-12 03:28:11, 2009-01-19 10:38:33, 2009-01-20 18:48:59, 2009-...
$ updated <dttm> 2007-01-12 03:28:11, 2009-01-19 10:38:33, 2009-01-24 19:23:47, 2009-...
$ title <chr> "How to Make the Dream Come True: The Astronomers' Data Manifesto", "...
$ abstract <chr> " Astronomy is one of the most data-intensive of the sciences. Data ...
$ authors <chr> "Ray P Norris", "Heinz Andernach", "O. V. Verkhodanov|S. A. Trushkin|...
$ affiliations <chr> "", "", "Special Astrophysical Observatory, Nizhnij Arkhyz, Karachaj-...
$ link_abstract <chr> "http://arxiv.org/abs/astro-ph/0701361v1", "http://arxiv.org/abs/0901...
$ link_pdf <chr> "http://arxiv.org/pdf/astro-ph/0701361v1", "http://arxiv.org/pdf/0901...
$ link_doi <chr> "", "http://dx.doi.org/10.2481/dsj.8.41", "http://dx.doi.org/10.2481/...
$ comment <chr> "Submitted to Data Science Journal Presented at CODATA, Beijing,\n O...
$ journal_ref <chr> "", "", "", "", "EPJ Data Science, 1:9, 2012", "", "EPJ Data Science ...
$ doi <chr> "", "10.2481/dsj.8.41", "10.2481/dsj.8.34", "", "10.1140/epjds9", "10...
$ primary_category <chr> "astro-ph", "astro-ph.IM", "astro-ph.IM", "astro-ph.IM", "cs.SI", "as...
$ categories <chr> "astro-ph", "astro-ph.IM|astro-ph.CO", "astro-ph.IM|astro-ph.CO", "as...
# Count papers by the calendar year in which they were submitted.
tally(
  ~ year(submitted),
  data = DataSciencePapers
)
year(submitted)
2007 2009 2011 2012 2013 2014 2015 2016 2017
1 3 3 7 12 25 52 89 8
# Inspect the records whose submission year is 2007.
glimpse(
  filter(DataSciencePapers, year(submitted) == 2007)
)
Observations: 1
Variables: 15
$ id <chr> "astro-ph/0701361v1"
$ submitted <dttm> 2007-01-12 03:28:11
$ updated <dttm> 2007-01-12 03:28:11
$ title <chr> "How to Make the Dream Come True: The Astronomers' Data Manifesto"
$ abstract <chr> " Astronomy is one of the most data-intensive of the sciences. Data ...
$ authors <chr> "Ray P Norris"
$ affiliations <chr> ""
$ link_abstract <chr> "http://arxiv.org/abs/astro-ph/0701361v1"
$ link_pdf <chr> "http://arxiv.org/pdf/astro-ph/0701361v1"
$ link_doi <chr> ""
$ comment <chr> "Submitted to Data Science Journal Presented at CODATA, Beijing,\n O...
$ journal_ref <chr> ""
$ doi <chr> ""
$ primary_category <chr> "astro-ph"
$ categories <chr> "astro-ph"
# Count papers by their primary arXiv category.
tally(
  ~ primary_category,
  data = DataSciencePapers
)
primary_category
astro-ph astro-ph.EP astro-ph.GA astro-ph.IM cond-mat.mtrl-sci
1 1 2 8 2
cond-mat.str-el cs.AI cs.CG cs.CL cs.CR
1 9 1 2 1
cs.CV cs.CY cs.DB cs.DC cs.DL
2 25 11 5 3
cs.DS cs.ET cs.GT cs.HC cs.IR
4 1 3 3 3
cs.IT cs.LG cs.NA cs.NE cs.NI
2 13 1 2 1
cs.OH cs.PL cs.RO cs.SE cs.SI
2 2 1 2 15
gr-qc math.AT math.CO math.HO math.OC
1 2 1 1 4
math.PR math.ST physics.chem-ph physics.comp-ph physics.data-an
3 8 1 1 2
physics.ed-ph physics.geo-ph physics.soc-ph q-bio.GN q-bio.PE
1 1 16 1 1
q-fin.EC q-fin.GN q-fin.ST stat.AP stat.CO
1 1 1 7 4
stat.ME stat.ML stat.OT
2 4 7
# Collapse each primary category to its top-level field and count papers per
# field, smallest first. The field is the leading run of lower-case letters
# and hyphens, i.e. everything before the first "." (for example
# "astro-ph.IM" becomes "astro-ph").
# The character class is [a-z-]: inside brackets a comma is a literal
# character, not a separator, so it does not belong in the pattern.
DataSciencePapers %>%
  mutate(field = str_extract(primary_category, "^[a-z-]+")) %>%
  tally(x = ~field) %>%
  sort()
field
gr-qc q-bio cond-mat q-fin astro-ph math physics stat cs
1 2 3 3 12 19 22 24 114
Now use the tm package to convert the abstracts in the data frame into a corpus.
library(tm)
# Build a VCorpus with one document per abstract.
Corpus <- VCorpus(VectorSource(DataSciencePapers$abstract))
# Show the text of the first document, wrapped for display.
strwrap(as.character(Corpus[[1]]))
[1] "Astronomy is one of the most data-intensive of the sciences. Data technology is"
[2] "accelerating the quality and effectiveness of its research, and the rate of astronomical"
[3] "discovery is higher than ever. As a result, many view astronomy as being in a 'Golden"
[4] "Age', and projects such as the Virtual Observatory are amongst the most ambitious data"
[5] "projects in any field of science. But these powerful tools will be impotent unless the"
[6] "data on which they operate are of matching quality. Astronomy, like other fields of"
[7] "science, therefore needs to establish and agree on a set of guiding principles for the"
[8] "management of astronomical data. To focus this process, we are constructing a 'data"
[9] "manifesto', which proposes guidelines to maximise the rate and cost-effectiveness of"
[10] "scientific discovery."
# Clean every document, one transformation per step. The steps run in this
# exact order: collapse whitespace, drop digits, drop punctuation, lower-case,
# then remove English stop words.
# NOTE(review): punctuation is stripped before stop words are removed, so
# stop words that contain an apostrophe presumably no longer match the list.
# Confirm whether that matters before reordering, since the printed results
# further down depend on this order.
Corpus <- tm_map(Corpus, stripWhitespace)
Corpus <- tm_map(Corpus, removeNumbers)
Corpus <- tm_map(Corpus, removePunctuation)
Corpus <- tm_map(Corpus, content_transformer(tolower))
Corpus <- tm_map(Corpus, removeWords, stopwords("english"))
# Show the cleaned text of the first document, wrapped for display.
Corpus[[1]] %>%
  as.character() %>%
  strwrap()
[1] "astronomy one dataintensive sciences data technology accelerating quality effectiveness"
[2] "research rate astronomical discovery higher ever result many view astronomy golden age"
[3] "projects virtual observatory amongst ambitious data projects field science powerful"
[4] "tools will impotent unless data operate matching quality astronomy like fields science"
[5] "therefore needs establish agree set guiding principles management astronomical data"
[6] "focus process constructing data manifesto proposes guidelines maximise rate"
[7] "costeffectiveness scientific discovery"
Now use the wordcloud package to visualize the data. Do you see Data Science?
library(wordcloud)
# Draw a word cloud of at most 30 words from the cleaned corpus, with colours
# picked at random from a 30-colour topo palette.
wordcloud(
  Corpus,
  max.words = 30,
  scale = c(8, 1),
  colors = topo.colors(n = 30),
  random.color = TRUE
)
Create a Document Term Matrix using tf-idf.
# Build the document-term matrix, weighting each entry by tf-idf rather than
# by raw term counts, and print its summary.
DTM <- DocumentTermMatrix(
  Corpus,
  control = list(weighting = weightTfIdf)
)
print(DTM)
<<DocumentTermMatrix (documents: 200, terms: 5198)>>
Non-/sparse entries: 16015/1023585
Sparsity : 98%
Maximal term length: 29
Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
# List the terms that meet the lower bound of 0.8 passed as lowfreq.
DTM %>% findFreqTerms(lowfreq = 0.8)
[1] "algorithm" "algorithms" "also" "analysis" "applications"
[6] "approach" "approaches" "article" "available" "based"
[11] "big" "can" "challenges" "classification" "complex"
[16] "computing" "course" "data" "dataset" "datasets"
[21] "describe" "design" "different" "discovery" "discuss"
[26] "features" "find" "first" "framework" "graph"
[31] "historical" "however" "hypergraphs" "identify" "important"
[36] "information" "knowledge" "language" "large" "learning"
[41] "machine" "management" "many" "matrix" "method"
[46] "methods" "model" "models" "modern" "network"
[51] "networks" "new" "novel" "number" "one"
[56] "online" "optimization" "paper" "patterns" "people"
[61] "performance" "perspective" "present" "problem" "problems"
[66] "process" "properties" "provide" "provides" "representation"
[71] "research" "results" "review" "role" "science"
[76] "sciences" "scientific" "set" "sets" "show"
[81] "social" "statistical" "statistics" "structure" "students"
[86] "study" "support" "system" "systems" "techniques"
[91] "time" "tools" "traffic" "two" "use"
[96] "used" "users" "using" "various" "well"
[101] "wikipedia" "will" "work"
# Sum the weights in each term column of the dense matrix and show the nine
# terms with the largest totals.
head(
  sort(colSums(as.matrix(DTM)), decreasing = TRUE),
  9
)
big model research models science social learning statistical
1.889224 1.836078 1.701455 1.659216 1.639938 1.637129 1.601776 1.585974
information
1.574497
# Terms whose correlation with "statistics" across documents is at least 0.5.
DTM %>% findAssocs(terms = "statistics", corlimit = 0.5)
$statistics
courses undergraduate introductory students appearing irrelevant judge
0.64 0.63 0.56 0.55 0.53 0.53 0.53
somehow todays inspiration standalone
0.53 0.51 0.50 0.50
# Terms whose correlation with "mathematics" across documents is at least 0.5.
DTM %>% findAssocs(terms = "mathematics", corlimit = 0.5)
$mathematics
conceptual light historical perspective review role modern
0.99 0.98 0.95 0.95 0.94 0.84 0.81