aRxiv

This is R code from Modern Data Science with R, Chapter 15 Text as data.

In Section 15.2 Analyzing textual data there is an example where research papers related to Data Science are downloaded from arXiv (using the aRxiv package) and summarized.

library(tidyverse)
library(mdsr)
library(aRxiv)

# Search arXiv for the quoted query "Data Science"; limit = 200 caps the
# number of records requested.
DataSciencePapers <- arxiv_search(
  query = '"Data Science"',
  limit = 200
)

retrieved batch 1
retrieved batch 2

# Preview the first rows of the search results.
DataSciencePapers %>%
  head()

library(lubridate)
# Parse the submitted and updated columns with ymd_hms() so they become
# date-times, then print a compact overview of every column.
DataSciencePapers <- DataSciencePapers %>%
  mutate(
    submitted = ymd_hms(submitted),
    updated = ymd_hms(updated)
  )
glimpse(DataSciencePapers)

Observations: 200
Variables: 15
$ id               <chr> "astro-ph/0701361v1", "0901.2805v1", "0901.3118v2", "0909.3895v1", "1...
$ submitted        <dttm> 2007-01-12 03:28:11, 2009-01-19 10:38:33, 2009-01-20 18:48:59, 2009-...
$ updated          <dttm> 2007-01-12 03:28:11, 2009-01-19 10:38:33, 2009-01-24 19:23:47, 2009-...
$ title            <chr> "How to Make the Dream Come True: The Astronomers' Data Manifesto", "...
$ abstract         <chr> "  Astronomy is one of the most data-intensive of the sciences. Data ...
$ authors          <chr> "Ray P Norris", "Heinz Andernach", "O. V. Verkhodanov|S. A. Trushkin|...
$ affiliations     <chr> "", "", "Special Astrophysical Observatory, Nizhnij Arkhyz, Karachaj-...
$ link_abstract    <chr> "http://arxiv.org/abs/astro-ph/0701361v1", "http://arxiv.org/abs/0901...
$ link_pdf         <chr> "http://arxiv.org/pdf/astro-ph/0701361v1", "http://arxiv.org/pdf/0901...
$ link_doi         <chr> "", "http://dx.doi.org/10.2481/dsj.8.41", "http://dx.doi.org/10.2481/...
$ comment          <chr> "Submitted to Data Science Journal Presented at CODATA, Beijing,\n  O...
$ journal_ref      <chr> "", "", "", "", "EPJ Data Science, 1:9, 2012", "", "EPJ Data Science ...
$ doi              <chr> "", "10.2481/dsj.8.41", "10.2481/dsj.8.34", "", "10.1140/epjds9", "10...
$ primary_category <chr> "astro-ph", "astro-ph.IM", "astro-ph.IM", "astro-ph.IM", "cs.SI", "as...
$ categories       <chr> "astro-ph", "astro-ph.IM|astro-ph.CO", "astro-ph.IM|astro-ph.CO", "as...

# Tabulate the papers by the year in which they were submitted.
tally(
  ~ year(submitted),
  data = DataSciencePapers
)

year(submitted)
2007 2009 2011 2012 2013 2014 2015 2016 2017 
   1    3    3    7   12   25   52   89    8

# Show every column of the papers submitted in 2007.
DataSciencePapers %>%
  filter(year(submitted) == 2007) %>%
  glimpse()

Observations: 1
Variables: 15
$ id               <chr> "astro-ph/0701361v1"
$ submitted        <dttm> 2007-01-12 03:28:11
$ updated          <dttm> 2007-01-12 03:28:11
$ title            <chr> "How to Make the Dream Come True: The Astronomers' Data Manifesto"
$ abstract         <chr> "  Astronomy is one of the most data-intensive of the sciences. Data ...
$ authors          <chr> "Ray P Norris"
$ affiliations     <chr> ""
$ link_abstract    <chr> "http://arxiv.org/abs/astro-ph/0701361v1"
$ link_pdf         <chr> "http://arxiv.org/pdf/astro-ph/0701361v1"
$ link_doi         <chr> ""
$ comment          <chr> "Submitted to Data Science Journal Presented at CODATA, Beijing,\n  O...
$ journal_ref      <chr> ""
$ doi              <chr> ""
$ primary_category <chr> "astro-ph"
$ categories       <chr> "astro-ph"

# Tabulate the papers by their primary_category value.
tally(
  ~ primary_category,
  data = DataSciencePapers
)

primary_category
         astro-ph       astro-ph.EP       astro-ph.GA       astro-ph.IM cond-mat.mtrl-sci 
                1                 1                 2                 8                 2 
  cond-mat.str-el             cs.AI             cs.CG             cs.CL             cs.CR 
                1                 9                 1                 2                 1 
            cs.CV             cs.CY             cs.DB             cs.DC             cs.DL 
                2                25                11                 5                 3 
            cs.DS             cs.ET             cs.GT             cs.HC             cs.IR 
                4                 1                 3                 3                 3 
            cs.IT             cs.LG             cs.NA             cs.NE             cs.NI 
                2                13                 1                 2                 1 
            cs.OH             cs.PL             cs.RO             cs.SE             cs.SI 
                2                 2                 1                 2                15 
            gr-qc           math.AT           math.CO           math.HO           math.OC 
                1                 2                 1                 1                 4 
          math.PR           math.ST   physics.chem-ph   physics.comp-ph   physics.data-an 
                3                 8                 1                 1                 2 
    physics.ed-ph    physics.geo-ph    physics.soc-ph          q-bio.GN          q-bio.PE 
                1                 1                16                 1                 1 
         q-fin.EC          q-fin.GN          q-fin.ST           stat.AP           stat.CO 
                1                 1                 1                 7                 4 
          stat.ME           stat.ML           stat.OT 
                2                 4                 7

# Collapse each primary category to its top-level field: the leading run of
# lowercase letters and hyphens (everything before the first "."), e.g.
# "astro-ph.IM" -> "astro-ph". The character class lists only a-z and "-";
# a comma inside [...] would be a literal comma, not a separator.
# The counts per field are then sorted in increasing order.
DataSciencePapers %>%
  mutate(field = str_extract(primary_category, "^[a-z-]+")) %>%
  tally(x = ~field) %>%
  sort()

field
   gr-qc    q-bio cond-mat    q-fin astro-ph     math  physics     stat       cs 
       1        2        3        3       12       19       22       24      114

Now use the tm package to convert the data.frame to a corpus.

library(tm)
# Build a VCorpus from the abstract column (each abstract is one document).
Corpus <- VCorpus(VectorSource(DataSciencePapers[["abstract"]]))
# Show the text of the first document, re-wrapped for display.
strwrap(as.character(Corpus[[1]]))

 [1] "Astronomy is one of the most data-intensive of the sciences. Data technology is"         
 [2] "accelerating the quality and effectiveness of its research, and the rate of astronomical"
 [3] "discovery is higher than ever. As a result, many view astronomy as being in a 'Golden"   
 [4] "Age', and projects such as the Virtual Observatory are amongst the most ambitious data"  
 [5] "projects in any field of science. But these powerful tools will be impotent unless the"  
 [6] "data on which they operate are of matching quality. Astronomy, like other fields of"     
 [7] "science, therefore needs to establish and agree on a set of guiding principles for the"  
 [8] "management of astronomical data. To focus this process, we are constructing a 'data"     
 [9] "manifesto', which proposes guidelines to maximise the rate and cost-effectiveness of"    
[10] "scientific discovery."

# Normalise the abstracts: drop digits and punctuation, lower-case the text,
# then remove English stop words.
# stripWhitespace runs last so that the gaps left behind by the removal steps
# are collapsed too; running it first leaves runs of spaces in the documents.
Corpus <- Corpus %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(stripWhitespace)
# Show the cleaned text of the first document, re-wrapped for display.
strwrap(as.character(Corpus[[1]]))

[1] "astronomy one dataintensive sciences data technology accelerating quality effectiveness"
[2] "research rate astronomical discovery higher ever result many view astronomy golden age" 
[3] "projects virtual observatory amongst ambitious data projects field science powerful"    
[4] "tools will impotent unless data operate matching quality astronomy like fields science" 
[5] "therefore needs establish agree set guiding principles management astronomical data"    
[6] "focus process constructing data manifesto proposes guidelines maximise rate"            
[7] "costeffectiveness scientific discovery"

Now use the wordcloud package to visualize the data. Do you see Data Science?

library(wordcloud)
# Draw a word cloud of the corpus with at most 30 words; colours are picked
# at random from a 30-step topo.colors palette.
wordcloud(
  Corpus,
  max.words = 30,
  scale = c(8, 1),
  colors = topo.colors(n = 30),
  random.color = TRUE
)

Create a Document Term Matrix using tf-idf.

# Build the document-term matrix with tf-idf weighting (weightTfIdf), then
# print its summary.
DTM <- DocumentTermMatrix(
  Corpus,
  control = list(weighting = weightTfIdf)
)
DTM

<<DocumentTermMatrix (documents: 200, terms: 5198)>>
Non-/sparse entries: 16015/1023585
Sparsity           : 98%
Maximal term length: 29
Weighting          : term frequency - inverse document frequency (normalized) (tf-idf)

# List the terms that clear the lower bound of 0.8 (DTM holds tf-idf weights,
# not raw counts).
findFreqTerms(
  DTM,
  lowfreq = 0.8
)

  [1] "algorithm"      "algorithms"     "also"           "analysis"       "applications"  
  [6] "approach"       "approaches"     "article"        "available"      "based"         
 [11] "big"            "can"            "challenges"     "classification" "complex"       
 [16] "computing"      "course"         "data"           "dataset"        "datasets"      
 [21] "describe"       "design"         "different"      "discovery"      "discuss"       
 [26] "features"       "find"           "first"          "framework"      "graph"         
 [31] "historical"     "however"        "hypergraphs"    "identify"       "important"     
 [36] "information"    "knowledge"      "language"       "large"          "learning"      
 [41] "machine"        "management"     "many"           "matrix"         "method"        
 [46] "methods"        "model"          "models"         "modern"         "network"       
 [51] "networks"       "new"            "novel"          "number"         "one"           
 [56] "online"         "optimization"   "paper"          "patterns"       "people"        
 [61] "performance"    "perspective"    "present"        "problem"        "problems"      
 [66] "process"        "properties"     "provide"        "provides"       "representation"
 [71] "research"       "results"        "review"         "role"           "science"       
 [76] "sciences"       "scientific"     "set"            "sets"           "show"          
 [81] "social"         "statistical"    "statistics"     "structure"      "students"      
 [86] "study"          "support"        "system"         "systems"        "techniques"    
 [91] "time"           "tools"          "traffic"        "two"            "use"           
 [96] "used"           "users"          "using"          "various"        "well"          
[101] "wikipedia"      "will"           "work"

# Sum each term's tf-idf weight over all documents (one column per term) and
# show the nine largest totals.
DTM %>%
  as.matrix() %>%
  colSums() %>%
  sort(decreasing = TRUE) %>%
  head(9)

        big       model    research      models     science      social    learning statistical 
   1.889224    1.836078    1.701455    1.659216    1.639938    1.637129    1.601776    1.585974 
information 
   1.574497

# Terms associated with "statistics" in DTM, using a correlation limit of 0.5.
findAssocs(
  DTM,
  terms = "statistics",
  corlimit = 0.5
)

$statistics
      courses undergraduate  introductory      students     appearing    irrelevant         judge 
         0.64          0.63          0.56          0.55          0.53          0.53          0.53 
      somehow        todays   inspiration    standalone 
         0.53          0.51          0.50          0.50

# Terms associated with "mathematics" in DTM, using a correlation limit of 0.5.
findAssocs(
  DTM,
  terms = "mathematics",
  corlimit = 0.5
)

$mathematics
 conceptual       light  historical perspective      review        role      modern 
       0.99        0.98        0.95        0.95        0.94        0.84        0.81

LS0tCnRpdGxlOiAiYVJ4aXYiCm91dHB1dDoKICBodG1sX25vdGVib29rOiBkZWZhdWx0CiAgcGRmX2RvY3VtZW50OiBkZWZhdWx0Ci0tLQoKVGhpcyBpcyBSIGNvZGUgZnJvbSBNb2Rlcm4gRGF0YSBTY2llbmNlIHdpdGggUiwgQ2hhcHRlciAxNSBUZXh0IGFzIGRhdGEuCgpJbiBTZWN0aW9uIDE1LjIgQW5hbHl6aW5nIHRleHR1YWwgZGF0YSB0aGVyZSBpcyBhbiBleGFtcGxlIHdoZXJlIHJlc2VhcmNoIHBhcGVycyByZWxhdGVkIHRvIERhdGEgU2NpZW5jZSBhcmUgZG93bmxvYWRlZCBmcm9tIGFSeGl2IGFuZCBzdW1tYXJpemVkLgoKYGBge3J9CmxpYnJhcnkodGlkeXZlcnNlKQpsaWJyYXJ5KG1kc3IpCmxpYnJhcnkoYVJ4aXYpCmBgYAoKYGBge3J9CkRhdGFTY2llbmNlUGFwZXJzIDwtIGFyeGl2X3NlYXJjaChxdWVyeSA9ICciRGF0YSBTY2llbmNlIicsIGxpbWl0ID0gMjAwKQpoZWFkKERhdGFTY2llbmNlUGFwZXJzKQpgYGAKCgpgYGB7cn0KbGlicmFyeShsdWJyaWRhdGUpIAoKRGF0YVNjaWVuY2VQYXBlcnMgPC0gRGF0YVNjaWVuY2VQYXBlcnMgJT4lCiAgbXV0YXRlKHN1Ym1pdHRlZCA9IHltZF9obXMoc3VibWl0dGVkKSwgdXBkYXRlZCA9IHltZF9obXModXBkYXRlZCkpIApnbGltcHNlKERhdGFTY2llbmNlUGFwZXJzKQoKYGBgCgpgYGB7cn0KdGFsbHkofiB5ZWFyKHN1Ym1pdHRlZCksIGRhdGEgPSBEYXRhU2NpZW5jZVBhcGVycykKYGBgCgoKYGBge3J9CkRhdGFTY2llbmNlUGFwZXJzICU+JSBmaWx0ZXIoeWVhcihzdWJtaXR0ZWQpID09IDIwMDcpICU+JSAKICBnbGltcHNlKCkKYGBgCgpgYGB7cn0KdGFsbHkofiBwcmltYXJ5X2NhdGVnb3J5LCBkYXRhID0gRGF0YVNjaWVuY2VQYXBlcnMpCmBgYAoKYGBge3J9CkRhdGFTY2llbmNlUGFwZXJzICU+JSBtdXRhdGUoZmllbGQgPSBzdHJfZXh0cmFjdChwcmltYXJ5X2NhdGVnb3J5LCAiXlthLXosLV0rIikpICU+JSAKICB0YWxseSh4ID0gfmZpZWxkKSAlPiUgCiAgc29ydCgpCgpgYGAKCk5vdyB1c2luZyB0aGUgKnRtKiBwYWNrYWdlIHRvIGNvdmVydCB0aGUgZGF0YS5mcmFtZSB0byBhIGNvcnB1cy4KCmBgYHtyfQpsaWJyYXJ5KHRtKSAKCkNvcnB1cyA8LSB3aXRoKERhdGFTY2llbmNlUGFwZXJzLCBWQ29ycHVzKFZlY3RvclNvdXJjZShhYnN0cmFjdCkpKSAKQ29ycHVzW1sxXV0gJT4lIGFzLmNoYXJhY3RlcigpICU+JSAKICBzdHJ3cmFwKCkKYGBgCgpgYGB7cn0KCkNvcnB1cyA8LSBDb3JwdXMgJT4lIHRtX21hcChzdHJpcFdoaXRlc3BhY2UpICU+JSAKICB0bV9tYXAocmVtb3ZlTnVtYmVycykgJT4lIAogIHRtX21hcChyZW1vdmVQdW5jdHVhdGlvbikgJT4lIAogIHRtX21hcChjb250ZW50X3RyYW5zZm9ybWVyKHRvbG93ZXIpKSAlPiUgCiAgdG1fbWFwKHJlbW92ZVdvcmRzLCBzdG9wd29yZHMoImVuZ2xpc2giKSkKc3Ryd3JhcChhcy5jaGFyYWN0ZXIoQ29ycHVzW1sxXV0pKQoKYGBgCgpOb3cgdXNpbmcgdGhlICp3b3JkY2xvdWQqIHBhY2thZ2UgdmlzdWFsaXplIHRoZSBkYXRh
LiAgRG8geW91IHNlZSBEYXRhIFNjaWVuY2U/CgpgYGB7cn0KbGlicmFyeSh3b3JkY2xvdWQpIAoKd29yZGNsb3VkKENvcnB1cywgbWF4LndvcmRzID0gMzAsIHNjYWxlID0gYyg4LCAxKSwgY29sb3JzID0gdG9wby5jb2xvcnMobiA9IDMwKSwgcmFuZG9tLmNvbG9yID0gVFJVRSkKCmBgYAoKQ3JlYXRlIGEgRG9jdW1lbnQgVGVybSBNYXRyaXggdXNpbmcgdGYtaWRmLgoKYGBge3J9CkRUTSA8LSBEb2N1bWVudFRlcm1NYXRyaXgoQ29ycHVzLCBjb250cm9sID0gbGlzdCh3ZWlnaHRpbmcgPSB3ZWlnaHRUZklkZikpIApEVE0KYGBgCgpgYGB7cn0KZmluZEZyZXFUZXJtcyhEVE0sIGxvd2ZyZXEgPSAwLjgpCmBgYAoKYGBge3J9CkRUTSAlPiUgYXMubWF0cml4KCkgJT4lIAogIGFwcGx5KE1BUkdJTiA9IDIsIHN1bSkgJT4lIAogIHNvcnQoZGVjcmVhc2luZyA9IFRSVUUpICU+JSAKICBoZWFkKDkpCgpgYGAKCgoKYGBge3J9CmZpbmRBc3NvY3MoRFRNLCB0ZXJtcyA9ICJzdGF0aXN0aWNzIiwgY29ybGltaXQgPSAwLjUpCmBgYAoKYGBge3J9CmZpbmRBc3NvY3MoRFRNLCB0ZXJtcyA9ICJtYXRoZW1hdGljcyIsIGNvcmxpbWl0ID0gMC41KQpgYGAKCgo=