This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).
URL <- "http://cox.csueastbay.edu/~esuess/classes/Statistics_6620/Presentations/ml6/sms_spam.csv"
download.file(URL, destfile = "./sms_spam.csv", method="curl")
# read the sms data into the sms data frame
sms_raw <- read.csv("sms_spam.csv", stringsAsFactors = FALSE)
# examine the structure of the sms data
str(sms_raw)
'data.frame': 5559 obs. of 2 variables:
$ type: chr "ham" "ham" "ham" "spam" ...
$ text: chr "Hope you are having a good week. Just checking in" "K..give back my thanks." "Am also doing in cbe only. But have to pay." "complimentary 4 STAR Ibiza Holiday or £10,000 cash needs your URGENT collection. 09066364349 NOW from Landline "| __truncated__ ...
# convert spam/ham to factor.
sms_raw$type <- factor(sms_raw$type)
# examine the type variable more carefully
str(sms_raw$type)
Factor w/ 2 levels "ham","spam": 1 1 1 2 2 1 1 1 2 1 ...
table(sms_raw$type)
ham spam
4812 747
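As a quick check of class balance, the same table can be expressed as proportions, roughly 86.6% ham to 13.4% spam (a minimal sketch using the counts above):
# proportion of ham vs. spam messages in the full dataset
prop.table(table(sms_raw$type))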
# build a corpus using the text mining (tm) package
library(tm)
sms_corpus <- VCorpus(VectorSource(sms_raw$text))
# examine the sms corpus
print(sms_corpus)
<<VCorpus>>
Metadata: corpus specific: 0, document level (indexed): 0
Content: documents: 5559
inspect(sms_corpus[1:2])
<<VCorpus>>
Metadata: corpus specific: 0, document level (indexed): 0
Content: documents: 2
[[1]]
<<PlainTextDocument>>
Metadata: 7
Content: chars: 49
[[2]]
<<PlainTextDocument>>
Metadata: 7
Content: chars: 23
as.character(sms_corpus[[1]])
[1] "Hope you are having a good week. Just checking in"
lapply(sms_corpus[1:2], as.character)
$`1`
[1] "Hope you are having a good week. Just checking in"
$`2`
[1] "K..give back my thanks."
# clean up the corpus using tm_map()
sms_corpus_clean <- tm_map(sms_corpus, content_transformer(tolower))
# show the difference between sms_corpus and sms_corpus_clean
as.character(sms_corpus[[1]])
[1] "Hope you are having a good week. Just checking in"
as.character(sms_corpus_clean[[1]])
[1] "hope you are having a good week. just checking in"
sms_corpus_clean <- tm_map(sms_corpus_clean, removeNumbers) # remove numbers
sms_corpus_clean <- tm_map(sms_corpus_clean, removeWords, stopwords()) # remove stop words
sms_corpus_clean <- tm_map(sms_corpus_clean, removePunctuation) # remove punctuation
# tip: create a custom function to replace (rather than remove) punctuation
removePunctuation("hello...world")
[1] "helloworld"
replacePunctuation <- function(x) { gsub("[[:punct:]]+", " ", x) }
replacePunctuation("hello...world")
[1] "hello world"
# illustration of word stemming
library(SnowballC)
wordStem(c("learn", "learned", "learning", "learns"))
[1] "learn" "learn" "learn" "learn"
sms_corpus_clean <- tm_map(sms_corpus_clean, stemDocument)
sms_corpus_clean <- tm_map(sms_corpus_clean, stripWhitespace) # eliminate unneeded whitespace
# examine the final clean corpus (shown against the original for comparison)
lapply(sms_corpus[1:3], as.character)
$`1`
[1] "Hope you are having a good week. Just checking in"
$`2`
[1] "K..give back my thanks."
$`3`
[1] "Am also doing in cbe only. But have to pay."
lapply(sms_corpus_clean[1:3], as.character)
$`1`
[1] "hope good week just check"
$`2`
[1] "kgive back thank"
$`3`
[1] "also cbe pay"
# create a document-term sparse matrix
sms_dtm <- DocumentTermMatrix(sms_corpus_clean)
# alternative solution: create a document-term sparse matrix directly from the SMS corpus
sms_dtm2 <- DocumentTermMatrix(sms_corpus, control = list(
tolower = TRUE,
removeNumbers = TRUE,
stopwords = TRUE,
removePunctuation = TRUE,
stemming = TRUE
))
# alternative solution: using a custom stop words function ensures an identical result
sms_dtm3 <- DocumentTermMatrix(sms_corpus, control = list(
tolower = TRUE,
removeNumbers = TRUE,
stopwords = function(x) { removeWords(x, stopwords()) },
removePunctuation = TRUE,
stemming = TRUE
))
# compare the result
sms_dtm
<<DocumentTermMatrix (documents: 5559, terms: 6559)>>
Non-/sparse entries: 42147/36419334
Sparsity : 100%
Maximal term length: 40
Weighting : term frequency (tf)
sms_dtm2
<<DocumentTermMatrix (documents: 5559, terms: 6961)>>
Non-/sparse entries: 43221/38652978
Sparsity : 100%
Maximal term length: 40
Weighting : term frequency (tf)
sms_dtm3
<<DocumentTermMatrix (documents: 5559, terms: 6559)>>
Non-/sparse entries: 42147/36419334
Sparsity : 100%
Maximal term length: 40
Weighting : term frequency (tf)
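Since sms_dtm and sms_dtm3 report the same dimensions, entries, and sparsity, it is easy to confirm that the two routes really agree; a quick sanity check, not part of the original walkthrough:
# both should be TRUE if the cleaned-corpus and control-list approaches match
identical(dim(sms_dtm), dim(sms_dtm3))
identical(Terms(sms_dtm), Terms(sms_dtm3))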
# create training and test datasets
sms_dtm_train <- sms_dtm[1:4169, ]
sms_dtm_test <- sms_dtm[4170:5559, ]
# also save the labels
sms_train_labels <- sms_raw[1:4169, ]$type
sms_test_labels <- sms_raw[4170:5559, ]$type
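This sequential split works here because the messages in sms_spam.csv happen to be in random order; for data with any sorting or grouping, a randomized split would be safer. A minimal sketch of such a split (the *_2 objects are hypothetical and not used below):
set.seed(123)                                   # make the split reproducible
train_index <- sample(nrow(sms_dtm), floor(0.75 * nrow(sms_dtm)))
test_index  <- setdiff(seq_len(nrow(sms_dtm)), train_index)
sms_dtm_train2    <- sms_dtm[train_index, ]
sms_dtm_test2     <- sms_dtm[test_index, ]
sms_train_labels2 <- sms_raw$type[train_index]
sms_test_labels2  <- sms_raw$type[test_index]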
# check that the proportion of spam is similar in the training and test sets
prop.table(table(sms_train_labels))
sms_train_labels
ham spam
0.8647158 0.1352842
prop.table(table(sms_test_labels))
sms_test_labels
ham spam
0.8683453 0.1316547
# word cloud visualization
library(wordcloud)
wordcloud(sms_corpus_clean, min.freq = 50, random.order = FALSE)
# subset the raw data into spam and ham groups
spam <- subset(sms_raw, type == "spam")
ham <- subset(sms_raw, type == "ham")
wordcloud(spam$text, max.words = 40, scale = c(3, 0.5))
Warning: transformation drops documents
wordcloud(ham$text, max.words = 40, scale = c(3, 0.5))
Warning: transformation drops documents
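To make the two clouds easier to compare, a shared palette and a fixed random seed could be used; a minimal sketch assuming the RColorBrewer package is available:
library(RColorBrewer)
set.seed(1234)   # word placement is randomized, so fix the seed for reproducibility
wordcloud(spam$text, max.words = 40, scale = c(3, 0.5), colors = brewer.pal(8, "Dark2"))
wordcloud(ham$text, max.words = 40, scale = c(3, 0.5), colors = brewer.pal(8, "Dark2"))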
sms_dtm_freq_train <- removeSparseTerms(sms_dtm_train, 0.999)
sms_dtm_freq_train
<<DocumentTermMatrix (documents: 4169, terms: 1104)>>
Non-/sparse entries: 24827/4577749
Sparsity : 99%
Maximal term length: 19
Weighting : term frequency (tf)
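The sparse = 0.999 argument keeps only terms that appear in more than 0.1% of the 4,169 training documents, i.e., in at least 5 of them, which is why the vocabulary shrinks from 6,559 to 1,104 terms. The findFreqTerms() call below applies a related filter, but on total term frequency rather than document frequency, so its count (1,139 terms) is close but not identical.
# a term must appear in more than this many training documents to be kept
4169 * (1 - 0.999)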
# indicator features for frequent words
findFreqTerms(sms_dtm_train, 5)
[1] "£wk" "€˜m" "€˜s" "abiola" "abl"
[6] "abt" "accept" "access" "account" "across"
[11] "act" "activ" "actual" "add" "address"
[16] "admir" "adult" "advanc" "aft" "afternoon"
[21] "age" "ago" "aha" "ahead" "aight"
[26] "aint" "air" "aiyo" "alex" "almost"
[31] "alon" "alreadi" "alright" "also" "alway"
[36] "angri" "announc" "anoth" "answer" "anymor"
[41] "anyon" "anyth" "anytim" "anyway" "apart"
[46] "app" "appli" "appreci" "arcad" "ard"
[51] "area" "argu" "argument" "armand" "around"
[56] "arrang" "arriv" "asap" "ask" "askd"
[61] "attempt" "auction" "avail" "ave" "avoid"
[66] "await" "awak" "award" "away" "awesom"
[71] "babe" "babi" "back" "bad" "bag"
[76] "bank" "bare" "basic" "bath" "batteri"
[81] "bcoz" "bday" "beauti" "becom" "bed"
[86] "bedroom" "beer" "begin" "believ" "best"
[91] "better" "bid" "big" "bill" "bird"
[96] "birthday" "bit" "black" "blank" "bless"
[101] "blue" "bluetooth" "bold" "bonus" "boo"
[106] "book" "boost" "bore" "boss" "bother"
[111] "bout" "box" "boy" "boytoy" "break"
[116] "breath" "bring" "brother" "bslvyl" "btnationalr"
[121] "buck" "bus" "busi" "buy" "cabin"
[126] "call" "caller" "callertun" "camcord" "came"
[131] "camera" "campus" "can" "cancel" "cancer"
[136] "cant" "car" "card" "care" "carlo"
[141] "case" "cash" "cashbal" "catch" "caus"
[146] "celebr" "cell" "centr" "chanc" "chang"
[151] "charg" "chat" "cheap" "cheaper" "check"
[156] "cheer" "chennai" "chikku" "childish" "children"
[161] "choic" "choos" "christma" "claim" "class"
[166] "clean" "clear" "close" "club" "code"
[171] "coffe" "cold" "colleagu" "collect" "colleg"
[176] "colour" "come" "comin" "comp" "compani"
[181] "competit" "complet" "complimentari" "comput" "condit"
[186] "confirm" "congrat" "congratul" "connect" "contact"
[191] "content" "contract" "cook" "cool" "copi"
[196] "correct" "cos" "cost" "cost£pm" "costa"
[201] "coupl" "cours" "cover" "coz" "crave"
[206] "crazi" "creat" "credit" "cri" "cross"
[211] "cuddl" "cum" "cup" "current" "custcar"
[216] "custom" "cut" "cute" "cuz" "dad"
[221] "daddi" "darl" "darlin" "darren" "dat"
[226] "date" "day" "dead" "deal" "dear"
[231] "decid" "decim" "decis" "deep" "definit"
[236] "del" "deliv" "deliveri" "den" "depend"
[241] "detail" "didnt" "die" "diet" "differ"
[246] "difficult" "digit" "din" "dinner" "direct"
[251] "dis" "discount" "discuss" "disturb" "dnt"
[256] "doc" "doctor" "doesnt" "dog" "doin"
[261] "don" "done" "dont" "door" "doubl"
[266] "download" "draw" "dream" "drink" "drive"
[271] "drop" "drug" "dude" "due" "dun"
[276] "dunno" "dvd" "earli" "earlier" "earth"
[281] "easi" "eat" "eatin" "egg" "either"
[286] "els" "email" "embarass" "end" "energi"
[291] "england" "enjoy" "enough" "enter" "entitl"
[296] "entri" "envelop" "etc" "euro" "eve"
[301] "even" "ever" "everi" "everybodi" "everyon"
[306] "everyth" "exact" "exam" "excel" "excit"
[311] "excus" "expect" "experi" "expir" "extra"
[316] "eye" "face" "facebook" "fact" "fall"
[321] "famili" "fanci" "fantasi" "fantast" "far"
[326] "fast" "fat" "father" "fault" "feb"
[331] "feel" "felt" "fetch" "fight" "figur"
[336] "file" "fill" "film" "final" "find"
[341] "fine" "finger" "finish" "first" "fix"
[346] "flag" "flat" "flight" "flower" "follow"
[351] "fone" "food" "forev" "forget" "forgot"
[356] "forward" "found" "freak" "free" "freemsg"
[361] "freephon" "fren" "fri" "friday" "friend"
[366] "friendship" "frm" "frnd" "frnds" "full"
[371] "fullonsmscom" "fun" "funni" "futur" "gal"
[376] "game" "gap" "gas" "gave" "gay"
[381] "gentl" "get" "gettin" "gift" "girl"
[386] "girlfrnd" "give" "glad" "god" "goe"
[391] "goin" "gone" "gonna" "good" "goodmorn"
[396] "goodnight" "got" "goto" "gotta" "great"
[401] "grin" "guarante" "gud" "guess" "guy"
[406] "gym" "haf" "haha" "hai" "hair"
[411] "half" "hand" "handset" "hang" "happen"
[416] "happi" "hard" "hate" "hav" "havent"
[421] "head" "hear" "heard" "heart" "heavi"
[426] "hee" "hell" "hello" "help" "hey"
[431] "hgsuiteland" "hit" "hiya" "hmm" "hmmm"
[436] "hmv" "hol" "hold" "holder" "holiday"
[441] "home" "hook" "hop" "hope" "horni"
[446] "hospit" "hot" "hotel" "hour" "hous"
[451] "how" "howev" "howz" "hrs" "httpwwwurawinnercom"
[456] "hug" "huh" "hungri" "hurri" "hurt"
[461] "ice" "idea" "identifi" "ignor" "ill"
[466] "immedi" "import" "inc" "includ" "india"
[471] "info" "inform" "insid" "instead" "interest"
[476] "invit" "ipod" "irrit" "ish" "island"
[481] "issu" "ive" "izzit" "januari" "jay"
[486] "job" "john" "join" "joke" "joy"
[491] "jst" "jus" "just" "juz" "kate"
[496] "keep" "kept" "kick" "kid" "kill"
[501] "kind" "kinda" "king" "kiss" "knew"
[506] "know" "knw" "ladi" "land" "landlin"
[511] "laptop" "lar" "last" "late" "later"
[516] "latest" "laugh" "lazi" "ldn" "lead"
[521] "learn" "least" "leav" "lect" "left"
[526] "leh" "lei" "less" "lesson" "let"
[531] "letter" "liao" "librari" "lie" "life"
[536] "lift" "light" "like" "line" "link"
[541] "list" "listen" "littl" "live" "lmao"
[546] "load" "loan" "local" "locat" "log"
[551] "lol" "london" "long" "longer" "look"
[556] "lookin" "lor" "lose" "lost" "lot"
[561] "lovabl" "love" "lover" "loyalti" "ltd"
[566] "luck" "lucki" "lunch" "luv" "mad"
[571] "made" "mah" "mail" "make" "malaria"
[576] "man" "mani" "march" "mark" "marri"
[581] "match" "mate" "matter" "maxim" "maxmin"
[586] "may" "mayb" "meal" "mean" "meant"
[591] "med" "medic" "meet" "meetin" "meh"
[596] "member" "men" "merri" "messag" "met"
[601] "mid" "midnight" "might" "min" "mind"
[606] "mine" "minut" "miracl" "miss" "mistak"
[611] "moan" "mob" "mobil" "mobileupd" "mode"
[616] "mom" "moment" "mon" "monday" "money"
[621] "month" "morn" "mother" "motorola" "move"
[626] "movi" "mrng" "mrt" "mrw" "msg"
[631] "msgs" "mths" "much" "mum" "murder"
[636] "music" "must" "muz" "nah" "nake"
[641] "name" "nation" "natur" "naughti" "near"
[646] "need" "net" "network" "neva" "never"
[651] "new" "news" "next" "nice" "nigeria"
[656] "night" "nite" "nobodi" "noe" "nokia"
[661] "noon" "nope" "normal" "normpton" "noth"
[666] "notic" "now" "num" "number" "nyt"
[671] "obvious" "offer" "offic" "offici" "okay"
[676] "oki" "old" "omg" "one" "onlin"
[681] "onto" "oop" "open" "oper" "opinion"
[686] "opt" "optout" "orang" "orchard" "order"
[691] "oredi" "oso" "other" "otherwis" "outsid"
[696] "pack" "page" "paid" "pain" "paper"
[701] "parent" "park" "part" "parti" "partner"
[706] "pass" "passion" "password" "past" "pay"
[711] "peopl" "per" "person" "pete" "phone"
[716] "photo" "pic" "pick" "pictur" "pin"
[721] "piss" "pix" "pizza" "place" "plan"
[726] "play" "player" "pleas" "pleasur" "plenti"
[731] "pls" "plus" "plz" "pmin" "pmsg"
[736] "pobox" "point" "poli" "polic" "poor"
[741] "pop" "possess" "possibl" "post" "pound"
[746] "power" "ppm" "pray" "present" "press"
[751] "pretti" "previous" "price" "princess" "privat"
[756] "prize" "prob" "probabl" "problem" "project"
[761] "promis" "pub" "put" "qualiti" "question"
[766] "quick" "quit" "quiz" "quot" "rain"
[771] "random" "rang" "rate" "rather" "rcvd"
[776] "reach" "read" "readi" "real" "reali"
[781] "realli" "reason" "receipt" "receiv" "recent"
[786] "record" "refer" "regard" "regist" "relat"
[791] "relax" "remain" "rememb" "remind" "remov"
[796] "rent" "rental" "repli" "repres" "request"
[801] "respond" "respons" "rest" "result" "return"
[806] "reveal" "review" "reward" "right" "ring"
[811] "rington" "rite" "road" "rock" "role"
[816] "room" "roommat" "rose" "round" "rowwjhl"
[821] "rpli" "rreveal" "run" "rush" "sad"
[826] "sae" "safe" "said" "sale" "sat"
[831] "saturday" "savamob" "save" "saw" "say"
[836] "sch" "school" "scream" "sea" "search"
[841] "sec" "second" "secret" "see" "seem"
[846] "seen" "select" "self" "sell" "semest"
[851] "send" "sens" "sent" "serious" "servic"
[856] "set" "settl" "sex" "sexi" "shall"
[861] "share" "shd" "ship" "shirt" "shop"
[866] "short" "show" "shower" "sick" "side"
[871] "sigh" "sight" "sign" "silent" "simpl"
[876] "sinc" "singl" "sipix" "sir" "sis"
[881] "sister" "sit" "situat" "skxh" "skype"
[886] "slave" "sleep" "slept" "slow" "slowli"
[891] "small" "smile" "smoke" "sms" "smth"
[896] "snow" "sofa" "sol" "somebodi" "someon"
[901] "someth" "sometim" "somewher" "song" "soni"
[906] "sonyericsson" "soon" "sorri" "sort" "sound"
[911] "south" "space" "speak" "special" "specialcal"
[916] "spend" "spent" "spoke" "spree" "stand"
[921] "start" "statement" "station" "stay" "std"
[926] "step" "still" "stockport" "stone" "stop"
[931] "store" "stori" "street" "student" "studi"
[936] "stuff" "stupid" "style" "sub" "subscrib"
[941] "success" "suck" "suit" "summer" "sun"
[946] "sunday" "sunshin" "sup" "support" "suppos"
[951] "sure" "surf" "surpris" "sweet" "swing"
[956] "system" "take" "talk" "tampa" "tariff"
[961] "tcs" "tea" "teach" "tear" "teas"
[966] "tel" "tell" "ten" "tenerif" "term"
[971] "test" "text" "thank" "thanx" "that"
[976] "thing" "think" "thinkin" "thk" "tho"
[981] "though" "thought" "throw" "thru" "tht"
[986] "thur" "tick" "ticket" "til" "till"
[991] "time" "tire" "titl" "tmr" "toclaim"
[996] "today" "togeth" "told" "tomo" "tomorrow"
[ reached getOption("max.print") -- omitted 139 entries ]
# save frequently-appearing terms to a character vector
sms_freq_words <- findFreqTerms(sms_dtm_train, 5)
str(sms_freq_words)
chr [1:1139] "£wk" "€˜m" "€˜s" "abiola" "abl" "abt" "accept" "access" "account" "across" "act" "activ" "actual" "add" ...
# create DTMs with only the frequent terms
sms_dtm_freq_train <- sms_dtm_train[ , sms_freq_words]
sms_dtm_freq_test <- sms_dtm_test[ , sms_freq_words]
# convert counts to Yes/No categorical values
convert_counts <- function(x) {
  ifelse(x > 0, "Yes", "No")
}
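A quick illustration of what the helper returns, using a made-up count vector:
convert_counts(c(0, 1, 3))   # "No" "Yes" "Yes"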
# apply convert_counts() to the columns of the train/test data
sms_train <- apply(sms_dtm_freq_train, MARGIN = 2, convert_counts)
sms_test <- apply(sms_dtm_freq_test, MARGIN = 2, convert_counts)
library(e1071)
sms_classifier <- naiveBayes(sms_train, sms_train_labels)
sms_test_pred <- predict(sms_classifier, sms_test)
head(sms_test_pred)
[1] ham ham ham ham spam ham
Levels: ham spam
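Besides class labels, predict() on an e1071 naiveBayes model can return the underlying posterior probabilities with type = "raw"; a minimal sketch (sms_test_prob is introduced here only for illustration):
# each row holds the estimated P(ham) and P(spam) for one test message
sms_test_prob <- predict(sms_classifier, sms_test, type = "raw")
head(sms_test_prob)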
library(gmodels)
CrossTable(sms_test_pred, sms_test_labels,
prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
dnn = c('predicted', 'actual'))
Cell Contents
|-------------------------|
| N |
| N / Col Total |
|-------------------------|
Total Observations in Table: 1390
| actual
predicted | ham | spam | Row Total |
-------------|-----------|-----------|-----------|
ham | 1201 | 30 | 1231 |
| 0.995 | 0.164 | |
-------------|-----------|-----------|-----------|
spam | 6 | 153 | 159 |
| 0.005 | 0.836 | |
-------------|-----------|-----------|-----------|
Column Total | 1207 | 183 | 1390 |
| 0.868 | 0.132 | |
-------------|-----------|-----------|-----------|
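Reading the table: 6 legitimate messages were incorrectly flagged as spam (false positives) and 30 spam messages slipped through as ham, for 36 errors out of 1,390 test messages, an error rate of about 2.6% (accuracy about 97.4%). The same accuracy figure can be computed directly:
# overall test-set accuracy of the first classifier
mean(sms_test_pred == sms_test_labels)   # about 0.974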
sms_classifier2 <- naiveBayes(sms_train, sms_train_labels, laplace = 1)
sms_test_pred2 <- predict(sms_classifier2, sms_test)
CrossTable(sms_test_pred2, sms_test_labels,
prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
dnn = c('predicted', 'actual'))
Cell Contents
|-------------------------|
| N |
| N / Col Total |
|-------------------------|
Total Observations in Table: 1390
| actual
predicted | ham | spam | Row Total |
-------------|-----------|-----------|-----------|
ham | 1202 | 28 | 1230 |
| 0.996 | 0.153 | |
-------------|-----------|-----------|-----------|
spam | 5 | 155 | 160 |
| 0.004 | 0.847 | |
-------------|-----------|-----------|-----------|
Column Total | 1207 | 183 | 1390 |
| 0.868 | 0.132 | |
-------------|-----------|-----------|-----------|
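Adding laplace = 1 reduces the errors from 36 to 33 (28 missed spam plus 5 false positives), an error rate of about 2.4% and accuracy of about 97.6%; in particular, one more legitimate message is kept out of the spam folder. A direct side-by-side check of the two models:
# test-set accuracy without and with Laplace smoothing
mean(sms_test_pred  == sms_test_labels)   # about 0.974 (laplace = 0)
mean(sms_test_pred2 == sms_test_labels)   # about 0.976 (laplace = 1)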