This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.

Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).

Example: Filtering spam SMS messages

Step 1: Download the data

# Remote location of the SMS spam CSV; a copy is saved as sms_spam.csv in the
# working directory, with the transfer delegated to the external curl tool.
URL <- "http://www.sci.csueastbay.edu/~esuess/classes/Statistics_6620/Presentations/ml6/sms_spam.csv"
download.file(
  url = URL,
  destfile = "./sms_spam.csv",
  method = "curl"
)
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  3  469k    3 18534    0     0   129k      0  0:00:03 --:--:--  0:00:03  129k
100  469k  100  469k    0     0  1005k      0 --:--:-- --:--:-- --:--:-- 1005k

Step 2: Exploring and preparing the data

# read the sms data into the sms data frame
sms_raw <- read.csv("sms_spam.csv", stringsAsFactors = FALSE)
# examine the structure of the sms data
str(sms_raw)
'data.frame':   5559 obs. of  2 variables:
 $ type: chr  "ham" "ham" "ham" "spam" ...
 $ text: chr  "Hope you are having a good week. Just checking in" "K..give back my thanks." "Am also doing in cbe only. But have to pay." "complimentary 4 STAR Ibiza Holiday or £10,000 cash needs your URGENT collection. 09066364349 NOW from Landline "| __truncated__ ...
# convert spam/ham to factor.
sms_raw$type <- factor(sms_raw$type)
# examine the type variable more carefully
str(sms_raw$type)
 Factor w/ 2 levels "ham","spam": 1 1 1 2 2 1 1 1 2 1 ...
table(sms_raw$type)

 ham spam 
4812  747 
# build a corpus using the text mining (tm) package
library(tm)
sms_corpus <- VCorpus(VectorSource(sms_raw$text))
# examine the sms corpus
print(sms_corpus)
<<VCorpus>>
Metadata:  corpus specific: 0, document level (indexed): 0
Content:  documents: 5559
inspect(sms_corpus[1:2])
<<VCorpus>>
Metadata:  corpus specific: 0, document level (indexed): 0
Content:  documents: 2

[[1]]
<<PlainTextDocument>>
Metadata:  7
Content:  chars: 49

[[2]]
<<PlainTextDocument>>
Metadata:  7
Content:  chars: 23
as.character(sms_corpus[[1]])
[1] "Hope you are having a good week. Just checking in"
lapply(sms_corpus[1:2], as.character)
$`1`
[1] "Hope you are having a good week. Just checking in"

$`2`
[1] "K..give back my thanks."
# clean up the corpus using tm_map()
sms_corpus_clean <- tm_map(sms_corpus, content_transformer(tolower))
# show the difference between sms_corpus and sms_corpus_clean
as.character(sms_corpus[[1]])
[1] "Hope you are having a good week. Just checking in"
as.character(sms_corpus_clean[[1]])
[1] "hope you are having a good week. just checking in"
sms_corpus_clean <- tm_map(sms_corpus_clean, removeNumbers) # remove numbers
sms_corpus_clean <- tm_map(sms_corpus_clean, removeWords, stopwords()) # remove stop words
sms_corpus_clean <- tm_map(sms_corpus_clean, removePunctuation) # remove punctuation
# tip: create a custom function to replace (rather than remove) punctuation
removePunctuation("hello...world")
[1] "helloworld"
# Swap every run of consecutive punctuation characters for a single space,
# so the words on either side of the punctuation stay separated.
replacePunctuation <- function(x) {
  punct_run <- "[[:punct:]]+"
  gsub(pattern = punct_run, replacement = " ", x = x)
}
replacePunctuation("hello...world")
[1] "hello world"
# illustration of word stemming
library(SnowballC)
wordStem(c("learn", "learned", "learning", "learns"))
[1] "learn" "learn" "learn" "learn"
sms_corpus_clean <- tm_map(sms_corpus_clean, stemDocument)
sms_corpus_clean <- tm_map(sms_corpus_clean, stripWhitespace) # eliminate unneeded whitespace
# examine the final clean corpus
lapply(sms_corpus[1:3], as.character)
$`1`
[1] "Hope you are having a good week. Just checking in"

$`2`
[1] "K..give back my thanks."

$`3`
[1] "Am also doing in cbe only. But have to pay."
lapply(sms_corpus_clean[1:3], as.character)
$`1`
[1] "hope good week just check"

$`2`
[1] "kgive back thank"

$`3`
[1] "also cbe pay"
# create a document-term sparse matrix
sms_dtm <- DocumentTermMatrix(sms_corpus_clean)
# alternative solution: create a document-term sparse matrix directly from the SMS corpus
sms_dtm2 <- DocumentTermMatrix(sms_corpus, control = list(
  tolower = TRUE,
  removeNumbers = TRUE,
  stopwords = TRUE,
  removePunctuation = TRUE,
  stemming = TRUE
))
# alternative solution: using custom stop words function ensures identical result
sms_dtm3 <- DocumentTermMatrix(sms_corpus, control = list(
  tolower = TRUE,
  removeNumbers = TRUE,
  stopwords = function(x) { removeWords(x, stopwords()) },
  removePunctuation = TRUE,
  stemming = TRUE
))
# compare the result
sms_dtm
<<DocumentTermMatrix (documents: 5559, terms: 6559)>>
Non-/sparse entries: 42147/36419334
Sparsity           : 100%
Maximal term length: 40
Weighting          : term frequency (tf)
sms_dtm2
<<DocumentTermMatrix (documents: 5559, terms: 6961)>>
Non-/sparse entries: 43221/38652978
Sparsity           : 100%
Maximal term length: 40
Weighting          : term frequency (tf)
sms_dtm3
<<DocumentTermMatrix (documents: 5559, terms: 6559)>>
Non-/sparse entries: 42147/36419334
Sparsity           : 100%
Maximal term length: 40
Weighting          : term frequency (tf)
# split the document-term matrix by row position: the first 4169 messages
# (about 75% of the 5559) are for training, the remaining 1390 for testing
sms_dtm_train <- sms_dtm[seq_len(4169), ]
sms_dtm_test  <- sms_dtm[seq(4170, 5559), ]
# keep the matching class labels, taken from the raw data in the same row order
sms_train_labels <- sms_raw$type[seq_len(4169)]
sms_test_labels  <- sms_raw$type[seq(4170, 5559)]
# check that the proportion of spam is similar
prop.table(table(sms_train_labels))
sms_train_labels
      ham      spam 
0.8647158 0.1352842 
prop.table(table(sms_test_labels))
sms_test_labels
      ham      spam 
0.8683453 0.1316547 
# word cloud visualization
library(wordcloud)
wordcloud(sms_corpus_clean, min.freq = 50, random.order = FALSE)

# subset the full dataset (sms_raw) into spam and ham groups
spam <- subset(sms_raw, type == "spam")
ham  <- subset(sms_raw, type == "ham")
wordcloud(spam$text, max.words = 40, scale = c(3, 0.5))

wordcloud(ham$text, max.words = 40, scale = c(3, 0.5))

sms_dtm_freq_train <- removeSparseTerms(sms_dtm_train, 0.999)
sms_dtm_freq_train
<<DocumentTermMatrix (documents: 4169, terms: 1104)>>
Non-/sparse entries: 24827/4577749
Sparsity           : 99%
Maximal term length: 19
Weighting          : term frequency (tf)
# indicator features for frequent words
findFreqTerms(sms_dtm_train, 5)
   [1] "£wk"                 "€˜m"                 "€˜s"                 "abiola"             
   [5] "abl"                 "abt"                 "accept"              "access"             
   [9] "account"             "across"              "act"                 "activ"              
  [13] "actual"              "add"                 "address"             "admir"              
  [17] "adult"               "advanc"              "aft"                 "afternoon"          
  [21] "age"                 "ago"                 "aha"                 "ahead"              
  [25] "aight"               "aint"                "air"                 "aiyo"               
  [29] "alex"                "almost"              "alon"                "alreadi"            
  [33] "alright"             "also"                "alway"               "angri"              
  [37] "announc"             "anoth"               "answer"              "anymor"             
  [41] "anyon"               "anyth"               "anytim"              "anyway"             
  [45] "apart"               "app"                 "appli"               "appreci"            
  [49] "arcad"               "ard"                 "area"                "argu"               
  [53] "argument"            "armand"              "around"              "arrang"             
  [57] "arriv"               "asap"                "ask"                 "askd"               
  [61] "attempt"             "auction"             "avail"               "ave"                
  [65] "avoid"               "await"               "awak"                "award"              
  [69] "away"                "awesom"              "babe"                "babi"               
  [73] "back"                "bad"                 "bag"                 "bank"               
  [77] "bare"                "basic"               "bath"                "batteri"            
  [81] "bcoz"                "bday"                "beauti"              "becom"              
  [85] "bed"                 "bedroom"             "beer"                "begin"              
  [89] "believ"              "best"                "better"              "bid"                
  [93] "big"                 "bill"                "bird"                "birthday"           
  [97] "bit"                 "black"               "blank"               "bless"              
 [101] "blue"                "bluetooth"           "bold"                "bonus"              
 [105] "boo"                 "book"                "boost"               "bore"               
 [109] "boss"                "bother"              "bout"                "box"                
 [113] "boy"                 "boytoy"              "break"               "breath"             
 [117] "bring"               "brother"             "bslvyl"              "btnationalr"        
 [121] "buck"                "bus"                 "busi"                "buy"                
 [125] "cabin"               "call"                "caller"              "callertun"          
 [129] "camcord"             "came"                "camera"              "campus"             
 [133] "can"                 "cancel"              "cancer"              "cant"               
 [137] "car"                 "card"                "care"                "carlo"              
 [141] "case"                "cash"                "cashbal"             "catch"              
 [145] "caus"                "celebr"              "cell"                "centr"              
 [149] "chanc"               "chang"               "charg"               "chat"               
 [153] "cheap"               "cheaper"             "check"               "cheer"              
 [157] "chennai"             "chikku"              "childish"            "children"           
 [161] "choic"               "choos"               "christma"            "claim"              
 [165] "class"               "clean"               "clear"               "close"              
 [169] "club"                "code"                "coffe"               "cold"               
 [173] "colleagu"            "collect"             "colleg"              "colour"             
 [177] "come"                "comin"               "comp"                "compani"            
 [181] "competit"            "complet"             "complimentari"       "comput"             
 [185] "condit"              "confirm"             "congrat"             "congratul"          
 [189] "connect"             "contact"             "content"             "contract"           
 [193] "cook"                "cool"                "copi"                "correct"            
 [197] "cos"                 "cost"                "cost£pm"             "costa"              
 [201] "coupl"               "cours"               "cover"               "coz"                
 [205] "crave"               "crazi"               "creat"               "credit"             
 [209] "cri"                 "cross"               "cuddl"               "cum"                
 [213] "cup"                 "current"             "custcar"             "custom"             
 [217] "cut"                 "cute"                "cuz"                 "dad"                
 [221] "daddi"               "darl"                "darlin"              "darren"             
 [225] "dat"                 "date"                "day"                 "dead"               
 [229] "deal"                "dear"                "decid"               "decim"              
 [233] "decis"               "deep"                "definit"             "del"                
 [237] "deliv"               "deliveri"            "den"                 "depend"             
 [241] "detail"              "didnt"               "die"                 "diet"               
 [245] "differ"              "difficult"           "digit"               "din"                
 [249] "dinner"              "direct"              "dis"                 "discount"           
 [253] "discuss"             "disturb"             "dnt"                 "doc"                
 [257] "doctor"              "doesnt"              "dog"                 "doin"               
 [261] "don"                 "done"                "dont"                "door"               
 [265] "doubl"               "download"            "draw"                "dream"              
 [269] "drink"               "drive"               "drop"                "drug"               
 [273] "dude"                "due"                 "dun"                 "dunno"              
 [277] "dvd"                 "earli"               "earlier"             "earth"              
 [281] "easi"                "eat"                 "eatin"               "egg"                
 [285] "either"              "els"                 "email"               "embarass"           
 [289] "end"                 "energi"              "england"             "enjoy"              
 [293] "enough"              "enter"               "entitl"              "entri"              
 [297] "envelop"             "etc"                 "euro"                "eve"                
 [301] "even"                "ever"                "everi"               "everybodi"          
 [305] "everyon"             "everyth"             "exact"               "exam"               
 [309] "excel"               "excit"               "excus"               "expect"             
 [313] "experi"              "expir"               "extra"               "eye"                
 [317] "face"                "facebook"            "fact"                "fall"               
 [321] "famili"              "fanci"               "fantasi"             "fantast"            
 [325] "far"                 "fast"                "fat"                 "father"             
 [329] "fault"               "feb"                 "feel"                "felt"               
 [333] "fetch"               "fight"               "figur"               "file"               
 [337] "fill"                "film"                "final"               "find"               
 [341] "fine"                "finger"              "finish"              "first"              
 [345] "fix"                 "flag"                "flat"                "flight"             
 [349] "flower"              "follow"              "fone"                "food"               
 [353] "forev"               "forget"              "forgot"              "forward"            
 [357] "found"               "freak"               "free"                "freemsg"            
 [361] "freephon"            "fren"                "fri"                 "friday"             
 [365] "friend"              "friendship"          "frm"                 "frnd"               
 [369] "frnds"               "full"                "fullonsmscom"        "fun"                
 [373] "funni"               "futur"               "gal"                 "game"               
 [377] "gap"                 "gas"                 "gave"                "gay"                
 [381] "gentl"               "get"                 "gettin"              "gift"               
 [385] "girl"                "girlfrnd"            "give"                "glad"               
 [389] "god"                 "goe"                 "goin"                "gone"               
 [393] "gonna"               "good"                "goodmorn"            "goodnight"          
 [397] "got"                 "goto"                "gotta"               "great"              
 [401] "grin"                "guarante"            "gud"                 "guess"              
 [405] "guy"                 "gym"                 "haf"                 "haha"               
 [409] "hai"                 "hair"                "half"                "hand"               
 [413] "handset"             "hang"                "happen"              "happi"              
 [417] "hard"                "hate"                "hav"                 "havent"             
 [421] "head"                "hear"                "heard"               "heart"              
 [425] "heavi"               "hee"                 "hell"                "hello"              
 [429] "help"                "hey"                 "hgsuiteland"         "hit"                
 [433] "hiya"                "hmm"                 "hmmm"                "hmv"                
 [437] "hol"                 "hold"                "holder"              "holiday"            
 [441] "home"                "hook"                "hop"                 "hope"               
 [445] "horni"               "hospit"              "hot"                 "hotel"              
 [449] "hour"                "hous"                "how"                 "howev"              
 [453] "howz"                "hrs"                 "httpwwwurawinnercom" "hug"                
 [457] "huh"                 "hungri"              "hurri"               "hurt"               
 [461] "ice"                 "idea"                "identifi"            "ignor"              
 [465] "ill"                 "immedi"              "import"              "inc"                
 [469] "includ"              "india"               "info"                "inform"             
 [473] "insid"               "instead"             "interest"            "invit"              
 [477] "ipod"                "irrit"               "ish"                 "island"             
 [481] "issu"                "ive"                 "izzit"               "januari"            
 [485] "jay"                 "job"                 "john"                "join"               
 [489] "joke"                "joy"                 "jst"                 "jus"                
 [493] "just"                "juz"                 "kate"                "keep"               
 [497] "kept"                "kick"                "kid"                 "kill"               
 [501] "kind"                "kinda"               "king"                "kiss"               
 [505] "knew"                "know"                "knw"                 "ladi"               
 [509] "land"                "landlin"             "laptop"              "lar"                
 [513] "last"                "late"                "later"               "latest"             
 [517] "laugh"               "lazi"                "ldn"                 "lead"               
 [521] "learn"               "least"               "leav"                "lect"               
 [525] "left"                "leh"                 "lei"                 "less"               
 [529] "lesson"              "let"                 "letter"              "liao"               
 [533] "librari"             "lie"                 "life"                "lift"               
 [537] "light"               "like"                "line"                "link"               
 [541] "list"                "listen"              "littl"               "live"               
 [545] "lmao"                "load"                "loan"                "local"              
 [549] "locat"               "log"                 "lol"                 "london"             
 [553] "long"                "longer"              "look"                "lookin"             
 [557] "lor"                 "lose"                "lost"                "lot"                
 [561] "lovabl"              "love"                "lover"               "loyalti"            
 [565] "ltd"                 "luck"                "lucki"               "lunch"              
 [569] "luv"                 "mad"                 "made"                "mah"                
 [573] "mail"                "make"                "malaria"             "man"                
 [577] "mani"                "march"               "mark"                "marri"              
 [581] "match"               "mate"                "matter"              "maxim"              
 [585] "maxmin"              "may"                 "mayb"                "meal"               
 [589] "mean"                "meant"               "med"                 "medic"              
 [593] "meet"                "meetin"              "meh"                 "member"             
 [597] "men"                 "merri"               "messag"              "met"                
 [601] "mid"                 "midnight"            "might"               "min"                
 [605] "mind"                "mine"                "minut"               "miracl"             
 [609] "miss"                "mistak"              "moan"                "mob"                
 [613] "mobil"               "mobileupd"           "mode"                "mom"                
 [617] "moment"              "mon"                 "monday"              "money"              
 [621] "month"               "morn"                "mother"              "motorola"           
 [625] "move"                "movi"                "mrng"                "mrt"                
 [629] "mrw"                 "msg"                 "msgs"                "mths"               
 [633] "much"                "mum"                 "murder"              "music"              
 [637] "must"                "muz"                 "nah"                 "nake"               
 [641] "name"                "nation"              "natur"               "naughti"            
 [645] "near"                "need"                "net"                 "network"            
 [649] "neva"                "never"               "new"                 "news"               
 [653] "next"                "nice"                "nigeria"             "night"              
 [657] "nite"                "nobodi"              "noe"                 "nokia"              
 [661] "noon"                "nope"                "normal"              "normpton"           
 [665] "noth"                "notic"               "now"                 "num"                
 [669] "number"              "nyt"                 "obvious"             "offer"              
 [673] "offic"               "offici"              "okay"                "oki"                
 [677] "old"                 "omg"                 "one"                 "onlin"              
 [681] "onto"                "oop"                 "open"                "oper"               
 [685] "opinion"             "opt"                 "optout"              "orang"              
 [689] "orchard"             "order"               "oredi"               "oso"                
 [693] "other"               "otherwis"            "outsid"              "pack"               
 [697] "page"                "paid"                "pain"                "paper"              
 [701] "parent"              "park"                "part"                "parti"              
 [705] "partner"             "pass"                "passion"             "password"           
 [709] "past"                "pay"                 "peopl"               "per"                
 [713] "person"              "pete"                "phone"               "photo"              
 [717] "pic"                 "pick"                "pictur"              "pin"                
 [721] "piss"                "pix"                 "pizza"               "place"              
 [725] "plan"                "play"                "player"              "pleas"              
 [729] "pleasur"             "plenti"              "pls"                 "plus"               
 [733] "plz"                 "pmin"                "pmsg"                "pobox"              
 [737] "point"               "poli"                "polic"               "poor"               
 [741] "pop"                 "possess"             "possibl"             "post"               
 [745] "pound"               "power"               "ppm"                 "pray"               
 [749] "present"             "press"               "pretti"              "previous"           
 [753] "price"               "princess"            "privat"              "prize"              
 [757] "prob"                "probabl"             "problem"             "project"            
 [761] "promis"              "pub"                 "put"                 "qualiti"            
 [765] "question"            "quick"               "quit"                "quiz"               
 [769] "quot"                "rain"                "random"              "rang"               
 [773] "rate"                "rather"              "rcvd"                "reach"              
 [777] "read"                "readi"               "real"                "reali"              
 [781] "realli"              "reason"              "receipt"             "receiv"             
 [785] "recent"              "record"              "refer"               "regard"             
 [789] "regist"              "relat"               "relax"               "remain"             
 [793] "rememb"              "remind"              "remov"               "rent"               
 [797] "rental"              "repli"               "repres"              "request"            
 [801] "respond"             "respons"             "rest"                "result"             
 [805] "return"              "reveal"              "review"              "reward"             
 [809] "right"               "ring"                "rington"             "rite"               
 [813] "road"                "rock"                "role"                "room"               
 [817] "roommat"             "rose"                "round"               "rowwjhl"            
 [821] "rpli"                "rreveal"             "run"                 "rush"               
 [825] "sad"                 "sae"                 "safe"                "said"               
 [829] "sale"                "sat"                 "saturday"            "savamob"            
 [833] "save"                "saw"                 "say"                 "sch"                
 [837] "school"              "scream"              "sea"                 "search"             
 [841] "sec"                 "second"              "secret"              "see"                
 [845] "seem"                "seen"                "select"              "self"               
 [849] "sell"                "semest"              "send"                "sens"               
 [853] "sent"                "serious"             "servic"              "set"                
 [857] "settl"               "sex"                 "sexi"                "shall"              
 [861] "share"               "shd"                 "ship"                "shirt"              
 [865] "shop"                "short"               "show"                "shower"             
 [869] "sick"                "side"                "sigh"                "sight"              
 [873] "sign"                "silent"              "simpl"               "sinc"               
 [877] "singl"               "sipix"               "sir"                 "sis"                
 [881] "sister"              "sit"                 "situat"              "skxh"               
 [885] "skype"               "slave"               "sleep"               "slept"              
 [889] "slow"                "slowli"              "small"               "smile"              
 [893] "smoke"               "sms"                 "smth"                "snow"               
 [897] "sofa"                "sol"                 "somebodi"            "someon"             
 [901] "someth"              "sometim"             "somewher"            "song"               
 [905] "soni"                "sonyericsson"        "soon"                "sorri"              
 [909] "sort"                "sound"               "south"               "space"              
 [913] "speak"               "special"             "specialcal"          "spend"              
 [917] "spent"               "spoke"               "spree"               "stand"              
 [921] "start"               "statement"           "station"             "stay"               
 [925] "std"                 "step"                "still"               "stockport"          
 [929] "stone"               "stop"                "store"               "stori"              
 [933] "street"              "student"             "studi"               "stuff"              
 [937] "stupid"              "style"               "sub"                 "subscrib"           
 [941] "success"             "suck"                "suit"                "summer"             
 [945] "sun"                 "sunday"              "sunshin"             "sup"                
 [949] "support"             "suppos"              "sure"                "surf"               
 [953] "surpris"             "sweet"               "swing"               "system"             
 [957] "take"                "talk"                "tampa"               "tariff"             
 [961] "tcs"                 "tea"                 "teach"               "tear"               
 [965] "teas"                "tel"                 "tell"                "ten"                
 [969] "tenerif"             "term"                "test"                "text"               
 [973] "thank"               "thanx"               "that"                "thing"              
 [977] "think"               "thinkin"             "thk"                 "tho"                
 [981] "though"              "thought"             "throw"               "thru"               
 [985] "tht"                 "thur"                "tick"                "ticket"             
 [989] "til"                 "till"                "time"                "tire"               
 [993] "titl"                "tmr"                 "toclaim"             "today"              
 [997] "togeth"              "told"                "tomo"                "tomorrow"           
 [ reached getOption("max.print") -- omitted 139 entries ]
# save frequently-appearing terms to a character vector
sms_freq_words <- findFreqTerms(sms_dtm_train, 5)
str(sms_freq_words)
 chr [1:1139] "£wk" "€˜m" "€˜s" "abiola" "abl" "abt" "accept" "access" "account" "across" "act" ...
# create DTMs with only the frequent terms
sms_dtm_freq_train <- sms_dtm_train[ , sms_freq_words]
sms_dtm_freq_test <- sms_dtm_test[ , sms_freq_words]
# Convert term counts to "Yes"/"No" presence indicators.
#
# x: numeric vector or matrix of counts.
# Returns a character object (not a factor) with the same shape as x:
# "Yes" where the count is greater than zero, "No" otherwise.
convert_counts <- function(x) {
  # Return the ifelse() result directly: ending the body with an assignment
  # would hand the value back invisibly and create a pointless local copy.
  ifelse(x > 0, "Yes", "No")
}
# apply() convert_counts() to columns of train/test data
sms_train <- apply(sms_dtm_freq_train, MARGIN = 2, convert_counts)
sms_test  <- apply(sms_dtm_freq_test, MARGIN = 2, convert_counts)

Step 3: Training a model on the data

library(e1071)
sms_classifier <- naiveBayes(sms_train, sms_train_labels)

Step 4: Evaluating model performance

sms_test_pred <- predict(sms_classifier, sms_test)
head(sms_test_pred)
[1] ham  ham  ham  ham  spam ham 
Levels: ham spam
library(gmodels)
CrossTable(sms_test_pred, sms_test_labels,
           prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
           dnn = c('predicted', 'actual'))

 
   Cell Contents
|-------------------------|
|                       N |
|           N / Col Total |
|-------------------------|

 
Total Observations in Table:  1390 

 
             | actual 
   predicted |       ham |      spam | Row Total | 
-------------|-----------|-----------|-----------|
         ham |      1201 |        30 |      1231 | 
             |     0.995 |     0.164 |           | 
-------------|-----------|-----------|-----------|
        spam |         6 |       153 |       159 | 
             |     0.005 |     0.836 |           | 
-------------|-----------|-----------|-----------|
Column Total |      1207 |       183 |      1390 | 
             |     0.868 |     0.132 |           | 
-------------|-----------|-----------|-----------|

 

Step 5: Improving model performance

# refit with Laplace smoothing (laplace = 1) and evaluate on the same test set
sms_classifier2 <- naiveBayes(x = sms_train, y = sms_train_labels, laplace = 1)
sms_test_pred2 <- predict(object = sms_classifier2, newdata = sms_test)
CrossTable(
  x = sms_test_pred2, y = sms_test_labels,
  dnn = c("predicted", "actual"),
  prop.r = FALSE, prop.t = FALSE, prop.chisq = FALSE
)

 
   Cell Contents
|-------------------------|
|                       N |
|           N / Col Total |
|-------------------------|

 
Total Observations in Table:  1390 

 
             | actual 
   predicted |       ham |      spam | Row Total | 
-------------|-----------|-----------|-----------|
         ham |      1202 |        28 |      1230 | 
             |     0.996 |     0.153 |           | 
-------------|-----------|-----------|-----------|
        spam |         5 |       155 |       160 | 
             |     0.004 |     0.847 |           | 
-------------|-----------|-----------|-----------|
Column Total |      1207 |       183 |      1390 | 
             |     0.868 |     0.132 |           | 
-------------|-----------|-----------|-----------|

 
LS0tCnRpdGxlOiAnQ2hhcHRlciA0OiBDbGFzc2lmaWNhdGlvbiB1c2luZyBOYWl2ZSBCYXllcycKb3V0cHV0OgogIHBkZl9kb2N1bWVudDogZGVmYXVsdAogIGh0bWxfbm90ZWJvb2s6IGRlZmF1bHQKLS0tCgpUaGlzIGlzIGFuIFtSIE1hcmtkb3duXShodHRwOi8vcm1hcmtkb3duLnJzdHVkaW8uY29tKSBOb3RlYm9vay4gV2hlbiB5b3UgZXhlY3V0ZSBjb2RlIHdpdGhpbiB0aGUgbm90ZWJvb2ssIHRoZSByZXN1bHRzIGFwcGVhciBiZW5lYXRoIHRoZSBjb2RlLiAKClRyeSBleGVjdXRpbmcgdGhpcyBjaHVuayBieSBjbGlja2luZyB0aGUgKlJ1biogYnV0dG9uIHdpdGhpbiB0aGUgY2h1bmsgb3IgYnkgcGxhY2luZyB5b3VyIGN1cnNvciBpbnNpZGUgaXQgYW5kIHByZXNzaW5nICpDdHJsK1NoaWZ0K0VudGVyKi4gCgpBZGQgYSBuZXcgY2h1bmsgYnkgY2xpY2tpbmcgdGhlICpJbnNlcnQgQ2h1bmsqIGJ1dHRvbiBvbiB0aGUgdG9vbGJhciBvciBieSBwcmVzc2luZyAqQ3RybCtBbHQrSSouCgpXaGVuIHlvdSBzYXZlIHRoZSBub3RlYm9vaywgYW4gSFRNTCBmaWxlIGNvbnRhaW5pbmcgdGhlIGNvZGUgYW5kIG91dHB1dCB3aWxsIGJlIHNhdmVkIGFsb25nc2lkZSBpdCAoY2xpY2sgdGhlICpQcmV2aWV3KiBidXR0b24gb3IgcHJlc3MgKkN0cmwrU2hpZnQrSyogdG8gcHJldmlldyB0aGUgSFRNTCBmaWxlKS4KCiMgKipFeGFtcGxlOiBGaWx0ZXJpbmcgc3BhbSBTTVMgbWVzc2FnZXMqKgoKIyMgU3RlcCAxOiBEb3dubG9hZCB0aGUgZGF0YQoKYGBge3J9ClVSTCA8LSAiaHR0cDovL3d3dy5zY2kuY3N1ZWFzdGJheS5lZHUvfmVzdWVzcy9jbGFzc2VzL1N0YXRpc3RpY3NfNjYyMC9QcmVzZW50YXRpb25zL21sNi9zbXNfc3BhbS5jc3YiCmRvd25sb2FkLmZpbGUoVVJMLCBkZXN0ZmlsZSA9ICIuL3Ntc19zcGFtLmNzdiIsIG1ldGhvZD0iY3VybCIpCmBgYAoKCiMjIFN0ZXAgMjogRXhwbG9yaW5nIGFuZCBwcmVwYXJpbmcgdGhlIGRhdGEgLS0tLSAKCmBgYHtyfQojIHJlYWQgdGhlIHNtcyBkYXRhIGludG8gdGhlIHNtcyBkYXRhIGZyYW1lCnNtc19yYXcgPC0gcmVhZC5jc3YoInNtc19zcGFtLmNzdiIsIHN0cmluZ3NBc0ZhY3RvcnMgPSBGQUxTRSkKCiMgZXhhbWluZSB0aGUgc3RydWN0dXJlIG9mIHRoZSBzbXMgZGF0YQpzdHIoc21zX3JhdykKCmBgYApgYGB7cn0KIyBjb252ZXJ0IHNwYW0vaGFtIHRvIGZhY3Rvci4Kc21zX3JhdyR0eXBlIDwtIGZhY3RvcihzbXNfcmF3JHR5cGUpCgojIGV4YW1pbmUgdGhlIHR5cGUgdmFyaWFibGUgbW9yZSBjYXJlZnVsbHkKc3RyKHNtc19yYXckdHlwZSkKdGFibGUoc21zX3JhdyR0eXBlKQoKYGBgCmBgYHtyfQojIGJ1aWxkIGEgY29ycHVzIHVzaW5nIHRoZSB0ZXh0IG1pbmluZyAodG0pIHBhY2thZ2UKbGlicmFyeSh0bSkKc21zX2NvcnB1cyA8LSBWQ29ycHVzKFZlY3RvclNvdXJjZShzbXNfcmF3JHRleHQpKQoKIyBleGFtaW5lIHRoZSBzbXMgY29ycHVzCnByaW50KHNtc19jb3JwdXMpCmluc3BlY3Qoc21zX2Nv
cnB1c1sxOjJdKQpgYGAKYGBge3J9CmFzLmNoYXJhY3RlcihzbXNfY29ycHVzW1sxXV0pCmxhcHBseShzbXNfY29ycHVzWzE6Ml0sIGFzLmNoYXJhY3RlcikKYGBgCmBgYHtyfQojIGNsZWFuIHVwIHRoZSBjb3JwdXMgdXNpbmcgdG1fbWFwKCkKc21zX2NvcnB1c19jbGVhbiA8LSB0bV9tYXAoc21zX2NvcnB1cywgY29udGVudF90cmFuc2Zvcm1lcih0b2xvd2VyKSkKCiMgc2hvdyB0aGUgZGlmZmVyZW5jZSBiZXR3ZWVuIHNtc19jb3JwdXMgYW5kIGNvcnB1c19jbGVhbgphcy5jaGFyYWN0ZXIoc21zX2NvcnB1c1tbMV1dKQphcy5jaGFyYWN0ZXIoc21zX2NvcnB1c19jbGVhbltbMV1dKQpgYGAKYGBge3J9CnNtc19jb3JwdXNfY2xlYW4gPC0gdG1fbWFwKHNtc19jb3JwdXNfY2xlYW4sIHJlbW92ZU51bWJlcnMpICMgcmVtb3ZlIG51bWJlcnMKc21zX2NvcnB1c19jbGVhbiA8LSB0bV9tYXAoc21zX2NvcnB1c19jbGVhbiwgcmVtb3ZlV29yZHMsIHN0b3B3b3JkcygpKSAjIHJlbW92ZSBzdG9wIHdvcmRzCnNtc19jb3JwdXNfY2xlYW4gPC0gdG1fbWFwKHNtc19jb3JwdXNfY2xlYW4sIHJlbW92ZVB1bmN0dWF0aW9uKSAjIHJlbW92ZSBwdW5jdHVhdGlvbgpgYGAKYGBge3J9CiMgdGlwOiBjcmVhdGUgYSBjdXN0b20gZnVuY3Rpb24gdG8gcmVwbGFjZSAocmF0aGVyIHRoYW4gcmVtb3ZlKSBwdW5jdHVhdGlvbgpyZW1vdmVQdW5jdHVhdGlvbigiaGVsbG8uLi53b3JsZCIpCnJlcGxhY2VQdW5jdHVhdGlvbiA8LSBmdW5jdGlvbih4KSB7IGdzdWIoIltbOnB1bmN0Ol1dKyIsICIgIiwgeCkgfQpyZXBsYWNlUHVuY3R1YXRpb24oImhlbGxvLi4ud29ybGQiKQpgYGAKYGBge3J9CiMgaWxsdXN0cmF0aW9uIG9mIHdvcmQgc3RlbW1pbmcKbGlicmFyeShTbm93YmFsbEMpCndvcmRTdGVtKGMoImxlYXJuIiwgImxlYXJuZWQiLCAibGVhcm5pbmciLCAibGVhcm5zIikpCgpzbXNfY29ycHVzX2NsZWFuIDwtIHRtX21hcChzbXNfY29ycHVzX2NsZWFuLCBzdGVtRG9jdW1lbnQpCgpzbXNfY29ycHVzX2NsZWFuIDwtIHRtX21hcChzbXNfY29ycHVzX2NsZWFuLCBzdHJpcFdoaXRlc3BhY2UpICMgZWxpbWluYXRlIHVubmVlZGVkIHdoaXRlc3BhY2UKCiMgZXhhbWluZSB0aGUgZmluYWwgY2xlYW4gY29ycHVzCmxhcHBseShzbXNfY29ycHVzWzE6M10sIGFzLmNoYXJhY3RlcikKbGFwcGx5KHNtc19jb3JwdXNfY2xlYW5bMTozXSwgYXMuY2hhcmFjdGVyKQpgYGAKYGBge3J9CiMgY3JlYXRlIGEgZG9jdW1lbnQtdGVybSBzcGFyc2UgbWF0cml4CnNtc19kdG0gPC0gRG9jdW1lbnRUZXJtTWF0cml4KHNtc19jb3JwdXNfY2xlYW4pCgojIGFsdGVybmF0aXZlIHNvbHV0aW9uOiBjcmVhdGUgYSBkb2N1bWVudC10ZXJtIHNwYXJzZSBtYXRyaXggZGlyZWN0bHkgZnJvbSB0aGUgU01TIGNvcnB1cwpzbXNfZHRtMiA8LSBEb2N1bWVudFRlcm1NYXRyaXgoc21zX2NvcnB1cywgY29udHJvbCA9IGxpc3QoCiAgdG9sb3dlciA9IFRSVUUsCiAgcmVtb3ZlTnVtYmVycyA9IFRSVUUsCiAg
c3RvcHdvcmRzID0gVFJVRSwKICByZW1vdmVQdW5jdHVhdGlvbiA9IFRSVUUsCiAgc3RlbW1pbmcgPSBUUlVFCikpCgojIGFsdGVybmF0aXZlIHNvbHV0aW9uOiB1c2luZyBjdXN0b20gc3RvcCB3b3JkcyBmdW5jdGlvbiBlbnN1cmVzIGlkZW50aWNhbCByZXN1bHQKc21zX2R0bTMgPC0gRG9jdW1lbnRUZXJtTWF0cml4KHNtc19jb3JwdXMsIGNvbnRyb2wgPSBsaXN0KAogIHRvbG93ZXIgPSBUUlVFLAogIHJlbW92ZU51bWJlcnMgPSBUUlVFLAogIHN0b3B3b3JkcyA9IGZ1bmN0aW9uKHgpIHsgcmVtb3ZlV29yZHMoeCwgc3RvcHdvcmRzKCkpIH0sCiAgcmVtb3ZlUHVuY3R1YXRpb24gPSBUUlVFLAogIHN0ZW1taW5nID0gVFJVRQopKQoKIyBjb21wYXJlIHRoZSByZXN1bHQKc21zX2R0bQpzbXNfZHRtMgpzbXNfZHRtMwpgYGAKYGBge3J9CiMgY3JlYXRpbmcgdHJhaW5pbmcgYW5kIHRlc3QgZGF0YXNldHMKc21zX2R0bV90cmFpbiA8LSBzbXNfZHRtWzE6NDE2OSwgXQpzbXNfZHRtX3Rlc3QgIDwtIHNtc19kdG1bNDE3MDo1NTU5LCBdCgojIGFsc28gc2F2ZSB0aGUgbGFiZWxzCnNtc190cmFpbl9sYWJlbHMgPC0gc21zX3Jhd1sxOjQxNjksIF0kdHlwZQpzbXNfdGVzdF9sYWJlbHMgIDwtIHNtc19yYXdbNDE3MDo1NTU5LCBdJHR5cGUKCiMgY2hlY2sgdGhhdCB0aGUgcHJvcG9ydGlvbiBvZiBzcGFtIGlzIHNpbWlsYXIKcHJvcC50YWJsZSh0YWJsZShzbXNfdHJhaW5fbGFiZWxzKSkKcHJvcC50YWJsZSh0YWJsZShzbXNfdGVzdF9sYWJlbHMpKQpgYGAKYGBge3J9CiMgd29yZCBjbG91ZCB2aXN1YWxpemF0aW9uCmxpYnJhcnkod29yZGNsb3VkKQp3b3JkY2xvdWQoc21zX2NvcnB1c19jbGVhbiwgbWluLmZyZXEgPSA1MCwgcmFuZG9tLm9yZGVyID0gRkFMU0UpCmBgYApgYGB7cn0KIyBzdWJzZXQgdGhlIHRyYWluaW5nIGRhdGEgaW50byBzcGFtIGFuZCBoYW0gZ3JvdXBzCnNwYW0gPC0gc3Vic2V0KHNtc19yYXcsIHR5cGUgPT0gInNwYW0iKQpoYW0gIDwtIHN1YnNldChzbXNfcmF3LCB0eXBlID09ICJoYW0iKQoKd29yZGNsb3VkKHNwYW0kdGV4dCwgbWF4LndvcmRzID0gNDAsIHNjYWxlID0gYygzLCAwLjUpKQp3b3JkY2xvdWQoaGFtJHRleHQsIG1heC53b3JkcyA9IDQwLCBzY2FsZSA9IGMoMywgMC41KSkKYGBgCmBgYHtyfQpzbXNfZHRtX2ZyZXFfdHJhaW4gPC0gcmVtb3ZlU3BhcnNlVGVybXMoc21zX2R0bV90cmFpbiwgMC45OTkpCnNtc19kdG1fZnJlcV90cmFpbgoKIyBpbmRpY2F0b3IgZmVhdHVyZXMgZm9yIGZyZXF1ZW50IHdvcmRzCmZpbmRGcmVxVGVybXMoc21zX2R0bV90cmFpbiwgNSkKCiMgc2F2ZSBmcmVxdWVudGx5LWFwcGVhcmluZyB0ZXJtcyB0byBhIGNoYXJhY3RlciB2ZWN0b3IKc21zX2ZyZXFfd29yZHMgPC0gZmluZEZyZXFUZXJtcyhzbXNfZHRtX3RyYWluLCA1KQpzdHIoc21zX2ZyZXFfd29yZHMpCgojIGNyZWF0ZSBEVE1zIHdpdGggb25seSB0aGUgZnJlcXVlbnQgdGVybXMKc21zX2R0bV9mcmVxX3RyYWluIDwtIHNtc19k
dG1fdHJhaW5bICwgc21zX2ZyZXFfd29yZHNdCnNtc19kdG1fZnJlcV90ZXN0IDwtIHNtc19kdG1fdGVzdFsgLCBzbXNfZnJlcV93b3Jkc10KCiMgY29udmVydCBjb3VudHMgdG8gYSBmYWN0b3IKY29udmVydF9jb3VudHMgPC0gZnVuY3Rpb24oeCkgewogIHggPC0gaWZlbHNlKHggPiAwLCAiWWVzIiwgIk5vIikKfQoKIyBhcHBseSgpIGNvbnZlcnRfY291bnRzKCkgdG8gY29sdW1ucyBvZiB0cmFpbi90ZXN0IGRhdGEKc21zX3RyYWluIDwtIGFwcGx5KHNtc19kdG1fZnJlcV90cmFpbiwgTUFSR0lOID0gMiwgY29udmVydF9jb3VudHMpCnNtc190ZXN0ICA8LSBhcHBseShzbXNfZHRtX2ZyZXFfdGVzdCwgTUFSR0lOID0gMiwgY29udmVydF9jb3VudHMpCmBgYAoKIyMgU3RlcCAzOiBUcmFpbmluZyBhIG1vZGVsIG9uIHRoZSBkYXRhIC0tLS0KCmBgYHtyfQpsaWJyYXJ5KGUxMDcxKQpzbXNfY2xhc3NpZmllciA8LSBuYWl2ZUJheWVzKHNtc190cmFpbiwgc21zX3RyYWluX2xhYmVscykKCmBgYAoKIyMgU3RlcCA0OiBFdmFsdWF0aW5nIG1vZGVsIHBlcmZvcm1hbmNlIC0tLS0KCmBgYHtyfQpzbXNfdGVzdF9wcmVkIDwtIHByZWRpY3Qoc21zX2NsYXNzaWZpZXIsIHNtc190ZXN0KQoKaGVhZChzbXNfdGVzdF9wcmVkKQoKbGlicmFyeShnbW9kZWxzKQpDcm9zc1RhYmxlKHNtc190ZXN0X3ByZWQsIHNtc190ZXN0X2xhYmVscywKICAgICAgICAgICBwcm9wLmNoaXNxID0gRkFMU0UsIHByb3AudCA9IEZBTFNFLCBwcm9wLnIgPSBGQUxTRSwKICAgICAgICAgICBkbm4gPSBjKCdwcmVkaWN0ZWQnLCAnYWN0dWFsJykpCmBgYAoKIyMgU3RlcCA1OiBJbXByb3ZpbmcgbW9kZWwgcGVyZm9ybWFuY2UgLS0tLQoKYGBge3J9CnNtc19jbGFzc2lmaWVyMiA8LSBuYWl2ZUJheWVzKHNtc190cmFpbiwgc21zX3RyYWluX2xhYmVscywgbGFwbGFjZSA9IDEpCnNtc190ZXN0X3ByZWQyIDwtIHByZWRpY3Qoc21zX2NsYXNzaWZpZXIyLCBzbXNfdGVzdCkKQ3Jvc3NUYWJsZShzbXNfdGVzdF9wcmVkMiwgc21zX3Rlc3RfbGFiZWxzLAogICAgICAgICAgIHByb3AuY2hpc3EgPSBGQUxTRSwgcHJvcC50ID0gRkFMU0UsIHByb3AuciA9IEZBTFNFLAogICAgICAgICAgIGRubiA9IGMoJ3ByZWRpY3RlZCcsICdhY3R1YWwnKSkKYGBgCgoKCg==