Read [A hands-on intuitive approach to Deep Learning Methods for Text Data — Word2Vec, GloVe and FastText](https://towardsdatascience.com/understanding-feature-engineering-part-4-deep-learning-methods-for-text-data-96c44370bbfa)
If you are interested in learning more about applications of Deep Learning to NLP, see the [Intel AI Developer Program NLP course](https://software.intel.com/en-us/ai/courses/natural-language-processing).
# Load text2vec, which provides the tokenizer, vocabulary builder,
# co-occurrence counter and GloVe implementation used below.
library(text2vec)

# Download and extract the text8 corpus if it is not already on disk.
# mode = "wb" is required: without it, download.file corrupts binary
# files (such as this zip archive) on Windows.
text8_file <- "text8"
if (!file.exists(text8_file)) {
  download.file("http://mattmahoney.net/dc/text8.zip", "text8.zip", mode = "wb")
  unzip("text8.zip")
}

# text8 is a single very long line, so one readLines call (n = 1)
# captures the whole corpus as one string.
wiki <- readLines(text8_file, n = 1, warn = FALSE)
# Tokenize on whitespace (text8 is distributed as space-separated
# lower-case words -- TODO confirm corpus preprocessing if swapping input).
tokens <- space_tokenizer(wiki)

# Create an iterator over the tokens for the vocabulary/TCM builders.
it <- itoken(tokens, progressbar = TRUE)

# Build the vocabulary; terms are unigrams (single words).
vocab <- create_vocabulary(it)

# Drop rare terms: keep only words that occur at least 5 times.
vocab <- prune_vocabulary(vocab, term_count_min = 5L)

# Map terms of the filtered vocabulary to matrix indices.
vectorizer <- vocab_vectorizer(vocab)

# Term co-occurrence matrix using a context window of 5 words.
tcm <- create_tcm(it, vectorizer, skip_grams_window = 5L)
Fit the GloVe model — a weighted least-squares factorization of the word co-occurrence matrix (not a neural network) — which reduces the high-dimensional co-occurrence counts to dense 50-dimensional word vectors.
# Fit GloVe: 50-dimensional vectors; co-occurrence counts above
# x_max = 20 receive full weight in the loss function.
glove <- GlobalVectors$new(rank = 50, x_max = 20)
wv_main <- glove$fit_transform(tcm, n_iter = 10, convergence_tol = 0.01,
                               n_threads = 8)
dim(wv_main)

# GloVe learns two embeddings per word: "main" (returned by
# fit_transform) and "context" (stored in $components).
wv_context <- glove$components
dim(wv_context)

# Averaging/summing the main and transposed context vectors typically
# yields better word vectors than either set alone.
word_vectors <- wv_main + t(wv_context)
We can find the closest word vectors for our `paris - france + germany` analogy example:
# Classic word-analogy arithmetic: vec("paris") - vec("france") +
# vec("germany") should land near vec("berlin").
# drop = FALSE keeps each row as a 1 x 50 matrix, as sim2 expects.
berlin <- word_vectors["paris", , drop = FALSE] -
  word_vectors["france", , drop = FALSE] +
  word_vectors["germany", , drop = FALSE]

# Cosine similarity (L2-normalized) of every word vector against the
# analogy vector; show the 5 nearest words ("berlin" should rank high).
cos_sim <- sim2(x = word_vectors, y = berlin, method = "cosine", norm = "l2")
head(sort(cos_sim[, 1], decreasing = TRUE), 5)
LS0tDQp0aXRsZTogInRleHQydmVjIC0gR2xvVmUiDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQpSZWFkIFtBIGhhbmRzLW9uIGludHVpdGl2ZSBhcHByb2FjaCB0byBEZWVwIExlYXJuaW5nIE1ldGhvZHMgZm9yIFRleHQgRGF0YSDigJQgV29yZDJWZWMsIEdsb1ZlIGFuZCBGYXN0VGV4dF0oaHR0cHM6Ly90b3dhcmRzZGF0YXNjaWVuY2UuY29tL3VuZGVyc3RhbmRpbmctZmVhdHVyZS1lbmdpbmVlcmluZy1wYXJ0LTQtZGVlcC1sZWFybmluZy1tZXRob2RzLWZvci10ZXh0LWRhdGEtOTZjNDQzNzBiYmZhKQ0KDQpJZiB5b3UgYXJlIGludGVyZXN0ZWQgaW4gbGVhcm5pbmcgbW9yZSBhYm91dCBhcHBsaWNhdGlvbnMgb2YgRGVlcCBMZWFybmluZyB0byBOTFAgYXBwbGljYXRpb25zLiAgW0ludGVsIEFJIERldmVsb3BlciBQcm9ncmFtIE5MUF0oaHR0cHM6Ly9zb2Z0d2FyZS5pbnRlbC5jb20vZW4tdXMvYWkvY291cnNlcy9uYXR1cmFsLWxhbmd1YWdlLXByb2Nlc3NpbmcpDQoNCmBgYHtyfQ0KbGlicmFyeSh0ZXh0MnZlYykNCg0KdGV4dDhfZmlsZSA9ICJ0ZXh0OCINCg0KaWYgKCFmaWxlLmV4aXN0cyh0ZXh0OF9maWxlKSkgew0KICBkb3dubG9hZC5maWxlKCJodHRwOi8vbWF0dG1haG9uZXkubmV0L2RjL3RleHQ4LnppcCIsICJ0ZXh0OC56aXAiKQ0KICB1bnppcCAoInRleHQ4LnppcCIpDQp9DQoNCndpa2kgPSByZWFkTGluZXModGV4dDhfZmlsZSwgbiA9IDEsIHdhcm4gPSBGQUxTRSkNCmBgYA0KDQpgYGB7cn0NCiMgQ3JlYXRlIGl0ZXJhdG9yIG92ZXIgdG9rZW5zDQp0b2tlbnMgPSBzcGFjZV90b2tlbml6ZXIod2lraSkNCiMgQ3JlYXRlIHZvY2FidWxhcnkuIFRlcm1zIHdpbGwgYmUgdW5pZ3JhbXMgKHNpbXBsZSB3b3JkcykuDQppdCA9IGl0b2tlbih0b2tlbnMsIHByb2dyZXNzYmFyID0gVFJVRSkNCnZvY2FiID0gY3JlYXRlX3ZvY2FidWxhcnkoaXQpDQpgYGANCg0KYGBge3J9DQp2b2NhYiA9IHBydW5lX3ZvY2FidWxhcnkodm9jYWIsIHRlcm1fY291bnRfbWluID0gNUwpDQpgYGANCg0KYGBge3J9DQojIFVzZSBvdXIgZmlsdGVyZWQgdm9jYWJ1bGFyeQ0KdmVjdG9yaXplciA9IHZvY2FiX3ZlY3Rvcml6ZXIodm9jYWIpDQojIHVzZSB3aW5kb3cgb2YgNSBmb3IgY29udGV4dCB3b3Jkcw0KdGNtID0gY3JlYXRlX3RjbShpdCwgdmVjdG9yaXplciwgc2tpcF9ncmFtc193aW5kb3cgPSA1TCkNCmBgYA0KDQpGaXQgdGhlIEdsb1ZlIG5ldXJhbCBuZXR3b3JrIGVuY29kZXIuIERpbWVuc2lvbiByZWR1Y3Rpb25zLg0KDQpgYGB7cn0NCmdsb3ZlID0gR2xvYmFsVmVjdG9ycyRuZXcocmFuayA9IDUwLCB4X21heCA9IDIwKQ0Kd3ZfbWFpbiA9IGdsb3ZlJGZpdF90cmFuc2Zvcm0odGNtLCBuX2l0ZXIgPSAxMCwgY29udmVyZ2VuY2VfdG9sID0gMC4wMSwgbl90aHJlYWRzID0gOCkNCmBgYA0KDQoNCmBgYHtyfQ0KZGltKHd2X21haW4pDQpgYGANCg0KYGBge3J9DQp3dl9jb250ZXh0ID0gZ2xvdmUkY29tcG9uZW50cw0KZGltKHd2X2NvbnRl
eHQpDQpgYGANCg0KYGBge3J9DQp3b3JkX3ZlY3RvcnMgPSB3dl9tYWluICsgdCh3dl9jb250ZXh0KQ0KYGBgDQoNCldlIGNhbiBmaW5kIHRoZSBjbG9zZXN0IHdvcmQgdmVjdG9ycyBmb3Igb3VyIHBhcmlzIC0gZnJhbmNlICsgZ2VybWFueSBleGFtcGxlOg0KDQpgYGB7cn0NCmJlcmxpbiA9IHdvcmRfdmVjdG9yc1sicGFyaXMiLCAsIGRyb3AgPSBGQUxTRV0gLSANCiAgd29yZF92ZWN0b3JzWyJmcmFuY2UiLCAsIGRyb3AgPSBGQUxTRV0gKyANCiAgd29yZF92ZWN0b3JzWyJnZXJtYW55IiwgLCBkcm9wID0gRkFMU0VdDQpjb3Nfc2ltID0gc2ltMih4ID0gd29yZF92ZWN0b3JzLCB5ID0gYmVybGluLCBtZXRob2QgPSAiY29zaW5lIiwgbm9ybSA9ICJsMiIpDQpoZWFkKHNvcnQoY29zX3NpbVssMV0sIGRlY3JlYXNpbmcgPSBUUlVFKSwgNSkNCmBgYA0KDQoNCg0K