Global word embeddings in R

A uniform approach

Categories: nlp, lexical semantics
Published: July 12, 2022

A uniform approach to global word embeddings in R: GloVe, word2vec, and fastText models are trained on a common PubMed corpus with shared parameters, and then compared, alongside a pretrained GloVe model, via cosine similarity.


1 Some text data via PubMed
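
First, a corpus: the PubmedMTK package is used to search PubMed titles/abstracts and MeSH headings for "medical marijuana", and to download the corresponding abstract records.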

library(dplyr)

## search PubMed titles/abstracts (TIAB) & MeSH headings (MH) --
pmids <- PubmedMTK::pmtk_search_pubmed(search_term = 'medical marijuana', 
                                       fields = c('TIAB','MH'),
                                       verbose = FALSE)

## download abstract records -- `key` = a user-supplied NCBI API key --
abstracts0 <- PubmedMTK::pmtk_get_records2(pmids = pmids$pmid, 
                                           cores = 6, 
                                           ncbi_key = key) |> 
  data.table::rbindlist() |> 
  filter(!is.na(abstract)) |>
  mutate(abstract = tolower(abstract))

2 Data structures & parameters

2.1 Tokenization
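
Via the text2df package: abstracts are tokenized, candidate collocations are extracted, and the resulting multi-word expressions (MWEs) are recoded as single tokens.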

## tokenize --
toks <- abstracts0 |> 
  rename(doc_id = pmid, text = abstract) |>
  text2df::tif2token()

## identify collocations & recode MWEs as single tokens --
mwes <- text2df::tok2collocations(toks, remove_stops = TRUE)
toks0 <- toks |> text2df::token2mwe(mwes)

2.2 TIF
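
Some of the models below take raw text rather than token lists as input; so, the MWE-normalized tokens are pasted back together as a TIF-compliant (text interchange format) data frame: one row per document, with doc_id and text columns.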

ntif <- data.frame(doc_id = abstracts0$pmid,
                   text = unlist(lapply(toks0, paste0, collapse = ' ')))

2.3 Model parameters
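
In keeping with a uniform approach, the same dimensionality, context window size, and minimum term frequency are applied to each of the models below.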

dims <- 50
window <- 5
min_count <- 5

3 GloVe embeddings
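
Per the text2vec workflow: build an iterator over the tokens, create and prune a vocabulary, construct a term co-occurrence matrix (TCM), and factorize the TCM via GloVe.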

it <- text2vec::itoken(toks0, progressbar = FALSE)

## vocabulary, pruned by term frequency --
vocab <- text2vec::create_vocabulary(it) |>
  text2vec::prune_vocabulary(term_count_min = min_count)

## term co-occurrence matrix --
vectorizer <- text2vec::vocab_vectorizer(vocab)
tcm <- text2vec::create_tcm(it, vectorizer, skip_grams_window = window)

## factorize TCM --
glove <- text2vec::GlobalVectors$new(rank = dims, x_max = 10)
wv_main <- glove$fit_transform(tcm, 
                               n_iter = 10, 
                               convergence_tol = 0.01, 
                               n_threads = 6)

## sum main & context matrices --
wv_context <- glove$components
glove_embeddings <- wv_main + t(wv_context)
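
Per the text2vec documentation, summing the main and context matrices tends to yield slightly better vectors than using either matrix alone, hence the sum above.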

4 word2vec/doc2vec embeddings
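
The doc2vec package implements paragraph2vec (both PV-DM and PV-DBOW architectures); here, a PV-DM model is fit to the TIF data frame, and word-level vectors are subsequently extracted.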

## paragraph2vec architectures: "PV-DM" | "PV-DBOW"
model.d2v <- doc2vec::paragraph2vec(x = ntif, 
                                    type = "PV-DM", 
                                    dim = dims, 
                                    iter = 20,
                                    min_count = min_count, 
                                    lr = 0.05, 
                                    threads = 5)

d2v_embeddings <- as.matrix(model.d2v, which = "words")
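
As an aside, the fitted model also contains document-level vectors. A sketch of extracting them, per the doc2vec package's as.matrix method:

## d2v_doc_embeddings <- as.matrix(model.d2v, which = "docs")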

5 fastText embeddings
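
The fastrtext package wraps the fastText command-line tool; so, the TIF text column is written to a temporary file, and a skipgram model is trained using the shared parameters.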

## devtools::install_github("pommedeterresautee/fastrtext") 
tmp_file_txt <- tempfile()
tmp_file_model <- tempfile()
writeLines(text = ntif$text, con = tmp_file_txt)

fastrtext::execute(commands = c("skipgram",
                                "-input", tmp_file_txt, 
                                "-output", tmp_file_model, 
                                "-dim", dims,
                                "-ws", window, 
                                "-minCount", min_count,
                                "-verbose", 1))

fast.model <- fastrtext::load_model(tmp_file_model)
fast.dict <- fastrtext::get_dictionary(fast.model)
fast_embeddings <- fastrtext::get_word_vectors(fast.model, fast.dict)
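
Unlike the other models considered here, fastText represents words as bags of character n-grams, so it can also compose vectors for words that never occurred in the training corpus.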

6 Pretrained GloVe embeddings
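
As a point of reference, the 50-dimensional pretrained GloVe vectors trained on Wikipedia + Gigaword (the "6B" release). A sketch of the download step, assuming the archive is still hosted at the Stanford NLP site:

## download.file('https://nlp.stanford.edu/data/glove.6B.zip', 'glove.6B.zip')
## unzip('glove.6B.zip', files = 'glove.6B.50d.txt')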

setwd(locald)  ## `locald` = directory containing the pretrained vectors

glove.6B.50d <- data.table::fread('glove.6B.50d.txt')

## first column (V1) = term; remaining 50 columns = vector --
glove_pretrained <- as.matrix(glove.6B.50d[, 2:51])
rownames(glove_pretrained) <- glove.6B.50d$V1

## restrict to terms included in the locally trained models --
glove_pretrained <- subset(glove_pretrained, 
                           rownames(glove_pretrained) %in% fast.dict)

7 Semantics & cosine similarity

7.1 Collate models

Note that the pretrained GloVe model does not include multi-word expressions.

models <- list('glove' = glove_embeddings,
               'word2vec' = d2v_embeddings,
               'fastText' = fast_embeddings,
               'glove_pretrained' = glove_pretrained)

lapply(models, dim)
$glove
[1] 5690   50

$word2vec
[1] 5692   50

$fastText
[1] 5691   50

$glove_pretrained
[1] 5062   50
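
Vocabularies are mostly, but not perfectly, aligned across models. The query below only requires that the target term be present in each model; that said, a shared vocabulary could be enforced explicitly, eg:

## common <- Reduce(intersect, lapply(models, rownames))
## models <- lapply(models, function(m) m[common, ])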

7.2 Cosine similarity
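
The utility function below takes an embedding matrix and a target term (or vector), computes L2-normalized cosine similarity between the target and every row of the matrix, and returns the top n nearest neighbors, with rank 1 being the target itself.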

quick_cosine <- function(embeddings, target, n = 9) {

  ## `target` may be a term (row name) or a vector/matrix --
  if (is.character(target)) {
    t0 <- embeddings[target, , drop = FALSE]
  } else {
    t0 <- target
  }

  ## L2-normalized cosine similarity against all rows --
  cos_sim <- text2vec::sim2(x = embeddings,
                            y = t0,
                            method = "cosine",
                            norm = "l2")

  ## top n neighbors -- rank 1 is the target itself --
  x1 <- head(sort(cos_sim[, 1], decreasing = TRUE), n + 1)

  data.frame(rank = 1:(n + 1),
             term1 = rownames(t0),
             term2 = names(x1),
             value = round(x1, 3),
             row.names = NULL)
}
lapply(models, quick_cosine, target = 'legalization') |>  ## also try, eg, 'legality'
  data.table::rbindlist(idcol = 'model') |>
  select(-term1, -value) |>
  tidyr::spread(model, term2) |>
  knitr::kable()
| rank | fastText          | glove        | glove_pretrained  | word2vec          |
|-----:|:------------------|:-------------|:------------------|:------------------|
|    1 | legalization      | legalization | legalization      | legalization      |
|    2 | decriminalization | marijuana    | legalizing        | legalisation      |
|    3 | pre-legalization  | recreational | legalize          | legalizing        |
|    4 | liberalization    | medical      | decriminalization | passage           |
|    5 | post-legalization | use          | legalisation      | use               |
|    6 | commercialization | cannabis     | legalized         | decriminalization |
|    7 | medicalization    | its          | proponents        | enactment         |
|    8 | legalisation      | state        | advocates         | implementation    |
|    9 | legalizing        | medicinal    | decriminalisation | legalize          |
|   10 | normalization     | before       | abstinence        | laws              |
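
Per the table above: fastText neighbors are largely morphological, ie, terms sharing subwords with "legalization"; the locally trained GloVe neighbors skew topical; and the word2vec and pretrained GloVe models surface more paradigmatic neighbors, eg, "legalisation", "decriminalization", "legalize".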