A uniform approach to global word embeddings in R.

1 Some text data via PubMed

```r
library(dplyr)

## PubMed search: PMIDs for records matching 'medical marijuana'
## in titles/abstracts (TIAB) or MeSH headings (MH)
pmids <- PubmedMTK::pmtk_search_pubmed(search_term = 'medical marijuana',
                                       fields = c('TIAB', 'MH'),
                                       verbose = F)

## Retrieve full records; `key` is an NCBI API key defined elsewhere
abstracts0 <- PubmedMTK::pmtk_get_records2(pmids = pmids$pmid,
                                           cores = 6,
                                           ncbi_key = key) |>
  data.table::rbindlist() |>
  filter(!is.na(abstract)) |>
  mutate(abstract = tolower(abstract))
```
2 Data structures & parameters
2.1 Tokenization
```r
## One token list per abstract
toks <- abstracts0 |>
  rename(doc_id = pmid, text = abstract) |>
  text2df::tif2token()

## Identify collocations, then re-tokenize so that multi-word
## expressions (MWEs) are treated as single tokens
mwes <- text2df::tok2collocations(toks, remove_stops = T)
toks0 <- toks |> text2df::token2mwe(mwes)
```
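As a quick sanity check, the first tokens of an arbitrary abstract can be inspected directly; the MWEs identified above should now surface as single tokens (assuming `token2mwe()` joins their parts with underscores):

```r
## Peek at the first tokenized abstract (output not shown)
head(toks0[[1]], 12)
```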
2.2 TIF
Here, the re-tokenized texts are pasted back together as a TIF-compliant (text interchange format) data frame, i.e., one row per document with `doc_id` and `text` columns:

```r
ntif <- data.frame(doc_id = abstracts0$pmid,
                   text = unlist(lapply(toks0, paste0, collapse = ' ')))
```
2.3 Model parameters
A single set of hyperparameters, shared across models for comparability:

```r
dims <- 50      # embedding dimensions
window <- 5     # context window size
min_count <- 5  # minimum term frequency
```
3 GloVe embeddings
```r
it <- text2vec::itoken(toks0, progressbar = FALSE)
vocab <- text2vec::create_vocabulary(it) |>
  text2vec::prune_vocabulary(term_count_min = min_count)

vectorizer <- text2vec::vocab_vectorizer(vocab)
tcm <- text2vec::create_tcm(it, vectorizer, skip_grams_window = window)

glove <- text2vec::GlobalVectors$new(rank = dims, x_max = 10)
wv_main <- glove$fit_transform(tcm,
                               n_iter = 10,
                               convergence_tol = 0.01,
                               n_threads = 6)

wv_context <- glove$components

## Sum main and context vectors, per the GloVe paper
glove_embeddings <- wv_main + t(wv_context)
```
4 word2vec/doc2vec embeddings
```r
## d2v <- list(dm = 'PV-DM', bow = 'PV-DBOW')
model.d2v <- doc2vec::paragraph2vec(x = ntif,
                                    type = "PV-DM",
                                    dim = dims,
                                    iter = 20,
                                    min_count = min_count,
                                    lr = 0.05,
                                    threads = 5)

## Extract the word vectors
d2v_embeddings <- as.matrix(model.d2v, which = "words")
```
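The same model also contains paragraph (document) vectors; per the `doc2vec` documentation, these can be extracted analogously (the object name here is illustrative):

```r
## Document vectors, one row per doc_id in `ntif`
d2v_doc_embeddings <- as.matrix(model.d2v, which = "docs")
```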
5 fastText embeddings
```r
## devtools::install_github("pommedeterresautee/fastrtext")
tmp_file_txt <- tempfile()
tmp_file_model <- tempfile()
writeLines(text = ntif$text, con = tmp_file_txt)

## Train a skip-gram model via the fastText command-line bindings
fastrtext::execute(commands = c("skipgram",
                                "-input", tmp_file_txt,
                                "-output", tmp_file_model,
                                "-dim", dims,
                                "-ws", window,
                                "-minCount", min_count,
                                "-verbose", 1))

fast.model <- fastrtext::load_model(tmp_file_model)
fast.dict <- fastrtext::get_dictionary(fast.model)
fast_embeddings <- fastrtext::get_word_vectors(fast.model, fast.dict)
```
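Because fastText composes word vectors from character n-grams, it can also return a vector for forms absent from the training dictionary. A minimal sketch (the misspelling is illustrative):

```r
## Out-of-vocabulary query: the vector is built from subword n-grams
oov <- fastrtext::get_word_vectors(fast.model, c('legalizaton'))
dim(oov)
```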
6 Pretrained GloVe embeddings
```r
setwd(locald)  # local directory containing the pretrained vectors

glove.6B.50d <- data.table::fread('glove.6B.50d.txt')
glove_pretrained <- as.matrix(glove.6B.50d[, 2:51])
rownames(glove_pretrained) <- glove.6B.50d$V1

## Restrict the pretrained vocabulary to terms attested in the corpus
glove_pretrained <- subset(glove_pretrained,
                           rownames(glove_pretrained) %in% fast.dict)
```
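The 50-dimensional vectors used here come from the 6B-token GloVe release, available from the project page:

```r
## One way to fetch the pretrained vectors (run once)
## download.file('https://nlp.stanford.edu/data/glove.6B.zip', 'glove.6B.zip')
## unzip('glove.6B.zip', files = 'glove.6B.50d.txt')
```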
7 Semantics & cosine similarity
7.1 Collate models
Note that the pretrained GloVe model does not include multi-word expressions; a quick check follows the dimension listing below.
```r
models <- list('glove' = glove_embeddings,
               'word2vec' = d2v_embeddings,
               'fastText' = fast_embeddings,
               'glove_pretrained' = glove_pretrained)

lapply(models, dim)
```
```
$glove
[1] 5690   50

$word2vec
[1] 5692   50

$fastText
[1] 5691   50

$glove_pretrained
[1] 5062   50
```
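To verify the note above: corpus-derived MWEs appear as single joined tokens (here assumed underscore-delimited), and none should survive the subsetting of the pretrained vocabulary:

```r
## Count underscore-joined MWEs per vocabulary (expect 0 for pretrained)
sapply(models, function(m) sum(grepl('_', rownames(m))))
```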
7.2 Cosine similarity
```r
quick_cosine <- function(embeddings, target, n = 9) {

  ## `target` may be a term (row name) or a 1 x dims numeric matrix
  if (is.character(target)) {
    t0 <- embeddings[target, , drop = FALSE]
  } else {
    t0 <- target
  }

  cos_sim <- text2vec::sim2(x = embeddings,
                            y = t0,
                            method = "cosine",
                            norm = "l2")

  ## Top n neighbors (the target itself ranks first)
  x1 <- head(sort(cos_sim[, 1], decreasing = TRUE), n + 1)

  data.frame(rank = 1:(n + 1),
             term1 = rownames(t0),
             term2 = names(x1),
             value = round(x1, 3),
             row.names = NULL)
}
```
```r
lapply(models, quick_cosine, target = 'legalization') |> # 'legality'
  data.table::rbindlist(idcol = 'model') |>
  select(-term1, -value) |>
  tidyr::spread(model, term2) |>
  knitr::kable()
```
| rank | fastText | glove | glove_pretrained | word2vec |
|---|---|---|---|---|
| 1 | legalization | legalization | legalization | legalization |
| 2 | decriminalization | marijuana | legalizing | legalisation |
| 3 | pre-legalization | recreational | legalize | legalizing |
| 4 | liberalization | medical | decriminalization | passage |
| 5 | post-legalization | use | legalisation | use |
| 6 | commercialization | cannabis | legalized | decriminalization |
| 7 | medicalization | its | proponents | enactment |
| 8 | legalisation | state | advocates | implementation |
| 9 | legalizing | medicinal | decriminalisation | legalize |
| 10 | normalization | before | abstinence | laws |
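Since `quick_cosine()` also accepts a numeric matrix as `target`, simple vector-arithmetic queries are possible as well. A sketch against the locally trained GloVe vectors (both terms assumed in-vocabulary):

```r
## Average of two word vectors as the query target
avg <- (glove_embeddings['cannabis', , drop = FALSE] +
          glove_embeddings['legalization', , drop = FALSE]) / 2
rownames(avg) <- 'cannabis + legalization'
quick_cosine(glove_embeddings, target = avg)
```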