rnabioco · jayhesselberth · Feb 9, 2025 · Feb 2, 2025 · Feb 2, 2025 · Feb 2, 2025
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -33,6 +33,8 @@ Suggests:
     RSQLite (>= 1.0.0),
     scales,
     testthat (>= 3.0.0),
+    textrecipes,
+    tidymodels,
     tidyverse
 VignetteBuilder:
     knitr

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,7 @@
 # nihexporter (development version)
 
+* New `abstract_words` table containing counts of tokenized words from project abstracts.
+
 * New `projects_min` table, which contains a minimal subset of projects data from 2006-2024,
   with both direct and indirect costs (2006 was the first year IC amounts were published).
 

diff --git a/R/data.R b/R/data.R
@@ -64,3 +64,10 @@
 #'
 #' @source Computed from \link{projects} table.
 "project_io"
+
+#' Tokenized words from abstracts.
+#'
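+#' @format A tibble with five variables: `activity`, `fiscal_year`,
+#'   `institute`, `word`, and `n` (the number of occurrences of `word` for that
+#'   combination; only words with `n >= 10` are included).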
+#'
+#' @source \url{https://reporter.nih.gov/exporter/abstracts}
+"abstract_words"
diff --git a/README.Rmd b/README.Rmd
@@ -49,7 +49,7 @@ pak::pak("rnabioco/nihexporter")
 
 * `project_io`: pre-computed `n.pubs`, `n.patents` and `project.cost` for each `project.num`.
 
-**Note:** [Abstracts](https://reporter.nih.gov/exporter/abstracts) from NIH EXPORTER are not provided as they significantly increase the size of the package.
+* `abstract_words`: tokenized words from [grant abstracts](https://reporter.nih.gov/exporter/abstracts).
 
 ## Functions
 

diff --git a/README.md b/README.md
@@ -52,9 +52,8 @@ time to download and install. ⚠️
 - `project_io`: pre-computed `n.pubs`, `n.patents` and `project.cost`
   for each `project.num`.
 
-**Note:** [Abstracts](https://reporter.nih.gov/exporter/abstracts) from
-NIH EXPORTER are not provided as they significantly increase the size of
-the package.
+- `abstract_words`: tokenized words from [grant
+  abstracts](https://reporter.nih.gov/exporter/abstracts).
 
 ## Functions
 

diff --git a/_pkgdown.yml b/_pkgdown.yml
@@ -31,3 +31,4 @@ reference:
   - publinks
   - patents
   - clinical_studies
+  - abstract_words
diff --git a/data-raw/abstracts.R b/data-raw/abstracts.R
@@ -0,0 +1,69 @@
+# parse and tokenize abstracts
+
+library(tidyverse)
+library(tidytext)
+library(here)
+
+source("data-raw/common.R")
+
+path <- here("data-raw/downloads/abstracts")
+
+# Read only the two columns needed from the abstracts export files.
+col_types <- cols_only(
+  APPLICATION_ID = col_double(),
+  ABSTRACT_TEXT = col_character(),
+)
+
+# Attach project metadata to each abstract and keep one row per distinct
+# (activity, fiscal_year, institute, abstract_text) combination.
+# NOTE(review): `load_tbl()` and `projects` are not defined in this script;
+# presumably they come from data-raw/common.R or the package -- confirm.
+# The lower-case join key assumes `load_tbl()` lower-cases column names.
+abstracts_raw_tbl <-
+  load_tbl(path, col_types) |>
+  left_join(projects, by = "application_id") |>
+  select(activity, fiscal_year, institute, abstract_text) |>
+  # extramural only: drop activity codes starting with "Z"
+  filter(!str_detect(activity, "^Z")) |>
+  # drop rows with a missing value in any retained column; unlike na.omit()
+  # this leaves no `na.action` attribute behind
+  drop_na() |>
+  # keep the first occurrence of each fully duplicated row
+  distinct()
+
+# Load the general-purpose `stop_words` table (shipped with tidytext) and
+# define additional stop words that carry no signal in grant abstracts.
+data(stop_words)
+custom_stop_words <- tibble(
+  word = c(
+    # generic to abstracts
+    "research", "specific", "studies", "aim",
+    # meaningless annotations
+    "description", "unreadable"
+  )
+)
+
+# Tokenize abstracts into words and count them.
+#
+# Splits `abstract_text` into one word per row, drops tokens made up only of
+# digits and stop words (`stop_words` plus `custom_stop_words`), then counts
+# each word within every `activity` / `fiscal_year` / `institute` combination.
+#
+# @param df Data frame with columns `activity`, `fiscal_year`, `institute`
+#   and `abstract_text`.
+# @param min_count Minimum count a word needs in order to be kept
+#   (default 10).
+# @return A data frame with columns `activity`, `fiscal_year`, `institute`,
+#   `word` and `n`, sorted by decreasing `n`.
+tokenize_words <- function(df, min_count = 10) {
+  unnest_tokens(df, input = abstract_text, output = word) |>
+    # remove words that are numbers
+    filter(!str_detect(word, "^[0-9]*$")) |>
+    # explicit keys: no "Joining with `by = ...`" message for every chunk, and
+    # the join cannot silently change if the tables ever share another column
+    anti_join(stop_words, by = "word") |>
+    anti_join(custom_stop_words, by = "word") |>
+    count(activity, fiscal_year, institute, word, sort = TRUE) |>
+    filter(n >= min_count)
+}
+
+# One chunk per fiscal_year x institute combination; each chunk is tokenized
+# independently so the work can be spread over parallel workers.
+df_splits <- abstracts_raw_tbl |>
+  group_by(fiscal_year, institute) |>
+  group_split()
+
+# df_splits <- df_splits[1:10]
+
+library(furrr)
+library(progressr)
+
+# Use up to 12 workers, but never more than the machine provides: requesting
+# more local workers than there are cores oversubscribes the CPU and may be
+# refused by recent parallelly versions.
+plan(multisession, workers = min(12L, availableCores()))
+
+with_progress({
+  p <- progressor(steps = length(df_splits))
+
+  abstract_words <- future_map_dfr(
+    df_splits,
+    \(chunk) {
+      chunk_words <- tokenize_words(chunk)
+      # signal progress only once the chunk has actually been processed
+      p()
+      chunk_words
+    }
+  )
+})
+
+# shut the background workers down again
+plan(sequential)
+
+usethis::use_data(abstract_words, compress = "xz", overwrite = TRUE)
diff --git a/data/abstract_words.rda b/data/abstract_words.rda
diff --git a/man/abstract_words.Rd b/man/abstract_words.Rd