diff --git a/DESCRIPTION b/DESCRIPTION index a2df5ce..a775d85 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -33,6 +33,8 @@ Suggests: RSQLite (>= 1.0.0), scales, testthat (>= 3.0.0), + textrecipes, + tidymodels, tidyverse VignetteBuilder: knitr diff --git a/NEWS.md b/NEWS.md index 8e114bc..668589b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,7 @@ # nihexporter (development version) +* New `abstract_words` table containing tokenized words for project abstracts. + * New `projects_min` table, which contains a minimal subset of projects data from 2006-2024, with both direct and indirect costs (2006 was the first year IC amounts were published). diff --git a/R/data.R b/R/data.R index 3c25af7..008a0d7 100644 --- a/R/data.R +++ b/R/data.R @@ -64,3 +64,10 @@ #' #' @source Computed from \link{projects} table. "project_io" + +#' Tokenized words from abstracts. +#' +#' @format A tibble with five variables: `activity`, `fiscal_year`, `institute`, `word`, `n`. +#' +#' @source \url{https://reporter.nih.gov/exporter/abstracts} +"abstract_words" diff --git a/README.Rmd b/README.Rmd index 1ee2376..74cfac2 100644 --- a/README.Rmd +++ b/README.Rmd @@ -49,7 +49,7 @@ pak::pak("rnabioco/nihexporter") * `project_io`: pre-computed `n.pubs`, `n.patents` and `project.cost` for each `project.num`. -**Note:** [Abstracts](https://reporter.nih.gov/exporter/abstracts) from NIH EXPORTER are not provided as they significantly increase the size of the package. +* `abstract_words`: tokenized words from [grant abstracts](https://reporter.nih.gov/exporter/abstracts). ## Functions diff --git a/README.md b/README.md index 4187a03..fac4a54 100644 --- a/README.md +++ b/README.md @@ -52,9 +52,8 @@ time to download and install. ⚠️ - `project_io`: pre-computed `n.pubs`, `n.patents` and `project.cost` for each `project.num`. -**Note:** [Abstracts](https://reporter.nih.gov/exporter/abstracts) from -NIH EXPORTER are not provided as they significantly increase the size of -the package. 
+- `abstract_words`: tokenized words from [grant
+  abstracts](https://reporter.nih.gov/exporter/abstracts).
 
 ## Functions
 
diff --git a/_pkgdown.yml b/_pkgdown.yml
index 83f2a17..abcc4e5 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -31,3 +31,4 @@ reference:
   - publinks
   - patents
   - clinical_studies
+  - abstract_words
diff --git a/data-raw/abstracts.R b/data-raw/abstracts.R
new file mode 100644
index 0000000..2341ec1
--- /dev/null
+++ b/data-raw/abstracts.R
@@ -0,0 +1,89 @@
+# Parse NIH EXPORTER abstracts and tokenize them into per-group word counts.
+#
+# Produces the `abstract_words` dataset: one row per (activity, fiscal_year,
+# institute, word) with its count `n`, keeping only words seen at least 10
+# times within a group.
+#
+# NOTE(review): `load_tbl()` and `projects` are not defined in this script;
+# presumably they come from data-raw/common.R or the package -- confirm.
+
+library(tidyverse)
+library(tidytext)
+library(here)
+
+source("data-raw/common.R")
+
+path <- here("data-raw/downloads/abstracts")
+
+# column spec restricted to the two fields used below
+col_types <- cols_only(
+  APPLICATION_ID = col_double(),
+  ABSTRACT_TEXT = col_character(),
+)
+
+abstracts_raw_tbl <-
+  load_tbl(path, col_types) |>
+  left_join(projects, by = "application_id") |>
+  select(activity, fiscal_year, institute, abstract_text) |>
+  # extramural only
+  filter(!str_detect(activity, "^Z")) |>
+  # drop rows with an NA in any selected column (e.g. no matching project)
+  na.omit() |>
+  unique()
+
+data(stop_words)
+custom_stop_words <- tibble(
+  word = c(
+    # generic to abstracts
+    "research",
+    "specific",
+    "studies",
+    "aim",
+    # meaningless annotations
+    "description",
+    "unreadable"
+  )
+)
+
+# Tokenize `abstract_text` in `df` into words and count them.
+#
+# Removes all-digit tokens and any word found in `stop_words` or
+# `custom_stop_words`, counts the rest per (activity, fiscal_year,
+# institute, word), and keeps only rows with n >= 10.
+tokenize_words <- function(df) {
+  unnest_tokens(df, input = abstract_text, output = word) |>
+    # remove words that are numbers
+    filter(!str_detect(word, "^[0-9]*$")) |>
+    # explicit key: no per-chunk "Joining with `by = ...`" message
+    anti_join(stop_words, by = "word") |>
+    anti_join(custom_stop_words, by = "word") |>
+    count(activity, fiscal_year, institute, word, sort = TRUE) |>
+    filter(n >= 10)
+}
+
+# one chunk per (fiscal_year, institute); chunks are tokenized in parallel
+df_splits <- group_by(abstracts_raw_tbl, fiscal_year, institute) |>
+  group_split()
+
+# df_splits <- df_splits[1:10]
+
+library(furrr)
+library(progressr)
+# up to 12 workers, but never more than the cores available on this machine
+n_workers <- min(12L, future::availableCores())
+plan(multisession, workers = n_workers)
+with_progress({
+  p <- progressor(steps = length(df_splits))
+
+  abstract_words <- future_map_dfr(
+    df_splits,
+    ~{
+      p()
+      tokenize_words(.x)
+    }
+  )
+})
+# release the background R sessions started by multisession
+plan(sequential)
+
+usethis::use_data(abstract_words, compress = "xz", overwrite = TRUE)
diff --git a/data/abstract_words.rda 
b/data/abstract_words.rda new file mode 100644 index 0000000..02703ab Binary files /dev/null and b/data/abstract_words.rda differ diff --git a/man/abstract_words.Rd b/man/abstract_words.Rd new file mode 100644 index 0000000..9036fbd --- /dev/null +++ b/man/abstract_words.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{abstract_words} +\alias{abstract_words} +\title{Tokenized words from abstracts.} +\format{ +A tibble with five variables: \code{activity}, \code{fiscal_year}, \code{institute}, \code{word}, \code{n}. +} +\source{ +\url{https://reporter.nih.gov/exporter/abstracts} +} +\usage{ +abstract_words +} +\description{ +Tokenized words from abstracts. +} +\keyword{datasets}