diff --git a/.github/workflows/document-and-deploy.yml b/.github/workflows/document-and-deploy.yml
index 03e25e5f..402c7b9e 100644
--- a/.github/workflows/document-and-deploy.yml
+++ b/.github/workflows/document-and-deploy.yml
@@ -53,7 +53,10 @@ jobs:
R -e "
remotes::install_github('ESHackathon/CiteSource', force = TRUE);
rsconnect::setAccountInfo(name=${{secrets.SHINY_LUKAS_ACCOUNT}}, token=${{secrets.SHINY_LUKAS_TOKEN}}, secret=${{secrets.SHINY_LUKAS_SECRET}});
- rsconnect::deployApp(appName = 'CiteSource_latest', appDir = './inst/shiny-app/CiteSource', forceUpdate = TRUE)"
+ rsconnect::deployApp(
+ appName = 'CiteSource_latest',
+ appDir = './inst/shiny-app/CiteSource',
+ forceUpdate = TRUE)"
- name: Deploy stable version from main
if: github.ref == 'refs/heads/main'
@@ -63,7 +66,10 @@ jobs:
R -e "
remotes::install_github('ESHackathon/CiteSource', force = TRUE);
rsconnect::setAccountInfo(name=${{secrets.SHINY_LUKAS_ACCOUNT}}, token=${{secrets.SHINY_LUKAS_TOKEN}}, secret=${{secrets.SHINY_LUKAS_SECRET}});
- rsconnect::deployApp(appName = 'CiteSource', appDir = './inst/shiny-app/CiteSource', forceUpdate = TRUE)"
+ rsconnect::deployApp(
+ appName = 'CiteSource',
+ appDir = './inst/shiny-app/CiteSource',
+ forceUpdate = TRUE)"
- name: Create pkgdown
env:
diff --git a/README.md b/README.md
index 931d34f0..227c24d2 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@
[](https://www.gnu.org/licenses/gpl-3.0)
-## About the Pacakge
+## About the Package
CiteSource was developed to provide researchers the ability to examine the utility and efficacy of literature resources and search methodologies. The idea behind CiteSource is simply allowing users to deduplicate citation records, while maintaining customizable metadata about the citation.
@@ -48,7 +48,7 @@ Install CiteSource in R with remotes::install_github("ESHackathon/CiteSource")
**Vignettes**
-Vignettes covering various use cases can be found on the [CiteSource web page](https://www.eshackathon.org/CiteSource/).
+Vignettes covering various use cases can be found on the [CiteSource web page](https://www.eshackathon.org/CiteSource/articles).
## Feedback
diff --git a/inst/shiny-app/CiteSource/app.R b/inst/shiny-app/CiteSource/app.R
index a01605ab..de8ff54e 100644
--- a/inst/shiny-app/CiteSource/app.R
+++ b/inst/shiny-app/CiteSource/app.R
@@ -1,10 +1,12 @@
+options(shiny.maxRequestSize=10000*1024^2, timeout = 40000000) # shiny.maxRequestSize is the per-file upload cap in bytes (10000 * 1024^2 = 10000 MiB); timeout is R's option (in seconds) for some internet operations
+
library(DT)
library(CiteSource)
library(dplyr)
-# Set background color
-shiny::tags$head(shiny::tags$style(
- shiny::HTML('
+shiny::tags$head(
+ # style
+ shiny::tags$style(shiny::HTML('
#sidebar {
background-color: #ffffff;
}
@@ -12,7 +14,7 @@ shiny::tags$head(shiny::tags$style(
body, label, input, button, select {
font-family: "Arial";
}')
-))
+ ))
columns2hide <- c("title", "author", "doi", "volume",
"pages", "number", "year", "abstract", "journal", "isbn")
@@ -96,7 +98,8 @@ ui <- shiny::navbarPage("CiteSource",
shiny::mainPanel(
shiny::h5("Step 2: Double click on a column to edit sources, labels, and strings. Use *Ctrl+Enter* to save edits, one column at a time"),
# Output: Data file ----
- DT::dataTableOutput("tbl_out")
+ DT::dataTableOutput("tbl_out"),
+ shiny::uiOutput("post_upload_guide")
)
)
)
@@ -122,7 +125,7 @@ ui <- shiny::navbarPage("CiteSource",
shiny::tabPanel(
"Manual deduplication",
br(),
- shiny::h5("Step 4: Review potential duplicates manually"),
+ shiny::h5("Step 4 (optional): Manually select further duplicates"),
shiny::p("The following records were identified as potential duplicates. Potential duplicates are combined into a single row with metadata fields for each record represented (ex. Title 1 & Title 2). Click any row to indicate that the records in that row ARE duplicates. Once all duplicates are identified you can click the button 'Remove additional duplicates' and then proceed to the visualizations."),
shiny::textOutput("Manual_pretext"),
shiny::br(),
@@ -145,7 +148,7 @@ ui <- shiny::navbarPage("CiteSource",
) %>% htmltools::tagAppendAttributes(style = "background-color: #23395B"),
shinyWidgets::dropdown(
-
+
tags$h3("Select columns to display"),
shinyWidgets::pickerInput(
@@ -168,6 +171,15 @@ ui <- shiny::navbarPage("CiteSource",
tags$style(HTML(".table.dataTable tbody td.active, .table.dataTable tbody tr.active td {
background-color: #CBF7ED!important; color: black!important}")),
+ ),
+ shiny::tabPanel(
+ "How deduplication works",
+ shiny::fluidRow(
+ shiny::column(
+ 12,
+ shiny::uiOutput("dedup_logic_guide")
+ )
+ )
)
)
),
@@ -383,6 +395,9 @@ ui <- shiny::navbarPage("CiteSource",
# Define server logic to read selected file ----
server <- function(input, output, session) {
+
+ # --- Reactive Values ---
+ # Used to store data that changes during the session
rv <- shiny::reactiveValues()
rv$df <- data.frame()
rv$upload_df <- data.frame()#for original uploads
@@ -390,6 +405,106 @@ server <- function(input, output, session) {
rv$pairs_to_check <- data.frame()#for potential duplicates/manual dedup
rv$pairs_removed <- data.frame()#for removed records
+ # 1. The Container (Decides WHEN to show it)
+ # Renders the "Tagging Overview" heading plus the placeholder for the
+ # guide table, but only once rv$df is a data frame with at least one row.
+ output$post_upload_guide <- shiny::renderUI({
+   current_df <- rv$df
+   has_rows <- is.data.frame(current_df) && nrow(current_df) > 0
+   # req() silently stops rendering while has_rows is FALSE
+   shiny::req(has_rows)
+
+   shiny::tagList(
+     shiny::br(),
+     shiny::hr(),
+     shiny::h5("Tagging Overview"),
+     # Placeholder filled by output$guide_table_content
+     shiny::tableOutput("guide_table_content")
+   )
+ })
+
+ # 2. The Table Content (Generates the table itself)
+ # Static three-row reference table describing the Source / Label / String
+ # tags. No req() guard is used here: the table is only placed on the page by
+ # output$post_upload_guide, which already waits for uploaded data.
+ output$guide_table_content <- shiny::renderTable({
+   data.frame(
+     "Column Name" = c("Source", "Label", "String"),
+     "Description" = c(
+       "'Source' is used to tag citations in files according to where they came from. This can include database names (e.g. Web of Science, Scopus) or a method used to find the citations (e.g. citation searching, numbered search string).",
+
+       "'Label' is used to tag citations in files with information related to their associated screening phase. The label field requires one of three terms: ‘search’, ‘screened’, or ‘final’. All plots/tables require at least one file to be labeled as ‘search’, no other terms in the label field are permitted. NOTE: files that are tagged as 'screened' or 'final' should not have a 'source' tag.",
+
+       "'String' is used to further differentiate sets of records. While the source/label fields alone can handle most use cases, the string field can be used to record other supplementary information a user may want to retain for analysis, or to further differentiate string variations (e.g. String1-narrow, String1-broad, String2-narrow, etc.)"
+     ),
+     # Keep the header "Column Name" as typed instead of Column.Name
+     check.names = FALSE
+   )
+ },
+ striped = TRUE,
+ hover = TRUE,
+ width = "100%",
+ align = "l",
+ # Identity sanitizer: cell text is emitted without escaping
+ sanitize.text.function = function(x) x
+ )
+
+ # --- Google Analytics Integration ---
+ # Flag to ensure GA script is inserted only once per session
+ ga_script_inserted <- reactiveVal(FALSE)
+
+ # Use observeEvent on session$clientData which becomes available early. NOTE(review): the event expression reads the clientData object as a whole rather than one of its fields - confirm this fires when expected
+ observeEvent(session$clientData, {
+ # Only proceed if the script hasn't been inserted yet for this session
+ if (!ga_script_inserted()) {
+ # Get the application's path from the URL (e.g., /CiteSource_latest/)
+ app_path <- session$clientData$url_pathname
+ ga_include_file <- NULL # Variable to hold the GA HTML filename
+
+ # --- Determine GA HTML filename based on the application path ---
+ # Check if the path ends with '_latest' or '_latest/' (case-insensitive)
+ if (grepl("_latest/?$", app_path, ignore.case = TRUE)) {
+ # Development version
+ message("GA: Detected DEV environment based on URL path: ", app_path) # Logging
+ # *** SET the DEV Google Analytics HTML filename ***
+ ga_include_file <- "google_analytics_dev.html" # file is in same directory as app.R
+
+ }
+ # Check if the path corresponds to the production app name (e.g., /CiteSource/ or /CiteSource)
+ # Adjust '/CiteSource/?$' if your production app name is different
+ else if (grepl("/CiteSource/?$", app_path, ignore.case = TRUE)) {
+ # Production version
+ message("GA: Detected PROD environment based on URL path: ", app_path) # Logging
+ # *** SET the PROD Google Analytics HTML filename ***
+ ga_include_file <- "google_analytics_main.html" # file is in same directory as app.R
+
+ } else {
+ # Path didn't match known patterns
+ message("GA: Could not determine environment from URL path: ", app_path) # Logging
+ }
+
+ # --- Insert the GA HTML file content if a filename was determined and file exists ---
+ if (!is.null(ga_include_file) && nzchar(ga_include_file)) {
+ # Check if the determined file actually exists (path is relative, so it is resolved against the working directory)
+ if (file.exists(ga_include_file)) {
+ # Insert the content of the HTML file into the document's head element
+ insertUI(
+ selector = "head", # Target the head tag
+ where = "beforeEnd", # Add the script at the end of the head's content
+ # Use includeHTML to read and insert the file content
+ ui = includeHTML(ga_include_file),
+ immediate = TRUE # Attempt to insert as soon as possible
+ )
+ # Set the flag to TRUE to prevent this code running again for this session
+ ga_script_inserted(TRUE)
+ message("GA: Inserted script from file: ", ga_include_file) # Logging
+ } else {
+ # Log an error if the file is missing
+ message("GA Error: HTML file not found: ", ga_include_file)
+ # Flag is set anyway to prevent repeated checks for the missing file
+ ga_script_inserted(TRUE)
+ }
+ } else {
+ # If no file was determined (e.g., path didn't match), set flag to prevent re-check
+ ga_script_inserted(TRUE)
+ }
+ }
+ }, ignoreNULL = TRUE, once = FALSE) # once = FALSE keeps the observer alive; the ga_script_inserted flag is what prevents a second insertion
+ # --- End Google Analytics Integration ---
+
#### Upload files tab section ------
# upload on click
shiny::observeEvent(input$file, {
@@ -873,6 +988,182 @@ server <- function(input, output, session) {
})
+ ## How Deduplication works tab
+
+ # 1. The Container (UI with Accordions)
+ # Static guide built from three collapsible details/summary sections. Each
+ # section embeds a tableOutput() placeholder (tbl_phase1, tbl_phase2,
+ # tbl_phase3) whose content comes from its own renderTable() call.
+ # This must stay the single definition of output$dedup_logic_guide, and it
+ # must be closed before any other output is assigned: outputs assigned
+ # inside a renderUI() expression are not registered at server start-up.
+ output$dedup_logic_guide <- shiny::renderUI({
+
+   shiny::tagList(
+     shiny::br(),
+     # Wrapper div with padding
+     shiny::div(style = "padding: 0px 15px; max-width: 1050px;",
+
+       shiny::h4("Deduplication Criteria"),
+       shiny::p("ASySD identifies duplicates in two automated phases, followed by a manual review safety net. Click a phase below to view the full logic."),
+
+       shiny::hr(),
+
+       # --- ACCORDION 1: BLOCKING ---
+       shiny::tags$details(
+         style = "border: 1px solid #ddd; border-radius: 5px; padding: 10px; margin-bottom: 10px;",
+         shiny::tags$summary(style = "cursor: pointer; font-weight: bold; font-size: 15px;",
+           "Phase 1: Blocking (The Wide Net)"),
+         shiny::br(),
+         shiny::p(style = "font-style: italic; font-size: 13px;",
+           "Records are grouped into potential duplicate sets if they match EXACTLY on any of these combinations."),
+         shiny::tableOutput("tbl_phase1")
+       ),
+
+       # --- ACCORDION 2: VALIDATION ---
+       shiny::tags$details(
+         style = "border: 1px solid #ddd; border-radius: 5px; padding: 10px; margin-bottom: 10px;",
+         shiny::tags$summary(style = "cursor: pointer; font-weight: bold; font-size: 15px;",
+           "Phase 2: Validation (The Strict Check)"),
+         shiny::br(),
+         shiny::p(style = "font-style: italic; font-size: 13px;",
+           "Candidate pairs are text-scored (0-100%). A pair is confirmed as a duplicate ONLY if it meets one of these threshold sets."),
+         shiny::tableOutput("tbl_phase2")
+       ),
+
+       # --- ACCORDION 3: MANUAL REVIEW ---
+       shiny::tags$details(
+         style = "border: 1px solid #ddd; border-radius: 5px; padding: 10px; margin-bottom: 10px;",
+         shiny::tags$summary(style = "cursor: pointer; font-weight: bold; font-size: 15px;",
+           "Phase 3: Manual Review (The Safety Net)"),
+         shiny::br(),
+         shiny::p(style = "font-style: italic; font-size: 13px;",
+           "Pairs that fall into the 'Grey Area' or have conflicting metadata are flagged for human review."),
+         shiny::tableOutput("tbl_phase3")
+       )
+     ) # End div
+   ) # End tagList
+ }) # End renderUI
+
+
+ # 2. Table Content - Phase 1 (Blocking)
+ # One row per blocking round; the Criteria cell lists the field combinations
+ # for that round. Category and Criteria are paired by position.
+ output$tbl_phase1 <- shiny::renderTable({
+   data.frame(
+     "Category" = c("Round 1 (Broad)", "Round 2 (Bibliographic)", "Round 3 (Numeric)", "Round 4 (Loose)"),
+     "Criteria" = c(
+       "- Title & Pages
- Title & Author
- Title & Abstract
- DOI (Exact)
",
+       "- Author & Year & Pages
- Journal & Volume & Pages
- ISBN & Volume & Pages
- Title & ISBN
",
+       "- Year & Pages & Volume
- Year & Number & Volume
- Year & Pages & Number
",
+       "- Author & Year
- Year & Title
- Title & Volume
- Title & Journal
"
+     ),
+     check.names = FALSE
+   )
+ # Identity sanitizer: cell text is emitted without escaping
+ }, striped = TRUE, hover = TRUE, width = "100%", sanitize.text.function = function(x) x)
+
+
+ # 3. Table Content - Phase 2 (Validation - FULL DETAIL): one row per rule family; Category and Criteria are paired by position, and each Criteria cell lists the alternative similarity-threshold sets for that family. NOTE(review): the thresholds are hard-coded display text - confirm they match the ASySD version in use.
+ output$tbl_phase2 <- shiny::renderTable({
+ data.frame(
+ "Category" = c(
+ "Strict Bibliographic",
+ "Abstract Heavy",
+ "DOI Specific",
+ "Complex Metadata",
+ "Strict Journal + Abstract",
+ "High Confidence Metadata",
+ "High Numeric Confidence",
+ "Title & Journal/ISBN"
+ ),
+ "Criteria" = c(
+ # Strict Bibliographic
+ "
+ - Pages(>80%) + Vol(>80%) + Title(>90%) + Abstract(>90%) + Author(>50%) + ISBN(>99%)
+ - Pages(>80%) + Vol(>80%) + Title(>90%) + Abstract(>90%) + Author(>50%) + Journal(>60%)
+ - Pages(>80%) + No.(>80%) + Title(>90%) + Abstract(>90%) + Author(>50%) + Journal(>60%)
+ - Vol(>80%) + No.(>80%) + Title(>90%) + Abstract(>90%) + Author(>50%) + Journal(>60%)
+
",
+
+ # Abstract Heavy
+ "
+ - Vol(>80%) + No.(>80%) + Title(>90%) + Abstract(>90%) + Author(>80%)
+ - Vol(>80%) + Pages(>80%) + Title(>90%) + Abstract(>90%) + Author(>80%)
+ - Pages(>80%) + No.(>80%) + Title(>90%) + Abstract(>90%) + Author(>80%)
+
",
+
+ # DOI Specific
+ "- DOI(>95%) + Author(>75%) + Title(>90%)
",
+
+ # Complex Metadata
+ "
+ - Title(>80%) + Abstract(>90%) + Vol(>85%) + Journal(>65%) + Author(>90%)
+ - Title(>90%) + Abstract(>80%) + Vol(>85%) + Journal(>65%) + Author(>90%)
+
",
+
+ # Strict Journal + Abstract
+ "
+ - Pages(>80%) + Vol(>80%) + Title(>90%) + Abstract(>80%) + Author(>90%) + Journal(>75%)
+ - Pages(>80%) + No.(>80%) + Title(>90%) + Abstract(>80%) + Author(>90%) + Journal(>75%)
+ - Vol(>80%) + No.(>80%) + Title(>90%) + Abstract(>80%) + Author(>90%) + Journal(>75%)
+
",
+
+ # High Confidence Metadata
+ "
+ - Title(>90%) + Author(>90%) + Abstract(>90%) + Journal(>70%)
+ - Title(>90%) + Author(>90%) + Abstract(>90%) + ISBN(>99%)
+
",
+
+ # High Numeric Confidence
+ "
+ - Pages(>90%) + No.(>90%) + Title(>90%) + Author(>80%) + Journal(>60%)
+ - No.(>90%) + Vol(>90%) + Title(>90%) + Author(>90%) + ISBN(>99%)
+ - Pages(>90%) + Vol(>90%) + Title(>90%) + Author(>80%) + Journal(>60%)
+ - Pages(>90%) + No.(>90%) + Title(>90%) + Author(>80%) + ISBN(>99%)
+
",
+
+ # Title & Journal/ISBN
+ "
+ - Pages(>80%) + Vol(>80%) + Title(>95%) + Author(>80%) + Journal(>90%)
+ - No.(>80%) + Vol(>80%) + Title(>95%) + Author(>80%) + Journal(>90%)
+ - No.(>80%) + Pages(>80%) + Title(>95%) + Author(>80%) + Journal(>90%)
+ - Pages(>80%) + Vol(>80%) + Title(>95%) + Author(>80%) + ISBN(>99%)
+
"
+ ),
+ check.names = FALSE
+ )
+ }, striped = TRUE, hover = TRUE, width = "100%", sanitize.text.function = function(x) x) # identity sanitizer: Criteria text is passed through without escaping
+
+
+ # 4. Table Content - Phase 3 (Manual)
+ # Three-row table of the situations that send a pair to manual review.
+ # The two vectors are paired by position when the data frame is built.
+ output$tbl_phase3 <- shiny::renderTable(
+   {
+     review_category <- c("The 'Grey Area'", "Conflicting DOI", "Year Mismatch")
+     review_criteria <- c(
+       "
+ - Title(>85%) + Author(>75%)
+ - Title(>80%) + Abstract(>80%)
+ - Title(>80%) + ISBN(>99%)
+ - Title(>80%) + Journal(>80%)
+
",
+       "Pairs that match perfectly but have different DOIs.",
+       "Pairs that match perfectly but are published >1 year apart."
+     )
+     data.frame(
+       "Category" = review_category,
+       "Criteria" = review_criteria,
+       check.names = FALSE
+     )
+   },
+   striped = TRUE,
+   hover = TRUE,
+   width = "100%",
+   # Identity sanitizer: cell text is emitted without escaping
+   sanitize.text.function = function(x) x
+ )
+
#### Visualise tab ####
# Reactive expression to filter the data for visualization (used for Heatmap and Upset)
@@ -1497,7 +1788,7 @@ server <- function(input, output, session) {
return(detailed_counts_final)
})
-
+
# Rendering the detailed record table
output$detailedRecordTab <- gt::render_gt({
# Check if base data is loaded
diff --git a/inst/shiny-app/CiteSource/google_analytics_dev.html b/inst/shiny-app/CiteSource/google_analytics_dev.html
new file mode 100644
index 00000000..f31c3d31
--- /dev/null
+++ b/inst/shiny-app/CiteSource/google_analytics_dev.html
@@ -0,0 +1,9 @@
+
+
+
\ No newline at end of file
diff --git a/inst/shiny-app/CiteSource/google_analytics_main.html b/inst/shiny-app/CiteSource/google_analytics_main.html
new file mode 100644
index 00000000..eb4691bd
--- /dev/null
+++ b/inst/shiny-app/CiteSource/google_analytics_main.html
@@ -0,0 +1,9 @@
+
+
+
\ No newline at end of file