diff --git a/_freeze/posts/UntitledRMD/execute-results/html.json b/_freeze/posts/UntitledRMD/execute-results/html.json new file mode 100644 index 0000000..ef6e2de --- /dev/null +++ b/_freeze/posts/UntitledRMD/execute-results/html.json @@ -0,0 +1,16 @@ +{ + "hash": "b02f9191faa1301cdaf7e7aa63a7b929", + "result": { + "markdown": "---\ntitle: \"Exploratory Data Analysis\"\nauthor: \"Ning Duan\"\ndate: \"10/26/2022\"\noutput: pdf_document\n---\n\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(readr)\nall_data_guage_labworks <- read_csv(\"/Users/duanning/Downloads/Regression-Modelling/posts/_data/all_data_guage_labworks.csv\")\n```\n\n::: {.cell-output .cell-output-stderr}\n```\nNew names:\nRows: 159 Columns: 29\n── Column specification\n──────────────────────────────────────────────────────── Delimiter: \",\" chr\n(17): Agency_cd, Name, Tz_cd, Ph-Status, Do-Status, Sc-Status, Turb-Stat... dbl\n(12): ...1, Site_no, Datetime, Ph, Do, Sc, Turb_fnu, Wl_elev, Temp_c, Q_...\nℹ Use `spec()` to retrieve the full column specification for this data. ℹ\nSpecify the column types or set `show_col_types = FALSE` to quiet this message.\n• `` -> `...1`\n```\n:::\n\n```{.r .cell-code}\ncolnames(all_data_guage_labworks)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n [1] \"...1\" \"Agency_cd\" \"Name\" \"Site_no\" \n [5] \"Datetime\" \"Tz_cd\" \"Ph\" \"Ph-Status\" \n [9] \"Do\" \"Do-Status\" \"Sc\" \"Sc-Status\" \n[13] \"Turb_fnu\" \"Turb-Status\" \"Wl_elev\" \"Wl_elev-Status\"\n[17] \"Temp_c\" \"Temp-Status\" \"Q_cfs\" \"Q-Status\" \n[21] \"Stage_ft\" \"Stage - Status\" \"Precip_in\" \"Precip-Status\" \n[25] \"Waterbody\" \"Lcod\" \"Ecoli_mpn\" \"Fecal_cfu\" \n[29] \"Turbidity_ntu\" \n```\n:::\n\n```{.r .cell-code}\nsummary(all_data_guage_labworks)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n ...1 Agency_cd Name Site_no \n Min. : 1.0 Length:159 Length:159 Min. :3451500 \n 1st Qu.: 40.5 Class :character Class :character 1st Qu.:3451500 \n Median : 80.0 Mode :character Mode :character Median :3451500 \n Mean : 80.0 Mean :3451500 \n 3rd Qu.:119.5 3rd Qu.:3451500 \n Max. :159.0 Max. :3451500 \n \n Datetime Tz_cd Ph Ph-Status \n Min. :1.559e+09 Length:159 Min. :6.400 Length:159 \n 1st Qu.:1.593e+09 Class :character 1st Qu.:6.900 Class :character \n Median :1.623e+09 Mode :character Median :7.000 Mode :character \n Mean :1.616e+09 Mean :6.972 \n 3rd Qu.:1.648e+09 3rd Qu.:7.100 \n Max. :1.663e+09 Max. :7.400 \n NA's :4 \n Do Do-Status Sc Sc-Status \n Min. : 6.700 Length:159 Min. :30.00 Length:159 \n 1st Qu.: 7.700 Class :character 1st Qu.:41.00 Class :character \n Median : 8.000 Mode :character Median :45.00 Mode :character \n Mean : 8.353 Mean :44.77 \n 3rd Qu.: 8.600 3rd Qu.:48.00 \n Max. :12.800 Max. :59.00 \n NA's :4 NA's :4 \n Turb_fnu Turb-Status Wl_elev Wl_elev-Status \n Min. : 2.80 Length:159 Min. :1951 Length:159 \n 1st Qu.: 8.55 Class :character 1st Qu.:1952 Class :character \n Median : 16.35 Mode :character Median :1952 Mode :character \n Mean : 28.81 Mean :1952 \n 3rd Qu.: 29.82 3rd Qu.:1953 \n Max. :354.00 Max. :1958 \n NA's :13 NA's :62 \n Temp_c Temp-Status Q_cfs Q-Status \n Min. : 1.90 Length:159 Min. : 975 Length:159 \n 1st Qu.:18.00 Class :character 1st Qu.: 1460 Class :character \n Median :21.00 Mode :character Median : 1850 Mode :character \n Mean :19.27 Mean : 2521 \n 3rd Qu.:22.20 3rd Qu.: 2580 \n Max. :24.60 Max. :16600 \n NA's :2 \n Stage_ft Stage - Status Precip_in Precip-Status \n Min. :1.540 Length:159 Min. :0.000000 Length:159 \n 1st Qu.:1.900 Class :character 1st Qu.:0.000000 Class :character \n Median :2.150 Mode :character Median :0.000000 Mode :character \n Mean :2.473 Mean :0.001258 \n 3rd Qu.:2.600 3rd Qu.:0.000000 \n Max. :8.410 Max. :0.070000 \n \n Waterbody Lcod Ecoli_mpn Fecal_cfu \n Length:159 Length:159 Length:159 Length:159 \n Class :character Class :character Class :character Class :character \n Mode :character Mode :character Mode :character Mode :character \n \n \n \n \n Turbidity_ntu \n Length:159 \n Class :character \n Mode :character \n \n \n \n \n```\n:::\n\n```{.r .cell-code}\nunique(all_data_guage_labworks$Tz_cd)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] \"EDT\" \"EST\"\n```\n:::\n\n```{.r .cell-code}\nunique(all_data_guage_labworks$`Ph-Status`)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] NA \"A\" \"P\"\n```\n:::\n\n```{.r .cell-code}\nunique(all_data_guage_labworks$`Do-Status`)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] NA \"A\" \"P\"\n```\n:::\n\n```{.r .cell-code}\nunique(all_data_guage_labworks$`Sc-Status`)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] NA \"A\" \"P\"\n```\n:::\n\n```{.r .cell-code}\nunique(all_data_guage_labworks$`Turb-Status`)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] NA \"A\" \"P\"\n```\n:::\n\n```{.r .cell-code}\nunique(all_data_guage_labworks$`Wl_elev-Status`)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] NA \"P\"\n```\n:::\n\n```{.r .cell-code}\nunique(all_data_guage_labworks$`Temp-Status`)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] \"A\" NA \"P\"\n```\n:::\n\n```{.r .cell-code}\nunique(all_data_guage_labworks$`Q-Status`)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] \"A\" \"P\"\n```\n:::\n\n```{.r .cell-code}\nunique(all_data_guage_labworks$`Stage - Status`)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] \"A\" \"P\"\n```\n:::\n\n```{.r .cell-code}\nunique(all_data_guage_labworks$`Precip-Status`)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] \"A\" \"P\"\n```\n:::\n\n```{.r .cell-code}\nunique(all_data_guage_labworks$`Waterbody`)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] \"FBR\"\n```\n:::\n\n```{.r .cell-code}\nunique(all_data_guage_labworks$`Lcod`)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] \"FRBECOLI2\"\n```\n:::\n:::\n\n\nResponse: \"Ecoli_mpn\" (lab result).\n\nPredictors: \"Ph\" \"Do\" \"Sc\" \"Turb_fnu\" \"Wl_elev\" \"Temp_c\" \"Q_cfs\" \"Stage_ft\" \"Precip_in\"\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(ggplot2)\n\nggplot(all_data_guage_labworks, aes(x = 1:nrow(all_data_guage_labworks), y = as.numeric(all_data_guage_labworks$Ecoli_mpn))) +\n geom_point()\n```\n\n::: {.cell-output .cell-output-stderr}\n```\nWarning in FUN(X[[i]], ...): NAs introduced by coercion\n\nWarning in FUN(X[[i]], ...): NAs introduced by coercion\n```\n:::\n\n::: {.cell-output .cell-output-stderr}\n```\nWarning: Removed 2 rows containing missing values (geom_point).\n```\n:::\n\n::: {.cell-output-display}\n{width=672}\n:::\n\n```{.r .cell-code}\nggplot(all_data_guage_labworks, aes(x = 1:nrow(all_data_guage_labworks), y = all_data_guage_labworks$Ph)) +\n geom_point()\n```\n\n::: {.cell-output .cell-output-stderr}\n```\nWarning: Removed 4 rows containing missing values (geom_point).\n```\n:::\n\n::: {.cell-output-display}\n{width=672}\n:::\n\n```{.r .cell-code}\nggplot(all_data_guage_labworks, aes(x = 1:nrow(all_data_guage_labworks), y = all_data_guage_labworks$Do)) +\n geom_point()\n```\n\n::: {.cell-output .cell-output-stderr}\n```\nWarning: Removed 4 rows containing missing values (geom_point).\n```\n:::\n\n::: {.cell-output-display}\n{width=672}\n:::\n\n```{.r .cell-code}\nggplot(all_data_guage_labworks, aes(x = 1:nrow(all_data_guage_labworks), y = all_data_guage_labworks$Sc)) +\n geom_point()\n```\n\n::: {.cell-output .cell-output-stderr}\n```\nWarning: Removed 4 rows containing missing values (geom_point).\n```\n:::\n\n::: {.cell-output-display}\n{width=672}\n:::\n\n```{.r .cell-code}\nggplot(all_data_guage_labworks, aes(x = 1:nrow(all_data_guage_labworks), y = all_data_guage_labworks$Turb_fnu)) +\n geom_point()\n```\n\n::: {.cell-output .cell-output-stderr}\n```\nWarning: Removed 13 rows containing missing values (geom_point).\n```\n:::\n\n::: {.cell-output-display}\n{width=672}\n:::\n\n```{.r .cell-code}\nggplot(all_data_guage_labworks, aes(x = 1:nrow(all_data_guage_labworks), y = all_data_guage_labworks$Wl_elev)) +\n geom_point()\n```\n\n::: {.cell-output .cell-output-stderr}\n```\nWarning: Removed 62 rows containing missing values (geom_point).\n```\n:::\n\n::: {.cell-output-display}\n{width=672}\n:::\n\n```{.r .cell-code}\nggplot(all_data_guage_labworks, aes(x = 1:nrow(all_data_guage_labworks), y = all_data_guage_labworks$Temp_c)) +\n geom_point()\n```\n\n::: {.cell-output .cell-output-stderr}\n```\nWarning: Removed 2 rows containing missing values (geom_point).\n```\n:::\n\n::: {.cell-output-display}\n{width=672}\n:::\n\n```{.r .cell-code}\nggplot(all_data_guage_labworks, aes(x = 1:nrow(all_data_guage_labworks), y = all_data_guage_labworks$Q_cfs)) +\n geom_point()\n```\n\n::: {.cell-output-display}\n{width=672}\n:::\n\n```{.r .cell-code}\nggplot(all_data_guage_labworks, aes(x = 1:nrow(all_data_guage_labworks), y = all_data_guage_labworks$Stage_ft)) +\n geom_point()\n```\n\n::: {.cell-output-display}\n{width=672}\n:::\n\n```{.r .cell-code}\nggplot(all_data_guage_labworks, aes(x = 1:nrow(all_data_guage_labworks), y = all_data_guage_labworks$Precip_in)) +\n geom_point()\n```\n\n::: {.cell-output-display}\n{width=672}\n:::\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\npairs(~ as.numeric(Ecoli_mpn) + Ph + Do+ Sc + Turb_fnu + Wl_elev + Temp_c + Q_cfs + Stage_ft + Precip_in, data = all_data_guage_labworks)\n```\n\n::: {.cell-output .cell-output-stderr}\n```\nWarning in eval(predvars, data, env): NAs introduced by coercion\n```\n:::\n\n::: {.cell-output-display}\n{width=672}\n:::\n:::\n", + "supporting": [ + "UntitledRMD_files" + ], + "filters": [ + "rmarkdown/pagebreak.lua" + ], + "includes": {}, + "engineDependencies": {}, + "preserve": {}, + "postProcess": true + } +} \ No newline at end of file diff --git a/_freeze/posts/UntitledRMD/figure-html/unnamed-chunk-1-1.png b/_freeze/posts/UntitledRMD/figure-html/unnamed-chunk-1-1.png new file mode 100644 index 0000000..2c3551d Binary files /dev/null and b/_freeze/posts/UntitledRMD/figure-html/unnamed-chunk-1-1.png differ diff --git a/_freeze/posts/UntitledRMD/figure-html/unnamed-chunk-1-10.png b/_freeze/posts/UntitledRMD/figure-html/unnamed-chunk-1-10.png new file mode 100644 index 0000000..b6a6296 Binary files /dev/null and b/_freeze/posts/UntitledRMD/figure-html/unnamed-chunk-1-10.png differ diff --git a/_freeze/posts/UntitledRMD/figure-html/unnamed-chunk-1-2.png b/_freeze/posts/UntitledRMD/figure-html/unnamed-chunk-1-2.png new file mode 100644 index 0000000..68598fe Binary files /dev/null and b/_freeze/posts/UntitledRMD/figure-html/unnamed-chunk-1-2.png differ diff --git a/_freeze/posts/UntitledRMD/figure-html/unnamed-chunk-1-3.png b/_freeze/posts/UntitledRMD/figure-html/unnamed-chunk-1-3.png new file mode 100644 index 0000000..f1063b1 Binary files /dev/null and b/_freeze/posts/UntitledRMD/figure-html/unnamed-chunk-1-3.png differ diff --git a/_freeze/posts/UntitledRMD/figure-html/unnamed-chunk-1-4.png b/_freeze/posts/UntitledRMD/figure-html/unnamed-chunk-1-4.png new file mode 100644 index 0000000..4915dce Binary files /dev/null and b/_freeze/posts/UntitledRMD/figure-html/unnamed-chunk-1-4.png differ diff --git a/_freeze/posts/UntitledRMD/figure-html/unnamed-chunk-1-5.png b/_freeze/posts/UntitledRMD/figure-html/unnamed-chunk-1-5.png new file mode 100644 index 0000000..f35f247 Binary files /dev/null and b/_freeze/posts/UntitledRMD/figure-html/unnamed-chunk-1-5.png differ diff --git a/_freeze/posts/UntitledRMD/figure-html/unnamed-chunk-1-6.png b/_freeze/posts/UntitledRMD/figure-html/unnamed-chunk-1-6.png new file mode 100644 index 0000000..8c45882 Binary files /dev/null and b/_freeze/posts/UntitledRMD/figure-html/unnamed-chunk-1-6.png differ diff --git a/_freeze/posts/UntitledRMD/figure-html/unnamed-chunk-1-7.png b/_freeze/posts/UntitledRMD/figure-html/unnamed-chunk-1-7.png new file mode 100644 index 0000000..1030251 Binary files /dev/null and b/_freeze/posts/UntitledRMD/figure-html/unnamed-chunk-1-7.png differ diff --git a/_freeze/posts/UntitledRMD/figure-html/unnamed-chunk-1-8.png b/_freeze/posts/UntitledRMD/figure-html/unnamed-chunk-1-8.png new file mode 100644 index 0000000..de89f46 Binary files /dev/null and b/_freeze/posts/UntitledRMD/figure-html/unnamed-chunk-1-8.png differ diff --git a/_freeze/posts/UntitledRMD/figure-html/unnamed-chunk-1-9.png b/_freeze/posts/UntitledRMD/figure-html/unnamed-chunk-1-9.png new file mode 100644 index 0000000..5d381f0 Binary files /dev/null and b/_freeze/posts/UntitledRMD/figure-html/unnamed-chunk-1-9.png differ diff --git a/_freeze/posts/UntitledRMD/figure-html/unnamed-chunk-2-1.png b/_freeze/posts/UntitledRMD/figure-html/unnamed-chunk-2-1.png new file mode 100644 index 0000000..a2f48c7 Binary files /dev/null and b/_freeze/posts/UntitledRMD/figure-html/unnamed-chunk-2-1.png differ diff --git a/posts/UntitledRMD.Rmd b/posts/UntitledRMD.Rmd deleted file mode 100644 index 9024117..0000000 --- a/posts/UntitledRMD.Rmd +++ /dev/null @@ -1 +0,0 @@ -testing..... do not consider this file \ No newline at end of file diff --git a/posts/data_preprocessing_files/execute-results/html.json b/posts/data_preprocessing_files/execute-results/html.json new file mode 100644 index 0000000..cd23191 --- /dev/null +++ b/posts/data_preprocessing_files/execute-results/html.json @@ -0,0 +1,20 @@ +{ + "hash": "9f66d01b542f08ea5a92791a0758dd38", + "result": { + "markdown": "---\ntitle: \"Data Preprocessing\"\nauthor: \"Sathvik\"\ndesription: \"Data Preprocessing\"\ndate: \"10/09/2022\"\nformat:\n html:\n df-print: paged\n toc: true\n code-fold: true\n code-copy: true\n code-tools: true\ncategories:\n - data preprocessing\n---\n\n\n## Libraries\n\n\n\n\n\n## Load datasets\n\n\n::: {.cell}\n\n```{.r .cell-code}\nduans <- read_excel(\"_data/n=149 with Duans.xlsx\")\nlabworks <- read_excel(\"_data/091522 LABWORKS DOWNLOAD.xlsx\") %>% filter(NAME == \"PEARSON\") %>% \n select(-c(UNITS, STATUS))\nusgs_gauge_data <- read_excel(\"_data/1 of 3 - USGS gage data downloaded 092022.xlsx\",\n sheet = \"Pearson\", guess_max = 1048576)\n```\n:::\n\n\n\nHad a problem while loading the data.`guess_max` determines how many cells in each column are used to make a guess of the column type. we can provide a `guess_max` for read_excel to correctly guess the column type. \n\n\n## Data and the datatypes of Columns\n\n\n::: {.cell}\n\n```{.r .cell-code}\ndm_draw(dm(duans, labworks, usgs_gauge_data), view_type = \"all\", column_types = TRUE, focus = TRUE)\n```\n\n::: {.cell-output-display}\n```{=html}\n
\n\n```\n:::\n:::\n\n\n\ncleaning columns with correct data types\n\nlabworks$RESULT is character type in the table, correcting it to numeric type. \"NA\" value introduced while correcting data type in (only one)observation where result is \">24200\". \n\nFiltered usgs_guage_data for the labworks datetime range\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlabworks$RESULT <- as.numeric(labworks$RESULT)\n```\n\n::: {.cell-output .cell-output-stderr}\n```\nWarning: NAs introduced by coercion\n```\n:::\n\n```{.r .cell-code}\nlabworks$WATERBODY <- as.factor(labworks$WATERBODY)\nlabworks$LCOD <- as.factor(labworks$LCOD)\nlabworks$PARAMETER <- as.factor(labworks$PARAMETER)\nlabworks <- labworks %>% rename(\"datetime\" = \"DATETIME\")\n# labworks <- labworks %>% rename(\"Name\" = \"NAME\")\n\nusgs_gauge_data <- usgs_gauge_data %>% \n filter(between (datetime, min(labworks$datetime), max(labworks$datetime)))\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nlabworks <- labworks %>% \n pivot_wider(names_from = PARAMETER, values_from = RESULT)\n```\n\n::: {.cell-output .cell-output-stderr}\n```\nWarning: Values from `RESULT` are not uniquely identified; output will contain list-cols.\n* Use `values_fn = list` to suppress this warning.\n* Use `values_fn = {summary_fun}` to summarise duplicates.\n* Use the following dplyr code to identify duplicates.\n {data} %>%\n dplyr::group_by(NAME, WATERBODY, LCOD, datetime, PARAMETER) %>%\n dplyr::summarise(n = dplyr::n(), .groups = \"drop\") %>%\n dplyr::filter(n > 1L)\n```\n:::\n:::\n\n\n\nConverting the dataframes to data.table and merging them to the nearest datetime.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlabworks <- data.table(labworks)\nusgs_gauge_data <- data.table(usgs_gauge_data)\nduans <- data.table(duans)\n\nsetkey(labworks, datetime)\nsetkey(usgs_gauge_data, datetime)\nsetkey(duans, datetime)\n\nlabworks_USGS_combined <- usgs_gauge_data[labworks , roll = \"nearest\" ] %>% select(-NAME)\n\nall_data <- duans[labworks_USGS_combined, roll = \"nearest\"]\n\nnames(all_data) <- str_to_title(names(all_data))\n```\n:::\n\n\nInformation about [data.table](https://cran.r-project.org/web/packages/data.table/vignettes/datatable-intro.html)