diff --git a/notebooks/2020-05-19_CSV_data_wrangling.ipynb b/notebooks/2020-05-19_CSV_data_wrangling.ipynb new file mode 100644 index 0000000..3793839 --- /dev/null +++ b/notebooks/2020-05-19_CSV_data_wrangling.ipynb @@ -0,0 +1,1421 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data wrangling: CSV to NetCDF\n", + "A user came to us because she had issues with some model outputs. The model writes its output to CSV. It is a land model, so there are outputs only at the land points. The data is organised as follows: for each point, each successive year is stored in a new row, and within that row the value for each month is stored in that month's column (Jan, Feb, etc.). The output displayed below, after reading in the data, makes this clearer.\n", + "\n", + "She wanted to store the data in a (lat, lon, time) NetCDF file. She would like the longitude and latitude to cover both the land and ocean (it's a 0.5° regular grid), and we also need to figure out how to combine the year and month information into a single time dimension.\n", + "\n", + "Xarray does not read CSV files, so we will need to read in the file with Pandas and then figure out how to convert it to a DataArray and write it to NetCDF." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get the data and dimensions" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import xarray as xr" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
| \n", + " | Lon | \n", + "Lat | \n", + "Year | \n", + "Jan | \n", + "Feb | \n", + "Mar | \n", + "Apr | \n", + "May | \n", + "Jun | \n", + "Jul | \n", + "Aug | \n", + "Sep | \n", + "Oct | \n", + "Nov | \n", + "Dec | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "39.75 | \n", + "-1.25 | \n", + "1901 | \n", + "0.040 | \n", + "0.039 | \n", + "0.069 | \n", + "0.144 | \n", + "0.056 | \n", + "0.022 | \n", + "0.017 | \n", + "0.005 | \n", + "0.031 | \n", + "0.027 | \n", + "0.132 | \n", + "0.204 | \n", + "
| 1 | \n", + "39.75 | \n", + "-1.25 | \n", + "1902 | \n", + "0.158 | \n", + "0.014 | \n", + "0.052 | \n", + "0.086 | \n", + "0.099 | \n", + "0.046 | \n", + "0.013 | \n", + "0.011 | \n", + "0.024 | \n", + "0.086 | \n", + "0.221 | \n", + "0.089 | \n", + "
| 2 | \n", + "39.75 | \n", + "-1.25 | \n", + "1903 | \n", + "0.012 | \n", + "0.010 | \n", + "0.024 | \n", + "0.140 | \n", + "0.056 | \n", + "0.038 | \n", + "0.015 | \n", + "0.015 | \n", + "0.038 | \n", + "0.020 | \n", + "0.033 | \n", + "0.096 | \n", + "
| 3 | \n", + "39.75 | \n", + "-1.25 | \n", + "1904 | \n", + "0.046 | \n", + "-0.002 | \n", + "0.020 | \n", + "0.051 | \n", + "0.111 | \n", + "0.032 | \n", + "0.004 | \n", + "-0.001 | \n", + "0.024 | \n", + "0.048 | \n", + "0.130 | \n", + "0.049 | \n", + "
| 4 | \n", + "39.75 | \n", + "-1.25 | \n", + "1905 | \n", + "0.035 | \n", + "0.000 | \n", + "0.166 | \n", + "0.164 | \n", + "0.100 | \n", + "0.039 | \n", + "0.022 | \n", + "0.020 | \n", + "0.010 | \n", + "0.024 | \n", + "0.169 | \n", + "0.110 | \n", + "
| ... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
| 6806845 | \n", + "92.75 | \n", + "40.75 | \n", + "2011 | \n", + "0.000 | \n", + "0.000 | \n", + "0.004 | \n", + "0.005 | \n", + "0.001 | \n", + "0.019 | \n", + "0.001 | \n", + "0.001 | \n", + "0.001 | \n", + "0.003 | \n", + "0.003 | \n", + "0.000 | \n", + "
| 6806846 | \n", + "92.75 | \n", + "40.75 | \n", + "2012 | \n", + "0.000 | \n", + "0.000 | \n", + "0.003 | \n", + "0.005 | \n", + "0.000 | \n", + "0.020 | \n", + "0.022 | \n", + "0.008 | \n", + "0.020 | \n", + "0.003 | \n", + "0.001 | \n", + "0.000 | \n", + "
| 6806847 | \n", + "92.75 | \n", + "40.75 | \n", + "2013 | \n", + "0.000 | \n", + "0.000 | \n", + "0.008 | \n", + "0.001 | \n", + "0.008 | \n", + "0.017 | \n", + "0.016 | \n", + "0.021 | \n", + "0.001 | \n", + "0.002 | \n", + "0.002 | \n", + "0.000 | \n", + "
| 6806848 | \n", + "92.75 | \n", + "40.75 | \n", + "2014 | \n", + "0.000 | \n", + "0.000 | \n", + "0.000 | \n", + "-0.000 | \n", + "-0.000 | \n", + "0.004 | \n", + "0.027 | \n", + "0.004 | \n", + "0.001 | \n", + "0.003 | \n", + "0.003 | \n", + "0.000 | \n", + "
| 6806849 | \n", + "92.75 | \n", + "40.75 | \n", + "2015 | \n", + "0.000 | \n", + "0.000 | \n", + "0.002 | \n", + "0.006 | \n", + "0.008 | \n", + "0.000 | \n", + "0.015 | \n", + "0.005 | \n", + "0.002 | \n", + "0.002 | \n", + "0.004 | \n", + "0.000 | \n", + "
6806850 rows × 15 columns
\n", + "array([[[nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " ...,\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan]],\n", + "\n", + " [[nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " ...,\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan]],\n", + "\n", + " [[nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " ...,\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan]],\n", + "\n", + " ...,\n", + "\n", + " [[nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " ...,\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan]],\n", + "\n", + " [[nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " ...,\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan]],\n", + "\n", + " [[nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " ...,\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan]]])
array([-89.75, -89.25, -88.75, ..., 88.75, 89.25, 89.75])
array([-179.75, -179.25, -178.75, ..., 178.75, 179.25, 179.75])
array(['1901-01-31T00:00:00.000000000', '1901-02-28T00:00:00.000000000',\n", + " '1901-03-31T00:00:00.000000000', ..., '2015-10-31T00:00:00.000000000',\n", + " '2015-11-30T00:00:00.000000000', '2015-12-31T00:00:00.000000000'],\n", + " dtype='datetime64[ns]')
| \n", + " | \n", + " | \n", + " | 0 | \n", + "
|---|---|---|---|
| Lon | \n", + "Lat | \n", + "Time | \n", + "\n", + " |
| -179.75 | \n", + "66.25 | \n", + "1901-01-31 | \n", + "0.0 | \n", + "
| 1901-02-28 | \n", + "0.0 | \n", + "||
| 1901-03-31 | \n", + "0.0 | \n", + "||
| 1901-04-30 | \n", + "0.0 | \n", + "||
| 1901-05-31 | \n", + "0.0 | \n", + "||
| ... | \n", + "... | \n", + "... | \n", + "... | \n", + "
| 179.75 | \n", + "71.25 | \n", + "2015-08-31 | \n", + "0.0 | \n", + "
| 2015-09-30 | \n", + "0.0 | \n", + "||
| 2015-10-31 | \n", + "0.0 | \n", + "||
| 2015-11-30 | \n", + "0.0 | \n", + "||
| 2015-12-31 | \n", + "0.0 | \n", + "
81682200 rows × 1 columns
\n", + "