diff --git a/notebooks/2020-05-19_CSV_data_wrangling.ipynb b/notebooks/2020-05-19_CSV_data_wrangling.ipynb new file mode 100644 index 0000000..3793839 --- /dev/null +++ b/notebooks/2020-05-19_CSV_data_wrangling.ipynb @@ -0,0 +1,1421 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data wrangling: CSV to NetCDF\n", + "A user came to us as she had issues with some model outputs. The model outputs to CSV. It's a land model, so there are outputs only on the land points. The data is organised so that, for each point, the data for each successive year is in a new row and the data for each month is in that month's column (Jan, Feb, etc.) of that row. See the output below, after reading in the data, to make it clearer.\n", + "\n", + "She wanted to store the data in a (lat, lon, time) NetCDF file. She would like the longitude and latitude to cover both the land and ocean (it's a 0.5° regular grid) and we also need to figure out how to combine the year and month information into a single time coordinate.\n", + "\n", + "Xarray does not read CSV files, so we will need to read in the file with Pandas and then figure out how to convert it to a DataArray and write it to NetCDF." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get the data and dimensions" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import xarray as xr" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + "
LonLatYearJanFebMarAprMayJunJulAugSepOctNovDec
039.75-1.2519010.0400.0390.0690.1440.0560.0220.0170.0050.0310.0270.1320.204
139.75-1.2519020.1580.0140.0520.0860.0990.0460.0130.0110.0240.0860.2210.089
239.75-1.2519030.0120.0100.0240.1400.0560.0380.0150.0150.0380.0200.0330.096
339.75-1.2519040.046-0.0020.0200.0510.1110.0320.004-0.0010.0240.0480.1300.049
439.75-1.2519050.0350.0000.1660.1640.1000.0390.0220.0200.0100.0240.1690.110
................................................
680684592.7540.7520110.0000.0000.0040.0050.0010.0190.0010.0010.0010.0030.0030.000
680684692.7540.7520120.0000.0000.0030.0050.0000.0200.0220.0080.0200.0030.0010.000
680684792.7540.7520130.0000.0000.0080.0010.0080.0170.0160.0210.0010.0020.0020.000
680684892.7540.7520140.0000.0000.000-0.000-0.0000.0040.0270.0040.0010.0030.0030.000
680684992.7540.7520150.0000.0000.0020.0060.0080.0000.0150.0050.0020.0020.0040.000
\n", + "

6806850 rows × 15 columns

\n", + "
" + ], + "text/plain": [ + " Lon Lat Year Jan Feb Mar Apr May Jun Jul \\\n", + "0 39.75 -1.25 1901 0.040 0.039 0.069 0.144 0.056 0.022 0.017 \n", + "1 39.75 -1.25 1902 0.158 0.014 0.052 0.086 0.099 0.046 0.013 \n", + "2 39.75 -1.25 1903 0.012 0.010 0.024 0.140 0.056 0.038 0.015 \n", + "3 39.75 -1.25 1904 0.046 -0.002 0.020 0.051 0.111 0.032 0.004 \n", + "4 39.75 -1.25 1905 0.035 0.000 0.166 0.164 0.100 0.039 0.022 \n", + "... ... ... ... ... ... ... ... ... ... ... \n", + "6806845 92.75 40.75 2011 0.000 0.000 0.004 0.005 0.001 0.019 0.001 \n", + "6806846 92.75 40.75 2012 0.000 0.000 0.003 0.005 0.000 0.020 0.022 \n", + "6806847 92.75 40.75 2013 0.000 0.000 0.008 0.001 0.008 0.017 0.016 \n", + "6806848 92.75 40.75 2014 0.000 0.000 0.000 -0.000 -0.000 0.004 0.027 \n", + "6806849 92.75 40.75 2015 0.000 0.000 0.002 0.006 0.008 0.000 0.015 \n", + "\n", + " Aug Sep Oct Nov Dec \n", + "0 0.005 0.031 0.027 0.132 0.204 \n", + "1 0.011 0.024 0.086 0.221 0.089 \n", + "2 0.015 0.038 0.020 0.033 0.096 \n", + "3 -0.001 0.024 0.048 0.130 0.049 \n", + "4 0.020 0.010 0.024 0.169 0.110 \n", + "... ... ... ... ... ... 
\n", + "6806845 0.001 0.001 0.003 0.003 0.000 \n", + "6806846 0.008 0.020 0.003 0.001 0.000 \n", + "6806847 0.021 0.001 0.002 0.002 0.000 \n", + "6806848 0.004 0.001 0.003 0.003 0.000 \n", + "6806849 0.005 0.002 0.002 0.004 0.000 \n", + "\n", + "[6806850 rows x 15 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fname = \"/g/data/w35/lt0205/research/lpj_guess/runs/CRUNCEP/mgpp.out\"\n", + "df = pd.read_csv(fname, header=0, delim_whitespace=True)\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "months=list(df.columns)\n", + "months=months[3:]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "lons = np.unique(df.Lon)\n", + "lats = np.unique(df.Lat)\n", + "years = np.unique(df.Year)\n", + "nyears = len(years)\n", + "nrows = len(lats)\n", + "ncols = len(lons)\n", + "nmonths = 12\n", + "lons.sort()\n", + "lats.sort()\n", + "years.sort()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Original code.\n", + "The person requesting help sent us the codes she wrote to try and solve the issue. 
We put one of those below to discuss the good and bad of it.\n", + "\n", + "```python\n", + "out = np.zeros((ncols, nrows, nyears*nmonths))\n", + "for i,lon in enumerate(lons):\n", + " print(i, ncols)\n", + " for j,lat in enumerate(lats):\n", + " #print(j)\n", + " vals = df[(df.Lat == lat) & (df.Lon == lon)].values[:,3:]\n", + " if len(vals)> 0:\n", + " print(\"Reshape\")\n", + " vals = vals.reshape(nyears*nmonths)\n", + " out[i,j,:] = vals\n", + "\n", + "t1 = pd.to_datetime('1/1/1901')\n", + "time = t1 + pd.to_timedelta(np.arange(nmonths*nyears), 'M')\n", + "\n", + "ds = xr.Dataset(data_vars={\"mgpp\":([\"y\", \"x\", \"time\"],out)},\n", + " coords={\"lat\": ([\"y\"], lats),\n", + " \"lon\": ([\"x\"], lons),\n", + " \"time\": time})\n", + "ds.to_netcdf('test.nc')\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The good idea in this code is not to loop over the times but only looping over the spatial points. The bad idea is the original dataset only has the land points. So by looping on all (lon, lat) pairs, we are looping over points that do not have any data in the DataFrame.\n", + "\n", + "The original DataFrame has 59190 spatial points (length of the Frame / # of years). The loop goes through 194878 values, i.e about 3 times more than needed. \n", + "\n", + "So let's estimate the time it needs to run. We'll time the loop for 1 iteration and then multiply by the number of iterations." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "21.3 ms ± 2.33 ms per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%%timeit\n", + "out = np.zeros((ncols, nrows, nyears*nmonths))\n", + "for i,lon in enumerate(lons[0:1]):\n", + " #print(i, ncols)\n", + " for j,lat in enumerate(lats[0:1]):\n", + " #print(j)\n", + " vals = df[(df.Lat == lat) & (df.Lon == lon)].values[:,3:]\n", + " if len(vals)> 0:\n", + " vals = vals.reshape(nyears*nmonths)\n", + " out[i,j,:] = vals" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "100.03737333333332" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Time for total loop in minutes:\n", + "(30.8 * nrows*ncols)/1000/60" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "So we need something that runs faster than 100 min." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## First solution: a loop\n", + "Instead of looping through all the spatial points of the output array, we can loop through the rows of the DataFrame and find the correct place in the output array to put this data.\n", + "\n", + "First, let's create the output DataArray so we can easily find the location of points using longitude and latitude. Instead of using only the land points, we'll create the array for the full grid directly and fill with NaN." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# Create the axes\n", + "time = pd.date_range(start=f'01/{years[0]}',\n", + " end =f'01/{years[-1]+1}', freq='M')\n", + "# We'll use a generic way to create a regular grid from [-180,180] and\n", + "# [-90, 90] when knowing the resolution. 
Feel free to reuse as needed.\n", + "dx = 0.5\n", + "Lon = xr.DataArray(np.arange(-180.+dx/2., 180., dx), dims=(\"Lon\"),\n", + " attrs={\"long_name\":\"longitude\", \"unit\":\"degrees_east\"})\n", + "nlon = Lon.size\n", + "dy = 0.5\n", + "Lat = xr.DataArray(np.arange(-90.+dy/2., 90., dy), dims=(\"Lat\"),\n", + " attrs={\"long_name\":\"latitude\", \"unit\":\"degrees_north\"})\n", + "nlat = Lat.size" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "Show/Hide data repr\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Show/Hide attributes\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
xarray.DataArray
  • Lat: 360
  • Lon: 720
  • Time: 1380
" + ], + "text/plain": [ + "\n", + "array([[[nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " ...,\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan]],\n", + "\n", + " [[nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " ...,\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan]],\n", + "\n", + " [[nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " ...,\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan]],\n", + "\n", + " ...,\n", + "\n", + " [[nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " ...,\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan]],\n", + "\n", + " [[nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " ...,\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan]],\n", + "\n", + " [[nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " ...,\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan],\n", + " [nan, nan, nan, ..., nan, nan, nan]]])\n", + "Coordinates:\n", + " * Lat (Lat) float64 -89.75 -89.25 -88.75 -88.25 ... 88.75 89.25 89.75\n", + " * Lon (Lon) float64 -179.8 -179.2 -178.8 -178.2 ... 
178.8 179.2 179.8\n", + " * Time (Time) datetime64[ns] 1901-01-31 1901-02-28 ... 2015-12-31" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "out = xr.DataArray(np.zeros((nlat, nlon, nyears*nmonths)),\n", + " dims=(\"Lat\",\"Lon\",\"Time\"),\n", + " coords=({\"Lat\":Lat, \"Lon\":Lon, \"Time\":time}))\n", + "out[:] = np.nan\n", + "out" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First an example with 1 row to see what's happening.\n", + "In that case, we would loop over each row and find the location of the data in the DataArray. That would mean a loop over 6.8M items!\n", + "\n", + "We use the longitude, latitude and year information contained in each row to find the locations to update in the `out` array using `loc`. " + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "350 ms ± 5.79 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit\n", + "row = next(df.iterrows())[1]\n", + "out.loc[dict( \n", + " Lon=row[\"Lon\"],\n", + " Lat=row[\"Lat\"],\n", + " Time=out.Time[(out.Time.dt.year==row[\"Year\"])])] = row[3:]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "array([0.04 , 0.039, 0.069, 0.144, 0.056, 0.022, 0.017, 0.005, 0.031,\n", + " 0.027, 0.132, 0.204, nan, nan, nan, nan, nan, nan,\n", + " nan, nan, nan, nan, nan, nan])\n", + "Coordinates:\n", + " Lat float64 -1.25\n", + " Lon float64 39.75\n", + " * Time (Time) datetime64[ns] 1901-01-31 1901-02-28 ... 
1902-12-31\n" + ] + } + ], + "source": [ + "# Just a check\n", + "print(out.sel(Lon=39.75, Lat=-1.25, Time=slice(\"1901\", \"1902\")))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We see that, in `out`, only the data for the year 1901 at this point has been updated.\n", + "\n", + "The different notation for Time and the longitude or latitude is because we only look for one longitude and one latitude but we need several time indexes.\n", + "\n", + "But this is way too slow. It's 350 ms times the number of rows (6.8M), i.e. close to four weeks. \n", + "\n", + "To improve that time, we have to remember all the years for each point are together in order. So we could read nyears at a time and get the data for all years and months at once.\n", + "\n", + "There is still a problem: if we select all years for a point at once, we get a 2D DataFrame with the years in rows and the months in columns. To solve this, we can simply stack the DataFrame over the months columns. Since Pandas doesn't scramble rows when stacking, we can stack the original DataFrame and then use the fact all points have data for the same number of years and months. If we had some missing years for some points, we could modify the following code to allow for it but it would be slower." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# We stack the months columns of the whole DataFrame at once since we\n", + "# don't have missing years for any point. Doing it once will save time.\n", + "df_stack = df[months].stack()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.07 ms ± 88.8 µs per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit\n", + "#Example for one spatial point\n", + "rows = df[0:nyears]\n", + "# If we had missing years, we could add the missing years rows and then\n", + "# stack only the rows for the point here.\n", + "#rows_stack = rows[months].stack()\n", + "out.loc[dict( \n", + " Lon=rows[\"Lon\"].min(),\n", + " Lat=rows[\"Lat\"].min())] = df_stack[0:nyears*nmonths]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If each point (a pack of `nyears` rows) takes 2.07 ms, then for the whole DataFrame we need about 2 min (see below), which is very good!" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2.042055" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(df.index)//nyears*2.07/1000/60" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 1min 5s, sys: 552 ms, total: 1min 5s\n", + "Wall time: 1min 5s\n" + ] + } + ], + "source": [ + "%%time\n", + "#df_stack = df[months].stack()\n", + "for nr in range(0,len(df.index),nyears):\n", + " rows = df[nr:nr+nyears]\n", + " thislon = rows[\"Lon\"].min()\n", + " thislat = rows[\"Lat\"].min()\n", + " out.loc[dict( \n", + " Lon=thislon,\n", + " Lat=thislat)] = df_stack[nr*nmonths:(nr+nyears)*nmonths]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And a plot to check." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "out.sel(Time=\"2015-01\").plot()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Another solution: groupby\n", + "In the loop above, we see we handle all the data for each spatial point at once. That's typically what groupby would do if applied to Lon and Lat. Once we group the data per spatial point, we need to apply a function. This function should return the stacked data for that point (i.e. the timeseries) with the timestamp as an index. This way, the overall DataFrame resulting from the groupby will have Lon, Lat and Time as indexes and 1 column of data. This column will be the timeseries at each point stacked one after the other. Let's see what it would look like:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# We reuse the time array we used for defining the out DataArray\n", + "# time = pd.date_range(start=f'01/{years[0]}',\n", + "# end =f'01/{years[-1]+1}', freq='M')\n", + "\n", + "def time_to_date(df):\n", + " df_stack = df[months].stack()\n", + " return(pd.DataFrame(df_stack.values, index=time))" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 2min 8s, sys: 2.63 s, total: 2min 10s\n", + "Wall time: 2min 10s\n" + ] + } + ], + "source": [ + "%%time\n", + "df2 = df.groupby([\"Lon\",\"Lat\"]).apply(time_to_date)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0
LonLatTime
-179.7566.251901-01-310.0
1901-02-280.0
1901-03-310.0
1901-04-300.0
1901-05-310.0
............
179.7571.252015-08-310.0
2015-09-300.0
2015-10-310.0
2015-11-300.0
2015-12-310.0
\n", + "

81682200 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " 0\n", + "Lon Lat Time \n", + "-179.75 66.25 1901-01-31 0.0\n", + " 1901-02-28 0.0\n", + " 1901-03-31 0.0\n", + " 1901-04-30 0.0\n", + " 1901-05-31 0.0\n", + "... ...\n", + " 179.75 71.25 2015-08-31 0.0\n", + " 2015-09-30 0.0\n", + " 2015-10-31 0.0\n", + " 2015-11-30 0.0\n", + " 2015-12-31 0.0\n", + "\n", + "[81682200 rows x 1 columns]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2.index.names = [\"Lon\", \"Lat\", \"Time\"]\n", + "df2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We see the groupby calculation takes about twice as long as the loop (2 min 10 s versus 1 min 5 s), and we only get a DataFrame as output and not a DataArray. So we still need to convert this DataFrame to a DataArray. Unfortunately, the conversion below crashes the Jupyter Kernel. This is because the DataFrame contains only the land points. The DataArray created would be sparse with irregular coordinates. There are ways to solve this issue but we'll see those in another blog on sparse matrices as this one is long enough. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#out = df2.to_xarray()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion\n", + "Although the code with `groupby` is a lot more compact, it might not be as intuitive to write at first. In this case, it shows a well-thought-out loop can actually be faster than a `groupby` as the conversion to a spatially full DataArray can be done at the same time as the concatenation of the months and years.\n", + "\n", + "Conversely, with more experience using Pandas and Xarray, the `groupby` solution might be the first one you think of. It isn't a bad solution but then one needs to know how to fill the missing indexes."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Solution summary\n", + "Below is the solution with the loop containing only the necessary code in one cell to clarify what the final code would look like." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import xarray as xr\n", + "\n", + "fname = \"/g/data/w35/lt0205/research/lpj_guess/runs/CRUNCEP/mgpp.out\"\n", + "df = pd.read_csv(fname, header=0, delim_whitespace=True)\n", + "\n", + "months=list(df.columns)\n", + "months=months[3:]\n", + "years = np.unique(df.Year)\n", + "nyears = len(years)\n", + "nmonths = 12\n", + "years.sort()\n", + "\n", + "# Create the axes\n", + "time = pd.date_range(start=f'01/{years[0]}',\n", + " end =f'01/{years[-1]+1}', freq='M')\n", + "# We'll use a generic way to create a regular grid from [-180,180] and\n", + "# [-90, 90] when knowing the resolution. Feel free to reuse as needed.\n", + "dx = 0.5\n", + "Lon = xr.DataArray(np.arange(-180.+dx/2., 180., dx), dims=(\"Lon\"),\n", + " attrs={\"long_name\":\"longitude\", \"unit\":\"degrees_east\"})\n", + "nlon = Lon.size\n", + "dy = 0.5\n", + "Lat = xr.DataArray(np.arange(-90.+dy/2., 90., dy), dims=(\"Lat\"),\n", + " attrs={\"long_name\":\"latitude\", \"unit\":\"degrees_north\"})\n", + "nlat = Lat.size\n", + "\n", + "# Output array\n", + "out = xr.DataArray(np.zeros((nlat, nlon, nyears*nmonths)),\n", + " dims=(\"Lat\",\"Lon\",\"Time\"),\n", + " coords=({\"Lat\":Lat, \"Lon\":Lon, \"Time\":time}))\n", + "out[:] = np.nan\n", + "\n", + "# We stack the months columns of the whole DataFrame at once since we\n", + "# don't have missing years for any point. 
Doing it once will save time.\n", + "df_stack = df[months].stack()\n", + "\n", + "for nr in range(0,len(df.index),nyears):\n", + " rows = df[nr:nr+nyears]\n", + " thislon = rows[\"Lon\"].min()\n", + " thislat = rows[\"Lat\"].min()\n", + " out.loc[dict( \n", + " Lon=thislon,\n", + " Lat=thislat)] = df_stack[nr*nmonths:(nr+nyears)*nmonths]\n", + " \n", + "out.sel(Time=\"2015-01\").plot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}