Skip to content

Commit

Permalink
fix API bug where the month_mean mode was incorrectly mixed with the area_mean method
Browse files Browse the repository at this point in the history
  • Loading branch information
cywhale committed Jul 18, 2024
1 parent 0891bab commit ce57289
Show file tree
Hide file tree
Showing 3 changed files with 153 additions and 59 deletions.
2 changes: 2 additions & 0 deletions API/change_log.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,5 @@ ODB API of Marine Heatwaves
#### ver 0.2.2 fix API gridded function inconsistent with original PostGIS database bug

-- fix xr.open_mfdataset parallel=True causing a core-dump problem; append a script that uses the Geoserver API

#### ver 0.2.3 fix API bug where the month_mean mode was incorrectly mixed with the area_mean method
201 changes: 143 additions & 58 deletions API/dev/sim_mhw_api01.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -51,14 +51,14 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Exe time: 1.660059928894043 sec\n"
"Exe time: 12.472715854644775 sec\n"
]
}
],
Expand All @@ -71,16 +71,25 @@
"\n",
"et = time.time()\n",
"print('Exe time: ', et-st, 'sec')\n",
"dz\n",
"dz\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"gridSz = 0.25\n",
"timeLimit = 365\n",
"LON_RANGE_LIMIT = 90\n",
"LAT_RANGE_LIMIT = 90"
"timeLimit = 366 * 10 #Entend timeLimit here\n",
"LON_RANGE_LIMIT = 360 #NO LIMIT here\n",
"LAT_RANGE_LIMIT = 180\n",
"AREA_LIMIT = LON_RANGE_LIMIT * LAT_RANGE_LIMIT"
]
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -95,7 +104,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 5,
"metadata": {},
"outputs": [
{
Expand All @@ -121,36 +130,36 @@
],
"source": [
"#For gridded function using round would be inconsistent with original PostGIS database\n",
"def to_nearest_grid_point(lon: float, lat: float) -> tuple:\n",
"def to_old_grid_point(lon: float, lat: float) -> tuple:\n",
" mlon = 180 if lon > 180 else (-180 if lon < -180 else lon)\n",
" mlat = 90 if lat > 90 else (-90 if lat < -90 else lat)\n",
" mlon = mlon + 360 if mlon < 0 else mlon\n",
" grid_lon = round(mlon * 4) / 4\n",
" grid_lat = round(mlat * 4) / 4\n",
" return (grid_lon, grid_lat)\n",
"\n",
"def to_lowest_grid_point(lon: float, lat: float) -> tuple:\n",
"def to_nearest_grid_point(lon: float, lat: float) -> tuple:\n",
" mlon = 180 if lon > 180 else (-180 if lon < -180 else lon)\n",
" mlat = 90 if lat > 90 else (-90 if lat < -90 else lat)\n",
" mlon = mlon + 360 if mlon < 0 else mlon\n",
" grid_lon = math.floor(mlon * 4) / 4\n",
" grid_lat = math.floor(mlat * 4) / 4\n",
" return (grid_lon, grid_lat)\n",
"\n",
"print(to_old_grid_point(123.97247, 24.70941))\n",
"print(to_old_grid_point(123.79669, 24.54213))\n",
"print(to_nearest_grid_point(123.97247, 24.70941))\n",
"print(to_nearest_grid_point(123.79669, 24.54213))\n",
"print(to_lowest_grid_point(123.97247, 24.70941))\n",
"print(to_lowest_grid_point(123.79669, 24.54213))\n",
"print(\"-------------\")\n",
"print(to_old_grid_point(124.81567, 25.04330))\n",
"print(to_old_grid_point(124.80469, 25.17512))\n",
"print(to_nearest_grid_point(124.81567, 25.04330))\n",
"print(to_nearest_grid_point(124.80469, 25.17512))\n",
"print(to_lowest_grid_point(124.81567, 25.04330))\n",
"print(to_lowest_grid_point(124.80469, 25.17512))\n",
"print(\"-------------\")\n",
"print(to_old_grid_point(-6.04794, -8.85251))\n",
"print(to_old_grid_point(-6.13586, -8.93934))\n",
"print(to_nearest_grid_point(-6.04794, -8.85251))\n",
"print(to_nearest_grid_point(-6.13586, -8.93934))\n",
"print(to_lowest_grid_point(-6.04794, -8.85251))\n",
"print(to_lowest_grid_point(-6.13586, -8.93934))\n"
"print(to_nearest_grid_point(-6.13586, -8.93934))\n"
]
},
{
Expand Down Expand Up @@ -179,7 +188,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -206,13 +215,14 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"#@app.get(\"/api/mhw\")\n",
"#async\n",
"def process_mhw_data(lon0: float, lat0: float, lon1: Optional[float], lat1: Optional[float], start: Optional[date], end: Optional[date], append: Optional[str], mode: Optional[str] = None):\n",
"def process_mhw_data(lon0: float, lat0: float, lon1: Optional[float], lat1: Optional[float], start: Optional[date], end: Optional[date], append: Optional[str], mode: Optional[str] = None, toPolars: Optional[bool] = True):\n",
" global gridSz, timeLimit, LON_RANGE_LIMIT, LAT_RANGE_LIMIT, AREA_LIMIT, dz\n",
" if mode is None:\n",
" mode = 'raw'\n",
"\n",
Expand Down Expand Up @@ -259,8 +269,8 @@
" # Only one point, no date range limitation\n",
" out_file = (out_file + '_' + deg2str(lon0, True, orig_lon0<=-179.875) + '_' + deg2str(lat0, False) +\n",
" '_' + 'Point' + '_' + str(start_date.date()) + '_' + str(end_date.date()))\n",
" data_subset = dz.sel(lon=slice(lon0, lon0+gridSz-0.01), lat=slice(\n",
" lat0, lat0+gridSz-0.01), date=slice(start_date, end_date))\n",
" data_subset = dz.sel(lon=slice(lon0, lon0+0.5*gridSz), lat=slice(\n",
" lat0, lat0+0.5*gridSz), date=slice(start_date, end_date))\n",
"\n",
" else:\n",
" # Bounding box, 1 month or 1 year date range limitation\n",
Expand All @@ -274,7 +284,7 @@
" if (lon_range > LON_RANGE_LIMIT and lat_range > LAT_RANGE_LIMIT) or (area_range > AREA_LIMIT):\n",
" # Large spatial range, limit to one month of data\n",
" end_date = start_date + pd.DateOffset(months=1) - timedelta(days=1)\n",
" elif (end_date - start_date).days > timeLimit:\n",
" elif (end_date - start_date).days >= timeLimit+1:\n",
" # Smaller spatial range, limit to one year of data\n",
" end_date = start_date + timedelta(days=timeLimit)\n",
"\n",
Expand All @@ -290,6 +300,7 @@
" lon0, lon1 = lon1, lon0\n",
" orig_lon0, orig_lon1 = orig_lon1, orig_lon0\n",
"\n",
" print(f\"Split to subset1,2: {lon0}-360, and 0-{lon1}, with {lat0}-{lat1}, {start_date}-{end_date}\")\n",
" subset1 = dz.sel(lon=slice(lon0, 360), lat=slice(lat0, lat1), date=slice(start_date, end_date))\n",
" subset2 = dz.sel(lon=slice(0, lon1), lat=slice(lat0, lat1), date=slice(start_date, end_date))\n",
" data_subset = xr.concat([subset1, subset2], dim='lon')\n",
Expand All @@ -309,8 +320,9 @@
" status_code=400, detail=\"No data available for the given parameters.\")\n",
"\n",
" data_subset.load()\n",
" print(\"Test data_subset: \", data_subset)\n",
"\n",
" if mode in ['area_mean_sst', 'area_mean_sst_anomaly', 'month_mean']:\n",
" if mode in ['area_mean_sst', 'area_mean_sst_anomaly']:\n",
" # Perform the averaging using xarray\n",
" st = time.time()\n",
" area_mean = data_subset.mean(dim=['lon', 'lat'], skipna=True).dropna(dim='date', how='all').compute()\n",
Expand All @@ -322,8 +334,21 @@
" et = time.time()\n",
" print('Exec avg and reset index time: ', et-st, 'sec')\n",
" print(df.loc[df['date'] >= pd.to_datetime('2023-01-01')])\n",
" pl_df = output_df(df, False)\n",
" if toPolars:\n",
" pl_df = output_df(df, False)\n",
" else:\n",
" pl_df = df \n",
"\n",
" elif mode == 'month_mean':\n",
" # Segment the data by month first\n",
" monthly_means = data_subset.groupby('date.month').mean(dim=['lon', 'lat'], skipna=True).dropna(\n",
" dim='date', how='all').compute(timeout='30s')\n",
" df = monthly_means[['date', append]].to_dataframe().reset_index()\n",
" if toPolars:\n",
" pl_df = output_df(df, False)\n",
" else:\n",
" pl_df = df \n",
" \n",
" else:\n",
" st = time.time()\n",
" df = data_subset.to_dataframe().reset_index()\n",
Expand All @@ -336,7 +361,10 @@
" variables].dropna(how='all', subset=variables)\n",
" et = time.time()\n",
" print('Drop na data time: ', et-st, 'sec') \n",
" pl_df = output_df(df, True) \n",
" if toPolars:\n",
" pl_df = output_df(df, False)\n",
" else:\n",
" pl_df = df \n",
"\n",
" # if 'date' in df.columns:\n",
" # df['date'] = df['date'].apply(\n",
Expand Down Expand Up @@ -374,11 +402,11 @@
"outputs": [],
"source": [
"### Global variables ###\n",
"gridSz = 0.25\n",
"timeLimit = 365\n",
"LON_RANGE_LIMIT = 90\n",
"LAT_RANGE_LIMIT = 90\n",
"AREA_LIMIT = LON_RANGE_LIMIT * LAT_RANGE_LIMIT\n",
"# gridSz = 0.25\n",
"# timeLimit = 365\n",
"# LON_RANGE_LIMIT = 90\n",
"# LAT_RANGE_LIMIT = 90\n",
"# AREA_LIMIT = LON_RANGE_LIMIT * LAT_RANGE_LIMIT\n",
"#async\n",
"def read_mhw(lon0: float, lat0: float, lon1: Optional[float] = None, lat1: Optional[float] = None, start: Optional[str] = None, end: Optional[str] = None, append: Optional[str] = None):\n",
" # Load your dataset\n",
Expand Down Expand Up @@ -431,44 +459,101 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 42,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Xarray subsetting time: 0.003977298736572266 sec\n",
"Reset index time: 0.01226043701171875 sec\n",
"Drop na data time: 0.008269786834716797 sec\n",
"Process date data time: 0.034552574157714844 sec\n",
"('sst_anomaly_105d0_5d0_135d0_35d0_BBOX_1982-01-01_1983-01-01', shape: (131_647, 4)\n",
"┌─────────┬────────┬────────────┬─────────────┐\n",
"│ lon ┆ lat ┆ date ┆ sst_anomaly │\n",
"│ --- ┆ --- ┆ --- ┆ --- │\n",
"│ f32 ┆ f32 ┆ str ┆ f64 │\n",
"╞═════════╪════════╪════════════╪═════════════╡\n",
"│ 105.125 ┆ 5.125 ┆ 1982-01-01 ┆ 0.1752 │\n",
"│ 105.375 ┆ 5.125 ┆ 1982-01-01 ┆ 0.0742 │\n",
"│ 105.625 ┆ 5.125 ┆ 1982-01-01 ┆ 0.0554 │\n",
"│ 105.875 ┆ 5.125 ┆ 1982-01-01 ┆ 0.0033 │\n",
"│ 106.125 ┆ 5.125 ┆ 1982-01-01 ┆ 0.0257 │\n",
"│ … ┆ … ┆ … ┆ … │\n",
"│ 131.125 ┆ 34.875 ┆ 1983-01-01 ┆ 0.1344 │\n",
"│ 131.375 ┆ 34.875 ┆ 1983-01-01 ┆ 0.3584 │\n",
"│ 131.625 ┆ 34.875 ┆ 1983-01-01 ┆ 0.5069 │\n",
"│ 131.875 ┆ 34.875 ┆ 1983-01-01 ┆ 0.5846 │\n",
"│ 132.125 ┆ 34.875 ┆ 1983-01-01 ┆ 0.6976 │\n",
"└─────────┴────────┴────────────┴─────────────┘)\n"
"Split to subset1,2: 280.0-360, and 0-0.0, with 0.0-60.0, 2024-01-01 00:00:00-2024-06-01 00:00:00\n",
"Xarray subsetting time: 0.015865087509155273 sec\n",
"Test data_subset: <xarray.Dataset> Size: 13MB\n",
"Dimensions: (date: 6, lat: 240, lon: 320)\n",
"Coordinates:\n",
" * date (date) datetime64[ns] 48B 2024-01-01 2024-02-01 ... 2024-06-01\n",
" * lat (lat) float32 960B 0.125 0.375 0.625 ... 59.38 59.62 59.88\n",
" * lon (lon) float32 1kB 280.1 280.4 280.6 280.9 ... 359.4 359.6 359.9\n",
"Data variables:\n",
" level (date, lat, lon) int64 4MB 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0\n",
" sst (date, lat, lon) float32 2MB nan nan nan ... 11.17 11.31 11.35\n",
" sst_anomaly (date, lat, lon) float64 4MB nan nan nan ... 0.9114 0.9248\n",
" td (date, lat, lon) float64 4MB nan nan nan nan ... nan nan nan\n",
"Reset index time: 0.03078770637512207 sec\n",
"Drop na data time: 0.017796993255615234 sec\n",
" lon lat date level sst\n",
"0 -79.875 0.125 2024-01-01 0 NaN\n",
"1 -79.625 0.125 2024-01-01 0 NaN\n",
"2 -79.375 0.125 2024-01-01 0 NaN\n",
"3 -79.125 0.125 2024-01-01 0 NaN\n",
"4 -78.875 0.125 2024-01-01 0 NaN\n",
"... ... ... ... ... ...\n",
"460795 -1.125 59.875 2024-06-01 0 10.567666\n",
"460796 -0.875 59.875 2024-06-01 0 10.894667\n",
"460797 -0.625 59.875 2024-06-01 0 11.166667\n",
"460798 -0.375 59.875 2024-06-01 0 11.312333\n",
"460799 -0.125 59.875 2024-06-01 0 11.350333\n",
"\n",
"[460800 rows x 5 columns]\n"
]
}
],
"source": [
"#await\n",
"dt = process_mhw_data(lon0=105.0,lon1=135.0,lat0=5.0,lat1=35.0,start='1982-01-01',end='2023-06-01',append='sst_anomaly',mode='area_mean_anomaly')\n",
"#dt = process_mhw_data(lon0=105.0,lon1=135.0,lat0=5.0,lat1=35.0,start='1982-01-01',end='2023-06-01',append='sst_anomaly',mode='area_mean_anomaly')\n",
"_,dt = process_mhw_data(lon0=-179.975,lon1=179.975,lat0=-60,lat1=60,start='2024-01-01',end='2024-06-01',append='sst,level',mode='', toPolars=False)\n",
"#_,dt = process_mhw_data(lon0=-80,lon1=0,lat0=0,lat1=60,start='2024-01-01',end='2024-06-01',append='sst',mode='month_mean', toPolars=True)\n",
"print(dt)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3046188\n"
]
}
],
"source": [
"print(len(dt))"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" sst\n",
"date \n",
"1 19.333866\n",
"2 18.960934\n",
"3 18.883202\n",
"4 19.210484\n",
"5 20.246025\n",
"6 21.754602\n"
]
}
],
"source": [
"Test_ICE = False\n",
"if Test_ICE:\n",
" dt1 = dt[dt['level'] != -1]\n",
"else:\n",
" dt1 = dt\n",
"\n",
"dt1.index = pd.to_datetime(dt1['date'])\n",
"dsst = dt1.groupby(dt1.index.month).agg({'sst': lambda x: x.mean(skipna=True)})\n",
"print(dsst)"
]
},
{
"cell_type": "code",
"execution_count": 14,
Expand Down Expand Up @@ -1146,7 +1231,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
"version": "3.11.4"
},
"orig_nbformat": 4,
"vscode": {
Expand Down
9 changes: 8 additions & 1 deletion API/src/mhw_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ async def process_mhw_data(lon0: float, lat0: float, lon1: Optional[float], lat1

data_subset.load()

if mode in ['area_mean_sst', 'area_mean_sst_anomaly', 'month_mean']:
if mode in ['area_mean_sst', 'area_mean_sst_anomaly']:
# Perform the averaging using xarray
area_mean = data_subset.mean(dim=['lon', 'lat'], skipna=True).dropna(
dim='date', how='all').compute(timeout='30s')
Expand All @@ -184,6 +184,13 @@ async def process_mhw_data(lon0: float, lat0: float, lon1: Optional[float], lat1
# df.dropna(subset=[append], inplace=True)
pl_df = output_df(df, False)

elif mode == 'month_mean':
# Segment the data by month first
monthly_means = data_subset.groupby('date.month').mean(dim=['lon', 'lat'], skipna=True).dropna(
dim='date', how='all').compute(timeout='30s')
df = monthly_means[['date', append]].to_dataframe().reset_index()
pl_df = output_df(df, False)

else:
df = data_subset.to_dataframe().reset_index()
mask = df['lon'] > 180
Expand Down

0 comments on commit ce57289

Please sign in to comment.