Skip to content

Commit a8c2cd0

Browse files
committed
Merge remote-tracking branch 'origin/main' into preparation0.11.0
2 parents 5555a48 + f3e24e8 commit a8c2cd0

24 files changed

+843
-640
lines changed

CHANGELOG.md

+4
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,14 @@ Attention: this version is not compatible with catalog entries with ecCodes >= 2
1515
2. ecCodes temporarily restricted to < 2.34
1616

1717
AQUA core complete list:
18+
- Refactor the fdb-catalog-generator tool to work with data-portfolio repository (#1275)
19+
- Introduce a function to convert NetCDF to Zarr and zarr catalog entry for LRA (#1068)
20+
- Suppress the warning of missing catalogs in the AQUA console `add` command (#1288)
1821
- Lumi installation is completely updated to LUMI/23.09 modules (#1290)
1922
- gsv_intake switches eccodes also for shortname definitions (#1279)
2023
- Increase compatibility between LRA generator and multi-catalog (#1278)
2124
- Allow for intake string replacement within LRA-generated catalogs (#1278)
25+
- Avoid warning for missing intake variable default when calling the `Reader()` (#1287)
2226

2327
AQUA diagnostic complete list:
2428
- Teleconnections: catalog feature bugfix (#1276)

aqua/cli/main.py

+11-9
Original file line numberDiff line numberDiff line change
@@ -62,12 +62,12 @@ def execute(self):
6262

6363
# Set the log level
6464
if args.very_verbose or (args.verbose and args.very_verbose):
65-
loglevel = 'DEBUG'
65+
self.loglevel = 'DEBUG'
6666
elif args.verbose:
67-
loglevel = 'INFO'
67+
self.loglevel = 'INFO'
6868
else:
69-
loglevel = 'WARNING'
70-
self.logger = log_configure(loglevel, 'AQUA')
69+
self.loglevel = 'WARNING'
70+
self.logger = log_configure(self.loglevel, 'AQUA')
7171

7272
command = args.command
7373
method = self.command_map.get(command, parser_dict['main'].print_help)
@@ -229,7 +229,7 @@ def _install_default_diagnostics(self, diagnostic_type):
229229
sys.exit(1)
230230

231231
# Ensure the target directory exists using create_folder
232-
create_folder(target_directory, loglevel="WARNING")
232+
create_folder(target_directory, loglevel=self.loglevel)
233233

234234
if not os.path.exists(target_file):
235235
self.logger.debug('Copying from %s to %s', source_file, target_file)
@@ -267,7 +267,7 @@ def _install_editable_diagnostics(self, diagnostic_type, editable):
267267
sys.exit(1)
268268

269269
# Ensure the target directory exists using create_folder
270-
create_folder(target_directory, loglevel="WARNING")
270+
create_folder(target_directory, loglevel=self.loglevel)
271271

272272
if not os.path.exists(target_file):
273273
self.logger.debug('Linking from %s to %s', source_file, target_file)
@@ -389,7 +389,7 @@ def add(self, args):
389389
args (argparse.Namespace): arguments from the command line
390390
"""
391391
print('Adding the AQUA catalog', args.catalog)
392-
self._check()
392+
self._check(silent=True)
393393

394394
if args.editable is not None:
395395
self._add_catalog_editable(args.catalog, args.editable)
@@ -564,10 +564,12 @@ def remove_file(self, args):
564564
kind, file)
565565
sys.exit(1)
566566

567-
def _check(self):
567+
def _check(self, silent=False):
568568
"""check installation"""
569+
570+
checklevel = 'ERROR' if silent else self.loglevel
569571
try:
570-
self.configpath = ConfigPath().configdir
572+
self.configpath = ConfigPath(loglevel=checklevel).configdir
571573
self.configfile = os.path.join(self.configpath, 'config-aqua.yaml')
572574
self.logger.debug('AQUA found in %s', self.configpath)
573575
except FileNotFoundError:

aqua/lra_generator/lra_generator.py

+132-43
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,8 @@
1919
from aqua.util import create_folder, generate_random_string
2020
from aqua.util import dump_yaml, load_yaml
2121
from aqua.util import ConfigPath, file_is_complete
22-
from aqua.lra_generator.lra_util import move_tmp_files, replace_intake_vars
23-
24-
25-
#from aqua.lra_generator.lra_util import check_correct_ifs_fluxes
22+
from aqua.util import create_zarr_reference
23+
from aqua.lra_generator.lra_util import move_tmp_files, list_lra_files_complete, replace_intake_vars
2624

2725

2826
class LRAgenerator():
@@ -94,47 +92,51 @@ def __init__(self,
9492
self.logger.warning('IMPORTANT: no file will be created, this is a dry run')
9593

9694
self.nproc = int(nproc)
97-
self.tmpdir = tmpdir
95+
if tmpdir is None:
96+
self.logger.warning('No tmpdir specified, will use outdir')
97+
self.tmpdir = os.path.join(outdir, 'tmp')
98+
else:
99+
self.tmpdir = tmpdir
100+
98101
if self.dask:
99102
self.logger.info('Running dask.distributed with %s workers', self.nproc)
100-
if not self.tmpdir:
101-
raise KeyError('Please specify tmpdir for dask.distributed.')
102103

103104
self.tmpdir = os.path.join(self.tmpdir, 'LRA_' +
104-
generate_random_string(10))
105+
generate_random_string(10))
105106

106-
if model:
107+
# safechecks
108+
if model is not None:
107109
self.model = model
108110
else:
109111
raise KeyError('Please specify model.')
110112

111-
if exp:
113+
if exp is not None:
112114
self.exp = exp
113115
else:
114116
raise KeyError('Please specify experiment.')
115117

116-
if source:
118+
if source is not None:
117119
self.source = source
118120
else:
119121
raise KeyError('Please specify source.')
120122

123+
if var is not None:
124+
self.var = var
125+
else:
126+
raise KeyError('Please specify variable string or list.')
127+
128+
if resolution is not None:
129+
self.resolution = resolution
130+
else:
131+
raise KeyError('Please specify resolution.')
132+
self.logger.info('Variable(s) to be processed: %s', self.var)
133+
121134
self.kwargs = kwargs
122135

123136
Configurer = ConfigPath(configdir=configdir)
124137
self.configdir = Configurer.configdir
125138
self.catalog = catalog
126139

127-
# Initialize variable(s)
128-
self.var = var
129-
130-
if not self.var:
131-
raise KeyError('Please specify variable string or list.')
132-
self.logger.info('Variable(s) to be processed: %s', self.var)
133-
134-
self.resolution = resolution
135-
if not self.resolution:
136-
raise KeyError('Please specify resolution.')
137-
138140
self.frequency = frequency
139141
if not self.frequency:
140142
self.logger.info('Frequency not specified, no time averaging will be performed.')
@@ -144,7 +146,7 @@ def __init__(self,
144146
'units': 'days since 1850-01-01 00:00:00',
145147
'calendar': 'standard',
146148
'dtype': 'float64'}
147-
149+
148150
self.var_encoding = {
149151
'dtype': 'float64',
150152
'zlib': True,
@@ -162,7 +164,10 @@ def __init__(self,
162164
self.last_record = None
163165
self.check = False
164166

165-
# Create LRA folder
167+
# Create LRA folders
168+
if outdir is None:
169+
raise KeyError('Please specify outdir.')
170+
166171
self.outdir = os.path.join(outdir, self.model, self.exp, self.resolution)
167172

168173
if self.frequency:
@@ -205,7 +210,7 @@ def retrieve(self):
205210

206211
self.logger.info('Retrieving data...')
207212
self.data = self.reader.retrieve(var=self.var)
208-
213+
209214
self.logger.debug(self.data)
210215

211216
def generate_lra(self):
@@ -223,10 +228,10 @@ def generate_lra(self):
223228

224229
else: # Only one variable
225230
self._write_var(self.var)
226-
231+
227232
self.logger.info('Move tmp files to output directory')
228233
move_tmp_files(self.tmpdir, self.outdir)
229-
234+
230235
# Cleaning
231236
self.data.close()
232237
self._close_dask()
@@ -282,6 +287,97 @@ def create_catalog_entry(self):
282287
# dump the update file
283288
dump_yaml(outfile=catalogfile, cfg=cat_file)
284289

290+
def create_zarr_entry(self, verify=True):
291+
"""
292+
Create a Zarr entry in the catalog for the LRA
293+
294+
Args:
295+
verify: open the LRA source and verify it can be read by the reader
296+
"""
297+
298+
entry_name = f'lra-{self.resolution}-{self.frequency}-zarr'
299+
full_dict, partial_dict = list_lra_files_complete(self.outdir)
300+
# full_dict, partial_dict = list_lra_files_vars(self.outdir)
301+
self.logger.info('Creating zarr files for %s %s %s', self.model, self.exp, entry_name)
302+
303+
# extra zarr only directory
304+
zarrdir = os.path.join(self.outdir, 'zarr')
305+
create_folder(zarrdir)
306+
307+
# this dictionary based structure is an overkill but guarantee flexibility
308+
urlpath = []
309+
for key, value in full_dict.items():
310+
jsonfile = os.path.join(zarrdir, f'lra-yearly-{key}.json')
311+
self.logger.debug('Creating zarr files for full files %s', key)
312+
if value:
313+
jsonfile = create_zarr_reference(value, jsonfile, loglevel=self.loglevel)
314+
if jsonfile is not None:
315+
urlpath = urlpath + [f'reference::{jsonfile}']
316+
317+
for key, value in partial_dict.items():
318+
jsonfile = os.path.join(zarrdir, f'lra-monthly-{key}.json')
319+
self.logger.debug('Creating zarr files for partial files %s', key)
320+
if value:
321+
jsonfile = create_zarr_reference(value, jsonfile, loglevel=self.loglevel)
322+
if jsonfile is not None:
323+
urlpath = urlpath + [f'reference::{jsonfile}']
324+
325+
if not urlpath:
326+
raise FileNotFoundError('No files found to create zarr reference')
327+
328+
# apply intake replacement: works on string need to loop on the list
329+
for index, value in enumerate(urlpath):
330+
urlpath[index] = replace_intake_vars(catalog=self.catalog, path=value)
331+
332+
# load, add the block and close
333+
catalogfile = os.path.join(self.configdir, 'catalogs', self.catalog,
334+
'catalog', self.model, self.exp + '.yaml')
335+
cat_file = load_yaml(catalogfile)
336+
337+
# if entry exists
338+
if entry_name in cat_file['sources']:
339+
340+
self.logger.info('Catalog entry for %s %s %s exists, updating the urlpath only...',
341+
self.model, self.exp, entry_name)
342+
cat_file['sources'][entry_name]['args']['urlpath'] = urlpath
343+
344+
else:
345+
self.logger.info('Creating zarr catalog entry %s %s %s', self.model, self.exp, entry_name)
346+
347+
# define the block to be uploaded into the catalog
348+
block_cat = {
349+
'driver': 'zarr',
350+
'description': f'LRA data {self.frequency} at {self.resolution} reference on zarr',
351+
'args': {
352+
'consolidated': False,
353+
'combine': 'by_coords',
354+
'urlpath': urlpath
355+
},
356+
'metadata': {
357+
'source_grid_name': 'lon-lat',
358+
},
359+
'fixer_name': False
360+
}
361+
cat_file['sources'][entry_name] = block_cat
362+
363+
dump_yaml(outfile=catalogfile, cfg=cat_file)
364+
365+
# verify the zarr entry makes sense
366+
if verify:
367+
self.logger.info('Verifying that zarr entry can be loaded...')
368+
try:
369+
reader = Reader(model=self.model, exp=self.exp, source='lra-r100-monthly-zarr')
370+
data = reader.retrieve()
371+
self.logger.info('Zarr entry successfully created!!!')
372+
except (KeyError, ValueError) as e:
373+
self.logger.error('Cannot load zarr LRA with error --> %s', e)
374+
self.logger.error('Zarr source is not accessible by the Reader, likely due to an irregular number of NetCDF files')
375+
self.logger.error('To avoid issues in the catalog, the entry will be removed')
376+
self.logger.error('In case you want to keep it, please run with verify=False')
377+
cat_file = load_yaml(catalogfile)
378+
del cat_file['sources'][entry_name]
379+
dump_yaml(outfile=catalogfile, cfg=cat_file)
380+
285381
def _set_dask(self):
286382
"""
287383
Set up dask cluster
@@ -319,15 +415,15 @@ def _concat_var_year(self, var, year):
319415
from the same year
320416
"""
321417

322-
#infiles = os.path.join(self.outdir,
418+
# infiles = os.path.join(self.outdir,
323419
# f'{var}_{self.exp}_{self.resolution}_{self.frequency}_{year}??.nc')
324420
infiles = self.get_filename(var, year, month = '??')
325421
if len(glob.glob(infiles)) == 12:
326422
xfield = xr.open_mfdataset(infiles)
327423
self.logger.info('Creating a single file for %s, year %s...', var, str(year))
328424
outfile = self.get_filename(var, year)
329-
#outfile = os.path.join(self.tmpdir,
330-
# f'{var}_{self.exp}_{self.resolution}_{self.frequency}_{year}.nc')
425+
# outfile = os.path.join(self.tmpdir,
426+
# f'{var}_{self.exp}_{self.resolution}_{self.frequency}_{year}.nc')
331427
# clean older file
332428
if os.path.exists(outfile):
333429
os.remove(outfile)
@@ -338,7 +434,6 @@ def _concat_var_year(self, var, year):
338434
self.logger.info('Cleaning %s...', infile)
339435
os.remove(infile)
340436

341-
342437
def get_filename(self, var, year=None, month=None, tmp=False):
343438
"""Create output filenames"""
344439

@@ -437,7 +532,7 @@ def _remove_regridded(self, data):
437532
# continue
438533
# else:
439534
# self.logger.warning('Monthly file %s already exists, overwriting as requested...', outfile)
440-
535+
441536
# # real writing
442537
# if self.definitive:
443538
# self.write_chunk(temp_data, outfile)
@@ -480,7 +575,7 @@ def _write_var_catalog(self, var):
480575
for year in years:
481576

482577
self.logger.info('Processing year %s...', str(year))
483-
yearfile = self.get_filename(var, year = year)
578+
yearfile = self.get_filename(var, year=year)
484579

485580
# checking if file is there and is complete
486581
filecheck = file_is_complete(yearfile, loglevel=self.loglevel)
@@ -498,7 +593,7 @@ def _write_var_catalog(self, var):
498593
months = [months[0]]
499594
for month in months:
500595
self.logger.info('Processing month %s...', str(month))
501-
outfile = self.get_filename(var, year = year, month = month)
596+
outfile = self.get_filename(var, year=year, month=month)
502597

503598
# checking if file is there and is complete
504599
filecheck = file_is_complete(outfile, loglevel=self.loglevel)
@@ -511,12 +606,9 @@ def _write_var_catalog(self, var):
511606

512607
month_data = year_data.sel(time=year_data.time.dt.month == month)
513608

514-
#self.logger.debug(month_data.mean().values)
515-
#self.logger.debug(month_data)
516-
517609
# real writing
518610
if self.definitive:
519-
tmpfile = self.get_filename(var, year = year, month = month, tmp = True)
611+
tmpfile = self.get_filename(var, year=year, month=month, tmp=True)
520612
schunk = time()
521613
self.write_chunk(month_data, tmpfile)
522614
tchunk = time() - schunk
@@ -537,7 +629,7 @@ def _write_var_catalog(self, var):
537629
def write_chunk(self, data, outfile):
538630
"""Write a single chunk of data - Xarray Dataset - to a specific file
539631
using dask if required and monitoring the progress"""
540-
632+
541633
# update data attributes for history
542634
if self.frequency:
543635
log_history(data, f'regridded from {self.reader.src_grid_name} to {self.resolution} and from frequency {self.reader.orig_freq} to {self.frequency} through LRA generator')
@@ -576,13 +668,10 @@ def write_chunk(self, data, outfile):
576668
avg_mem = np.mean(array_data[:, 1])/1e9
577669
max_mem = np.max(array_data[:, 1])/1e9
578670
self.logger.info('Avg memory used: %.2f GiB, Peak memory used: %.2f GiB', avg_mem, max_mem)
579-
671+
580672
else:
581673
with ProgressBar():
582674
write_job.compute()
583675

584676
del write_job
585677
self.logger.info('Writing file %s successful!', outfile)
586-
587-
588-

0 commit comments

Comments
 (0)