From c7de2faba1052fc9560e7d583055f068bd27c7d1 Mon Sep 17 00:00:00 2001
From: Friedemann Reum <freum@bgc-jena.mpg.de>
Date: Tue, 14 Sep 2021 11:38:32 +0200
Subject: [PATCH] Added code for running with WRF-Chem instead of TM5

This is the initial commit for running CTDAS with WRF-Chem instead of TM5.
The commit contains new files for the observation operator, statevector, rc
files and WRF-specific tools. In addition, it contains new files with
WRF-specific versions of the pipeline and of column observations. They are
based on Python 2 versions from mid-2019. In a future commit, the code will be
updated to work with the current versions of these files in the master branch.
However, in the case of column observations, this includes changes to the
input file format. To facilitate the migration from the original Python 2 code
to Python 3, the old file (and thus the old format) is kept in this commit.
---
 da/observations/obs_column_wrfchem.py    |  566 +++++++
 .../observationoperator_wrfchem.py       | 1400 +++++++++++++++++
 da/pipelines/pipeline_wrfchem.py         |  469 ++++++
 da/rc/wrfchem/dasystem_wrfchem.rc        |   77 +
 da/rc/wrfchem/obsoper_wrfchem.rc         |  141 ++
 da/statevectors/statevector_wrfchem.py   |  332 ++++
 da/tools/wrfchem/__init__.py             |    0
 da/tools/wrfchem/utilities.py            |  319 ++++
 da/tools/wrfchem/wrfchem_helper.py       |  990 ++++++++++++
 da/tools/wrfchem/wrfout_sampler.py       |  246 +++
 10 files changed, 4540 insertions(+)
 create mode 100644 da/observations/obs_column_wrfchem.py
 create mode 100755 da/obsoperators/observationoperator_wrfchem.py
 create mode 100755 da/pipelines/pipeline_wrfchem.py
 create mode 100644 da/rc/wrfchem/dasystem_wrfchem.rc
 create mode 100644 da/rc/wrfchem/obsoper_wrfchem.rc
 create mode 100644 da/statevectors/statevector_wrfchem.py
 create mode 100644 da/tools/wrfchem/__init__.py
 create mode 100644 da/tools/wrfchem/utilities.py
 create mode 100644 da/tools/wrfchem/wrfchem_helper.py
 create mode 100644 da/tools/wrfchem/wrfout_sampler.py

diff --git a/da/observations/obs_column_wrfchem.py b/da/observations/obs_column_wrfchem.py
new file mode 100644
index 00000000..38fea18b
--- /dev/null
+++ b/da/observations/obs_column_wrfchem.py
@@ -0,0 +1,566 @@
+"""CarbonTracker Data Assimilation Shell (CTDAS) Copyright (C) 2017 Wouter Peters.
+Users are recommended to contact the developers (wouter.peters@wur.nl) to receive
+updates of the code. See also: http://www.carbontracker.eu.
+
+This program is free software: you can redistribute it and/or modify it under the
+terms of the GNU General Public License as published by the Free Software Foundation,
+version 3. This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with this
+program. If not, see <http://www.gnu.org/licenses/>."""
+#!/usr/bin/env python
+# obs.py
+
+"""
+Author : liesbeth, freum
+
+Revision History:
+File created June 2019.
+
+On this version (freum):
+- Handles column observations. Based on obs_column_xco2.py from sometime mid 2019.
+- Since then, both versions have developed further. Since this also affects the file
+  format, I leave it as is for now for backwards compatibility for users of the
+  original Python 2 code.
+ +Most important revisions in this file from original: +- Added pressure weighting function +- Added optional footprint corner coordinates to improve sampling of models with + resolutions finer than the satellite footprint. This functionality is not validated. + +Changes to (original) obs_column_xco2.py not yet implemented here as of 2021-07-10: +- def __init__: New arguments h=0.0 (pressure weighting function), moderr=0.0, loc_L=0 +- Update to model-data-mismatch approach +- Approach to localization implemented here +- def add_observations(self, advance=False): new argument "advance" +- dimlayers +- self.additional_variables + +""" +import os +import sys +import logging + +import datetime as dtm +import numpy as np +#from string import strip +from numpy import array, logical_and, sqrt +sys.path.append(os.getcwd()) +sys.path.append('../../') + +identifier = 'CarbonTracker total column-averaged mole fractions' +version = '0.0' + +from da.observations.obs_baseclass import Observations +import da.tools.io4 as io +import da.tools.rc as rc + + +################### Begin Class TotalColumnSample ################### + +class TotalColumnSample(object): + """ + Holds the data that defines a total column sample in the data assimilation framework. Sor far, this includes all + attributes listed below in the __init__ method. One can additionally make more types of data, or make new + objects for specific projects. + This file may contain OCO-2 specific parts... + """ + + def __init__(self, idx, codex, xdate, obs=0.0, simulated=0.0, lat=-999., lon=-999., mdm=None, prior=0.0, prior_profile=0.0, av_kernel=0.0, pressure=0.0, \ + ##### freum vvvv + pressure_weighting_function=None, + ##### freum ^^^^ + level_def = "pressure_boundary", psurf = float('nan'), resid=0.0, hphr=0.0, flag=0, species='co2', sdev=0.0, \ + ##### freum vvvv + latc_0=None, latc_1=None, latc_2=None, latc_3=None, lonc_0=None, lonc_1=None, lonc_2=None, lonc_3=None \ + ##### freum ^^^^ + ): + self.id = idx # Sounding ID + self.code = codex # Retrieval ID + self.xdate = xdate # Date of obs + self.obs = obs # Value observed + self.simulated = simulated # Value simulated by model, fillvalue = -9999 + self.lat = lat # Sample lat + self.lon = lon # Sample lon + ##### freum vvvv + self.latc_0 = latc_0 # Sample latitude corner + self.latc_1 = latc_1 # Sample latitude corner + self.latc_2 = latc_2 # Sample latitude corner + self.latc_3 = latc_3 # Sample latitude corner + self.lonc_0 = lonc_0 # Sample longitude corner + self.lonc_1 = lonc_1 # Sample longitude corner + self.lonc_2 = lonc_2 # Sample longitude corner + self.lonc_3 = lonc_3 # Sample longitude corner + ##### freum ^^^^ + self.mdm = mdm # Model data mismatch + self.prior = prior # A priori column value used in retrieval + self.prior_profile = prior_profile # A priori profile used in retrieval + self.av_kernel = av_kernel # Averaging kernel + self.pressure = pressure # Pressure levels of retrieval + # freum vvvv + self.pressure_weighting_function = pressure_weighting_function # Pressure weighting function + # freum ^^^^ + self.level_def = level_def # Are prior and averaging kernel defined as layer averages? + self.psurf = psurf # Surface pressure (only needed if level_def is "layer_average") + self.loc_L = int(0) # freum 2021-07-13: insert this dummy value so the code runs with the current version of CTDAS. *Should* not affect results if localizetype == "CT2007" as in all my runs. 
However, replace this file with the standard observation file, obs_column_xco2.py + + self.resid = resid # Mole fraction residuals + self.hphr = hphr # Mole fraction prior uncertainty from fluxes and (HPH) and model data mismatch (R) + self.may_localize = True # Whether sample may be localized in optimizer + self.may_reject = True # Whether sample may be rejected if outside threshold + self.flag = flag # Flag + self.sdev = sdev # standard deviation of ensemble + self.species = species.strip() + + +################### End Class TotalColumnSample ################### + + +################### Begin Class TotalColumnObservations ################### + +class TotalColumnObservations(Observations): + """ An object that holds data + methods and attributes needed to manipulate column samples + """ + + def setup(self, dacycle): + + self.startdate = dacycle['time.sample.start'] + self.enddate = dacycle['time.sample.end'] + + # Path to the input data (daily files) + sat_files = dacycle.dasystem['obs.column.ncfile'].split(',') + sat_dirs = dacycle.dasystem['obs.column.input.dir'].split(',') + + self.sat_dirs = [] + self.sat_files = [] + for i in range(len(sat_dirs)): + if not os.path.exists(sat_dirs[i].strip()): + msg = 'Could not find the required satellite input directory (%s) ' % sat_dirs[i] + logging.error(msg) + raise IOError(msg) + else: + self.sat_dirs.append(sat_dirs[i].strip()) + self.sat_files.append(sat_files[i].strip()) + del i + + # Get observation selection criteria (if present): + if 'obs.column.selection.variables' in dacycle.dasystem.keys() and 'obs.column.selection.criteria' in dacycle.dasystem.keys(): + self.selection_vars = dacycle.dasystem['obs.column.selection.variables'].split(',') + self.selection_criteria = dacycle.dasystem['obs.column.selection.criteria'].split(',') + logging.debug('Data selection criteria found: %s, %s' %(self.selection_vars, self.selection_criteria)) + else: + self.selection_vars = [] + self.selection_criteria = [] + logging.info('No data observation selection criteria found, using all observations in file.') + + # Model data mismatch approach + self.mdm_calculation = dacycle.dasystem.get('mdm.calculation') + logging.debug('mdm.calculation = %s' %self.mdm_calculation) + if not self.mdm_calculation in ['parametrization','empirical','no_transport_error']: + logging.warning('No valid model data mismatch method found. Valid options are \'parametrization\' and \'empirical\'. ' + \ + 'Using a constant estimate for the model uncertainty of 1ppm everywhere.') + else: + logging.info('Model data mismatch approach = %s' %self.mdm_calculation) + + # Path to file with observation error settings for column observations + # Currently the same settings for all assimilated retrieval products: should this be one file per product? 
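
The mdm.calculation options above differ only in the model-error term that is added in quadrature to the retrieval uncertainty in add_model_data_mismatch further down. A minimal stand-alone sketch of that combination (the function name and example values are illustrative, not part of the patch):

    import numpy as np

    def combined_mdm(retrieval_unc, lat, mdm_calculation):
        """Combine retrieval uncertainty with a model error estimate (ppm)."""
        if mdm_calculation == 'parametrization':
            # latitude-dependent model error as used by Frederic Chevallier
            model_err = 0.8 * np.exp((90.0 + lat) / 300.0)
        elif mdm_calculation == 'no_transport_error':
            model_err = 0.0
        else:
            # fallback: a constant model uncertainty of 1 ppm
            model_err = 1.0
        return np.sqrt(retrieval_unc ** 2 + model_err ** 2)

    # e.g. a sounding at 50 deg N with 0.7 ppm retrieval uncertainty:
    print(combined_mdm(0.7, 50.0, 'parametrization'))
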
+ if not os.path.exists(dacycle.dasystem['obs.column.rc']): + msg = 'Could not find the required column observation .rc input file (%s) ' % dacycle.dasystem['obs.column.rc'] + logging.error(msg) + raise IOError(msg) + else: + self.obs_file = (dacycle.dasystem['obs.column.rc']) + + self.datalist = [] + + # Switch to indicate whether simulated column samples are read from obsOperator output, + # or whether the sampling is done within CTDAS (in obsOperator class) + self.sample_in_ctdas = dacycle.dasystem['sample.in.ctdas'] if 'sample.in.ctdas' in dacycle.dasystem.keys() else False + logging.debug('sample.in.ctdas = %s' % self.sample_in_ctdas) + + + + def get_samples_type(self): + return 'column' + + + + def add_observations(self): + """ Reading of total column observations, and selection of observations that will be sampled and assimilated. + + """ + + # Read observations from daily input files + for i in range(len(self.sat_dirs)): + + logging.info('Reading observations from %s' %os.path.join(self.sat_dirs[i],self.sat_files[i])) + + infile0 = os.path.join(self.sat_dirs[i], self.sat_files[i]) + ndays = 0 + + while self.startdate+dtm.timedelta(days=ndays) <= self.enddate: + + infile = infile0.replace("<YYYYMMDD>",(self.startdate+dtm.timedelta(days=ndays)).strftime("%Y%m%d")) + + if os.path.exists(infile): + + logging.info("Reading observations for %s" % (self.startdate+dtm.timedelta(days=ndays)).strftime("%Y%m%d")) + len_init = len(self.datalist) + + # get index of observations that satisfy selection criteria (based on variable names and values in system rc file, if present) + ncf = io.ct_read(infile, 'read') + if self.selection_vars: + selvars = [] + for j in self.selection_vars: + selvars.append(ncf.get_variable(j.strip())) + del j + criteria = [] + for j in range(len(self.selection_vars)): + criteria.append(eval('selvars[j]'+self.selection_criteria[j])) + del j + #criteria = [eval('selvars[i]'+self.selection_criteria[i]) for i in range(len(self.selection_vars))] + subselect = np.logical_and.reduce(criteria).nonzero()[0] + else: + subselect = np.arange(ncf.get_variable('sounding_id').size) + + # retrieval attributes + code = ncf.get_attribute('retrieval_id') + level_def = ncf.get_attribute('level_def') + + # only read good quality observations + ids = ncf.get_variable('sounding_id').take(subselect, axis=0) + lats = ncf.get_variable('latitude').take(subselect, axis=0) + lons = ncf.get_variable('longitude').take(subselect, axis=0) + obs = ncf.get_variable('obs').take(subselect, axis=0) + unc = ncf.get_variable('uncertainty').take(subselect, axis=0) + dates = ncf.get_variable('date').take(subselect, axis=0) + dates = array([dtm.datetime(*d) for d in dates]) + av_kernel = ncf.get_variable('averaging_kernel').take(subselect, axis=0) + prior_profile = ncf.get_variable('prior_profile').take(subselect, axis=0) + pressure = ncf.get_variable('pressure_levels').take(subselect, axis=0) + + prior = ncf.get_variable('prior').take(subselect, axis=0) + + ##### freum vvvv + pwf = ncf.get_variable('pressure_weighting_function').take(subselect, axis=0) + + # Additional variable surface pressure in case the profiles are defined as layer averages + if level_def == "layer_average": + psurf = ncf.get_variable('surface_pressure').take(subselect, axis=0) + else: + psurf = [float('nan')]*len(ids) + + # Optional: footprint corners + latc = dict( + latc_0=[float('nan')]*len(ids), + latc_1=[float('nan')]*len(ids), + latc_2=[float('nan')]*len(ids), + latc_3=[float('nan')]*len(ids)) + lonc = dict( + 
lonc_0=[float('nan')]*len(ids), + lonc_1=[float('nan')]*len(ids), + lonc_2=[float('nan')]*len(ids), + lonc_3=[float('nan')]*len(ids)) + # If one footprint corner variable is there, assume + # all are there. That's the only case that makes sense + if 'latc_0' in list(ncf.variables.keys()): + latc['latc_0'] = ncf.get_variable('latc_0').take(subselect, axis=0) + latc['latc_1'] = ncf.get_variable('latc_1').take(subselect, axis=0) + latc['latc_2'] = ncf.get_variable('latc_2').take(subselect, axis=0) + latc['latc_3'] = ncf.get_variable('latc_3').take(subselect, axis=0) + lonc['lonc_0'] = ncf.get_variable('lonc_0').take(subselect, axis=0) + lonc['lonc_1'] = ncf.get_variable('lonc_1').take(subselect, axis=0) + lonc['lonc_2'] = ncf.get_variable('lonc_2').take(subselect, axis=0) + lonc['lonc_3'] = ncf.get_variable('lonc_3').take(subselect, axis=0) + ###### freum ^^^^ + + ncf.close() + + # Add samples to datalist + # Note that the mdm is initialized here equal to the measurement uncertainty. This value is used in add_model_data_mismatch to calculate the mdm including model error + for n in range(len(ids)): + # Check for every sounding if time is between start and end time (relevant for first and last days of window) + if self.startdate <= dates[n] <= self.enddate: + self.datalist.append(TotalColumnSample(ids[n], code, dates[n], obs[n], None, lats[n], lons[n], unc[n], prior[n], prior_profile[n,:], \ + av_kernel=av_kernel[n,:], pressure=pressure[n,:], pressure_weighting_function=pwf[n,:],level_def=level_def,psurf=psurf[n], + ##### freum vvvv + latc_0=latc['latc_0'][n], latc_1=latc['latc_1'][n], latc_2=latc['latc_2'][n], latc_3=latc['latc_3'][n], + lonc_0=lonc['lonc_0'][n], lonc_1=lonc['lonc_1'][n], lonc_2=lonc['lonc_2'][n], lonc_3=lonc['lonc_3'][n] + ##### freum ^^^^ + )) + + logging.debug("Added %d observations to the Data list" % (len(self.datalist)-len_init)) + + ndays += 1 + + del i + + if len(self.datalist) > 0: + logging.info("Observations list now holds %d values" % len(self.datalist)) + else: + logging.info("No observations found for sampling window") + + + + def add_model_data_mismatch(self, filename=None, advance=False): + """ This function is empty: model data mismatch calculation is done during sampling in observation operator (TM5) to enhance computational efficiency + (i.e. to prevent reading all soundings twice and writing large additional files) + + """ + obs_data = rc.read(self.obs_file) + self.rejection_threshold = int(obs_data['obs.rejection.threshold']) + + # At this point mdm is set to the measurement uncertainty only, added in the add_observations function. + # Here this value is used to set the combined mdm by adding an estimate for the model uncertainty as a sum of squares. + if len(self.datalist) <= 1: return #== 0: return + for obs in self.datalist: + # parametrization as used by Frederic Chevallier + if self.mdm_calculation == 'parametrization': + obs.mdm = ( obs.mdm*obs.mdm + (0.8*np.exp((90.0+obs.lat)/300.0))**2 )**0.5 + # empirical approach of Andy Jacobson, TO BE IMPLEMENTED + # elif self.mdm_calculation == 'empirical': + # obs.mdm = ... 
+ elif self.mdm_calculation == 'no_transport_error': + pass + else: # assume general model uncertainty of 1 ppm (arbitrary value) + obs.mdm = ( obs.mdm*obs.mdm + 1**2 )**0.5 + del obs + + meanmdm = np.average(np.array( [obs.mdm for obs in self.datalist] )) + logging.debug('Mean MDM = %s' %meanmdm) + + + + def add_simulations(self, filename, silent=False): + """ Adds observed and model simulated column values to the mole fraction objects + This function includes the add_observations and add_model_data_mismatch functionality for the sake of computational efficiency + + """ + + if self.sample_in_ctdas: + logging.debug("CODE TO ADD SIMULATED SAMPLES TO DATALIST TO BE ADDED") + + else: + # read simulated samples from file + if not os.path.exists(filename): + msg = "Sample output filename for observations could not be found : %s" % filename + logging.error(msg) + logging.error("Did the sampling step succeed?") + logging.error("...exiting") + raise IOError(msg) + + ncf = io.ct_read(filename, method='read') + ids = ncf.get_variable('sounding_id') + simulated = ncf.get_variable('column_modeled') + ncf.close() + logging.info("Successfully read data from model sample file (%s)" % filename) + + obs_ids = self.getvalues('id').tolist() + + missing_samples = [] + + # Match read simulated samples with observations in datalist + logging.info("Adding %i simulated samples to the data list..." % len(ids)) + for i in range(len(ids)): + # Assume samples are in same order in both datalist and file with simulated samples... + if ids[i] == obs_ids[i]: + self.datalist[i].simulated = simulated[i] + # If not, find index of current sample + elif ids[i] in obs_ids: + index = obs_ids.index(ids[i]) + # Only add simulated value to datalist if sample has not been filled before. Otherwise: exiting + if self.datalist[index].simulated is not None: + msg = 'Simulated and observed samples not in same order, and duplicate sample IDs found.' + logging.error(msg) + raise IOError(msg) + else: + self.datalist[index].simulated = simulated[i] + else: + logging.debug('added %s to missing_samples, obs id = %s' %(ids[i],obs_ids[i])) + missing_samples.append(ids[i]) + del i + + if not silent and missing_samples != []: + logging.warning('%i Model samples were found that did not match any ID in the observation list. Skipping them...' % len(missing_samples)) + + # if number of simulated samples < observations: remove observations without samples + if len(simulated) < len(self.datalist): + test = len(self.datalist) - len(simulated) + logging.warning('%i Observations were not sampled, removing them from datalist...' % test) + for index in reversed(list(range(len(self.datalist)))): + if self.datalist[index].simulated is None: + del self.datalist[index] + del index + + logging.debug("%d simulated values were added to the data list" % (len(ids) - len(missing_samples))) + + + + def write_sample_coords(self, obsinputfile): + """ + Write empty sample_coords_file if soundings are present in time interval, just such that general pipeline code does not have to be changed... 
+ """ + + if self.sample_in_ctdas: + return + + if len(self.datalist) <= 1: #== 0: + logging.info("No observations found for this time period, no obs file written") + return + + # write data required by observation operator for sampling to file + f = io.CT_CDF(obsinputfile, method='create') + logging.debug('Creating new observations file for ObservationOperator (%s)' % obsinputfile) + + dimsoundings = f.add_dim('soundings', len(self.datalist)) + dimdate = f.add_dim('epoch_dimension', 7) + dimchar = f.add_dim('char', 20) + if len(self.datalist) == 1: + dimlevels = f.add_dim('levels', len(self.getvalues('pressure'))) + # freum: inserted but commented Liesbeth's new code for layers for reference, + # but I handle them differently. + # if len(self.getvalues('av_kernel')) != len(self.getvalues('pressure')): + # dimlayers = f.add_dim('layers',len(self.getvalues('av_kernel'))) + # layers = True + # else: layers = False + else: + dimlevels = f.add_dim('levels', self.getvalues('pressure').shape[1]) + # if self.getvalues('av_kernel').shape[1] != self.getvalues('pressure').shape[1]: + # dimlayers = f.add_dim('layers', self.getvalues('pressure').shape[1] - 1) + # layers = True + # else: layers = False + + savedict = io.std_savedict.copy() + savedict['dtype'] = "int64" + savedict['name'] = "sounding_id" + savedict['dims'] = dimsoundings + savedict['values'] = self.getvalues('id').tolist() + f.add_data(savedict) + + data = [[d.year, d.month, d.day, d.hour, d.minute, d.second, d.microsecond] for d in self.getvalues('xdate') ] + savedict = io.std_savedict.copy() + savedict['dtype'] = "int" + savedict['name'] = "date" + savedict['dims'] = dimsoundings + dimdate + savedict['values'] = data + f.add_data(savedict) + + savedict = io.std_savedict.copy() + savedict['name'] = "latitude" + savedict['dims'] = dimsoundings + savedict['values'] = self.getvalues('lat').tolist() + f.add_data(savedict) + + savedict = io.std_savedict.copy() + savedict['name'] = "longitude" + savedict['dims'] = dimsoundings + savedict['values'] = self.getvalues('lon').tolist() + f.add_data(savedict) + + savedict = io.std_savedict.copy() + savedict['name'] = "prior" + savedict['dims'] = dimsoundings + savedict['values'] = self.getvalues('prior').tolist() + f.add_data(savedict) + + savedict = io.std_savedict.copy() + savedict['name'] = "prior_profile" + savedict['dims'] = dimsoundings + dimlevels + savedict['values'] = self.getvalues('prior_profile').tolist() + f.add_data(savedict) + + savedict = io.std_savedict.copy() + savedict['name'] = "averaging_kernel" + savedict['dims'] = dimsoundings + dimlevels + savedict['values'] = self.getvalues('av_kernel').tolist() + f.add_data(savedict) + + savedict = io.std_savedict.copy() + savedict['name'] = "pressure_levels" + savedict['dims'] = dimsoundings + dimlevels + savedict['values'] = self.getvalues('pressure').tolist() + f.add_data(savedict) + + # freum vvvv + savedict = io.std_savedict.copy() + savedict['name'] = "pressure_weighting_function" + savedict['dims'] = dimsoundings + dimlevels + savedict['values'] = self.getvalues('pressure_weighting_function').tolist() + f.add_data(savedict) + + savedict = io.std_savedict.copy() + savedict['name'] = "latc_0" + savedict['dims'] = dimsoundings + savedict['values'] = self.getvalues('latc_0').tolist() + f.add_data(savedict) + + savedict = io.std_savedict.copy() + savedict['name'] = "latc_1" + savedict['dims'] = dimsoundings + savedict['values'] = self.getvalues('latc_1').tolist() + f.add_data(savedict) + + savedict = io.std_savedict.copy() + 
savedict['name'] = "latc_2" + savedict['dims'] = dimsoundings + savedict['values'] = self.getvalues('latc_2').tolist() + f.add_data(savedict) + + savedict = io.std_savedict.copy() + savedict['name'] = "latc_3" + savedict['dims'] = dimsoundings + savedict['values'] = self.getvalues('latc_3').tolist() + f.add_data(savedict) + + savedict = io.std_savedict.copy() + savedict['name'] = "lonc_0" + savedict['dims'] = dimsoundings + savedict['values'] = self.getvalues('lonc_0').tolist() + f.add_data(savedict) + + savedict = io.std_savedict.copy() + savedict['name'] = "lonc_1" + savedict['dims'] = dimsoundings + savedict['values'] = self.getvalues('lonc_1').tolist() + f.add_data(savedict) + + savedict = io.std_savedict.copy() + savedict['name'] = "lonc_2" + savedict['dims'] = dimsoundings + savedict['values'] = self.getvalues('lonc_2').tolist() + f.add_data(savedict) + + savedict = io.std_savedict.copy() + savedict['name'] = "lonc_3" + savedict['dims'] = dimsoundings + savedict['values'] = self.getvalues('lonc_3').tolist() + f.add_data(savedict) + + # freum ^^^^ + + savedict = io.std_savedict.copy() + savedict['dtype'] = "char" + savedict['name'] = "level_def" + savedict['dims'] = dimsoundings + dimchar + savedict['values'] = self.getvalues('level_def').tolist() + f.add_data(savedict) + + savedict = io.std_savedict.copy() + savedict['name'] = "psurf" + savedict['dims'] = dimsoundings + savedict['values'] = self.getvalues('psurf').tolist() + f.add_data(savedict) + + f.close() + + + + + +################### End Class TotalColumnObservations ################### + + +if __name__ == "__main__": + pass diff --git a/da/obsoperators/observationoperator_wrfchem.py b/da/obsoperators/observationoperator_wrfchem.py new file mode 100755 index 00000000..a2e9bc2f --- /dev/null +++ b/da/obsoperators/observationoperator_wrfchem.py @@ -0,0 +1,1400 @@ +"""CarbonTracker Data Assimilation Shell (CTDAS) Copyright (C) 2017 Wouter Peters. +Users are recommended to contact the developers (wouter.peters@wur.nl) to receive +updates of the code. See also: http://www.carbontracker.eu. + +This program is free software: you can redistribute it and/or modify it under the +terms of the GNU General Public License as published by the Free Software Foundation, +version 3. This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with this +program. If not, see <http://www.gnu.org/licenses/>. + +Author : Friedemann Reum + +Revision History: +File created on 26 March 2019. + +This module holds functions needed to use the WRF-Chem model within CTDAS. It +is based on the TM5 observation operator by Wouter Peters. + +The input is an rc file that defines a source directory and a run directory. +The source directory is a directory in which wrf.exe can be called +(e.g. WRFV3/run/). I.e., it assumes that WRF-Chem was compiled, all +preprocessing was done and the outputs linked to the run +directory, and the correct namelist.input is there. namelist.input will +be modified in the run directory (e.g. start- and end times adapted to +each cycle). +""" + +# Instructions for pylint (e.g. 
for code analysis in Spyder): +# pylint: disable=too-many-instance-attributes +# pylint: disable=W0201 +# pylint: disable=C0301 + +# Python libraries +import os +import logging +import shutil +import subprocess +import glob +import re +import datetime as dtm +import numpy as np +import netCDF4 as nc +import f90nml + +# CTDAS modules +import da.tools.rc as rc +from da.tools.general import create_dirs +from da.obsoperators.observationoperator_baseclass import ObservationOperator +from da.tools.wrfchem.wrfchem_helper import WRFChemHelper +from da.tools.wrfchem.utilities import utilities + +identifier = "wrfchem" +version = "1.0" + +class WRFChemOO(ObservationOperator): + """This class holds methods and variables that are needed to run the WRF-Chem model.""" + + ########## Public methods + + def __init__(self, rc_filename, dacycle=None): + """Initialize WRFChemOO using settings in rc_filename.""" + + # Identify yourself + self.ID = identifier + self.version = version + + # Load settings + self._load_rc(rc_filename) + self._validate_rc() + + # A hardcoded setting (doesnt make sense to allow user to change + # it) + self.settings["original_save_suffix"] = ".original" + + # Instantiate a WRFChemHelper object + self.wrfhelper = WRFChemHelper(self.settings) + self.wrfhelper.validate_settings(["run_dir"]) + + logging.info("Observation Operator initialized: %s (%s)", + self.ID, self.version) + + + def setup(self, dacycle): + """Prepare the observation operator for running. + + This includes reading settings and preparing the run directory. + """ + + # Attach the CycleControl object (used for exchanging + # information between classes) + self.dacycle = dacycle + + # Infer whether initial and boundary conditions will be + # optimized. Only a constant offset can be optimized, + # hence the name. + self.dacycle.dasystem["do_offset"] = \ + "sigma_offset" in list(self.dacycle.dasystem.keys()) + + # Set up the directory where WRF is to be run. + # Only do this if you are the first data assimilation cycle. The + # WRF run directory is reused in subsequent runs to save both + # space and time. + if self.dacycle["time.restart"] is False: + logging.info("This is the first data assimilation cycle.") + + # Not overwriting existing files, so that if a run is + # started again (e.g. when it fails), the existing files can + # be used. This makes sense because setting up the directory + # can take a long time because WRF input files for ensemble + # runs are large. + overwrite = False + + self._create_fresh_run_dir(overwrite) + + logging.info("Fresh run dir for WRFChem created: %s", + self.settings["run_dir"]) + + # Read WRF namelist (= configuration file) + self.namelist = self.wrfhelper.read_namelist(self.settings["run_dir"]) + + # Also tell wrfhelper about the namelist. + self.wrfhelper.namelist = self.namelist + + # Some setup tasks only for the first data assimilation cycle. + if self.dacycle["time.restart"] is False: + # Prepare fluxes files for modification by optimizer + self._prepare_flux_files(overwrite) + + # In case initial/boundary conditions are optimized, save + # original wrfinput and wrfbdy files. They serve as + # sources for prior ic/bc. 
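
The originals saved here are reused every cycle by write_members_to_file below: when do_offset is set, the last element of each member's state vector is added as a constant to that member's tracer fields in wrfinput and to the four lateral boundary arrays in wrfbdy_d01. A minimal sketch of that update for a single member and the boundary file (the tracer name and offset value are illustrative; the per-member variables such as CO2_000_BXS are assumed to exist already):

    import netCDF4 as nc

    member, offset = 0, 1.5          # member index and its offset parameter (ppm)
    tracer = "CO2"                   # value of the tracer_optim rc setting (assumed)
    suffix = ".original"             # original_save_suffix defined above

    src = nc.Dataset("wrfbdy_d01" + suffix, "r")   # unperturbed boundary conditions
    dst = nc.Dataset("wrfbdy_d01", "r+")           # working copy read by WRF
    for q in ["BXS", "BXE", "BYS", "BYE"]:         # the four lateral boundaries
        name = "%s_%03d_%s" % (tracer, member, q)
        dst.variables[name][:] = src.variables[name][:] + offset
    src.close()
    dst.close()
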
+ if self.dacycle.dasystem["do_offset"]: + wrfinput = self.wrfhelper.get_wrf_filenames("wrfinput_d*") + wrfbdy = self.wrfhelper.get_wrf_filenames("wrfbdy_d01") + self._save_original_files(wrfinput + wrfbdy, overwrite) + logging.info("Original initial- and boundary " + \ + "conditions files saved.") + + logging.info("Setup done.") + + + def get_initial_data(self): + """Nothing to do, all preparations moved to _prepare_run""" + pass + + + def run_forecast_model(self, samples=None): + """Run the transport model and sample simulated observations""" + + self._prepare_run(samples) + self._validate_input() + self._run() + self._sample_simulated_obs(samples) + self._save_data() + + + def write_members_to_file(self, lag, outdir, statevector):#,endswith=".nc"): + """Write ensemble member information to file for WRFChem + + This writes fluxes in wrfchemi* files for one lag. This is based + on the statevector object, which is passed as argument, the + regionsfile and the namelist (for dealing with nests). + + In case an offset is optimized, wrfinput and wrfbdy files are + also modified accordingly. + + "outdir" is not used. + """ + + # Some abbreviations: + + # Data to write + members = statevector.ensemble_members[lag] + add_flux_fix = "flux_fix" in list(self.settings.keys()) + # Number of flux parameters per flux process. + # freum 2021-07-11: Use statevector.nparams_proc. This here + # would have been wrong if an offset was included and multiple + # emission processes used. + # nparams = int(self.dacycle.dasystem["nparameters"]) + # nparams_proc = nparams/int(self.dacycle.dasystem["n_emis_proc"]) + nparams_proc = statevector.nparams_proc + # Number of flux processes + n_emis_proc = int(self.dacycle.dasystem["n_emis_proc"]) + # WRF configuration + run_dir = self.settings["run_dir"] + n_domains = self.namelist["domains"]["max_dom"] + nmember_format = "%03d" + + + # Get member parameter maps for each domain + param_map_mem_dom = list() + for nproc in range(n_emis_proc): + # New parameter map list for this flux process + param_map_mem_dom.append(list()) + for mem in members: + # Get full parameter map for this member + param_map_mem = statevector.vector2grid( + mem.param_values[(nparams_proc*nproc):(nparams_proc*(nproc+1))]) + # Get domain-specific parameter maps from full map + nm = mem.membernumber + param_map_mem_dom[nproc].append(list()) + for domain in range(1, n_domains+1): + param_map_mem_dom[nproc][nm].append(self._sample_domain_map(param_map_mem, domain)) + + # Write fluxes for each domain: + + # Times that correspond to this lag + time_lag_start = self.dacycle["time.sample.start"] + time_lag_end = self.dacycle["time.sample.end"] + + for domain in range(1, n_domains+1): # WRF domain numbers start at 1 + + # Get the start times of wrfchemi files that belong to this lag + # First, get all times + file_times = self.wrfhelper.wrf_filename_times("wrfchemi_d%02d_"%domain) + file_times.sort() + # Then filter for the current lag. + file_times_lag = [file_times[i] for i in range(len(file_times)) + if + file_times[i] >= time_lag_start + and file_times[i] < time_lag_end + ] + + # Loop over flux files + for file_time in file_times_lag: + # Get file path + fn = "".join(["wrfchemi_d%02d_" % domain, + file_time.strftime("%Y-%m-%d_%H:%M:%S")]) + fp = os.path.join(run_dir, fn) + + # Path to original file + ofp = fp + self.settings["original_save_suffix"] + + # Which times in file to modify? + # Warning: I always use flux files with only one time + # index, so this part isn't tested. 
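
The time selection that follows reduces to a half-open interval test, time.sample.start <= t < time.sample.end, on the times stored in the flux file, plus a contiguity check so that the hits can be written as a single slice. A stand-alone sketch with invented hourly time stamps:

    import datetime as dtm
    import numpy as np

    lag_start = dtm.datetime(2016, 1, 1, 0)
    lag_end = dtm.datetime(2016, 1, 2, 0)
    # hourly flux times covering two days (illustrative)
    flux_times = [dtm.datetime(2016, 1, 1) + dtm.timedelta(hours=h) for h in range(48)]

    indices = [i for i, t in enumerate(flux_times) if lag_start <= t < lag_end]
    if np.any(np.diff(indices) != 1):
        raise ValueError("flux times in the selected window are not contiguous")
    i0, i1 = indices[0], indices[-1] + 1   # slice bounds, used as field[i0:i1, ...]
    print(i0, i1)                          # -> 0 24
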
+ + # First, get all times + src = nc.Dataset(ofp, "r") + flux_times = self.wrfhelper.times_in_wrf_file(src) + # Then filter for the current lag + indices = [i for i in range(len(flux_times)) + if + flux_times[i] >= time_lag_start + and flux_times[i] < time_lag_end + ] + # And finally make indices for slicing + # This assumes that flux times are monotonically + # increasing, so check that. + if np.any(np.diff(indices) != 1): + raise ValueError("Times in flux file " + \ + "'%s' " % ofp + \ + "are not monotonically increasing") + i0 = indices[0] + i1 = indices[-1] + 1 + + logging.debug("Writing fluxes to file %s, " % fp + \ + "time index slice %d:%d" %(i0, i1)) + + # Open destination file + dst = nc.Dataset(fp, "r+") + + # Read prior fluxes + prefix = self.settings["flux_optim"] + src_field = list() + for nproc in range(n_emis_proc): + if n_emis_proc==1: + src_varname = prefix + else: + src_varname = prefix + "_proc_" + str(nproc+1) + src_var = src.variables[src_varname] + src_field.append(src_var[i0:i1, :, :, :]) + + # Read fixed fluxes (if required) + if add_flux_fix: + fix_var = src.variables[self.settings["flux_fix"]] + fix_field = fix_var[i0:i1, :, :, :] + + # Write fluxes for every ensemble member + for mem in members: + + # Test: + # For testing the orienation of param_map_domain and + # src_field, make the bottom right corner 1. + # param_map_domain[0, -1] = 1 + # src_field[0, 0, 0, -1] = 1 # This was before n_emis_proc + + # Multiply prior fluxes with statevector map + field_state = np.zeros_like(src_field[0]) + nm = mem.membernumber + for nproc in range(n_emis_proc): + for nt in range(field_state.shape[0]): + for nz in range(field_state.shape[1]): + field_state[nt, nz, :, :] = ( + field_state[nt, nz, :, :] + + src_field[nproc][nt, nz, :, :] + * param_map_mem_dom[nproc][nm][domain-1][:] + ) + + # For testing whether the domain map is correct: + # field_state[nt, nz, :, :] = param_map_domain[:] + + # Add fixed fluxes + if add_flux_fix: + field_state += fix_field + + # Write to file handle (written to file when + # dst.close() is called) + varname = prefix + "_" + nmember_format % nm + dst.variables[varname][i0:i1, :, :, :] = field_state[:] + + src.close() + dst.close() + + # For debugging, copy dst dataset + #path = os.path.join(self.dacycle["dir.da_run"], "output") + #time_lag_start = self.dacycle["time.sample.start"] + #timestamp = time_lag_start.strftime("_lag%Y%m%d") + #self.wrfhelper.save_file_with_timestamp(fp, path, timestamp) + + # Add offset to wrfinput and wrfbdy + # Note: Everything but a constant offset would impact _BTXE + # and so on in wrfbdy. So only implementing a constant offset. + if self.dacycle.dasystem["do_offset"]: + if self.dacycle["time.restart"]: + logging.warning( + "do_offset = True and time.restart = True. Is " + \ + "this an advance step? If not, you are " + \ + "attempting to optimize initial and boundary " + \ + "conditions in a run with more than one cycle. " + \ + "This would produce an inconsistent result, " + \ + "because initial conditions also impact " + \ + "observations in previous cycles, but this " + \ + "isn't accounted for. Skipping." 
+ ) + else: + # Write wrfbdy + fp = os.path.join(self.settings["run_dir"], "wrfbdy_d01") + ofp = fp + self.settings["original_save_suffix"] + src = nc.Dataset(ofp, "r") + dst = nc.Dataset(fp, "r+") + quantities = ["BXS", "BXE", "BYS", "BYE"] + for mem in members: + offset_value = mem.param_values[-1] + var_names = [self.settings["tracer_optim"] + \ + "_" + nmember_format % mem.membernumber + \ + "_" + q for q in quantities] + for var_name in var_names: + var_src = src.variables[var_name][:] + dst.variables[var_name][:] = var_src + offset_value + src.close() + dst.close() + + # Write wrfinput + for domain in range(1, n_domains+1): + fp = os.path.join(self.settings["run_dir"], + "wrfinput_d%02d" % domain) + ofp = fp + self.settings["original_save_suffix"] + src = nc.Dataset(ofp, "r") + dst = nc.Dataset(fp, "r+") + for mem in members: + offset_value = mem.param_values[-1] + var_name = self.settings["tracer_optim"] + \ + "_" + nmember_format % mem.membernumber + var_src = src.variables[var_name][:] + dst.variables[var_name][:] = var_src + offset_value + src.close() + dst.close() + + logging.info("write_members_to_file done.") + + + ########## Private methods + + def _prepare_run(self, samples): + """Prepare a forward model WRF-Chem run. + + - Update the WRF's configuration file (namelist.input) + - Define file name where sampled simulated observations + are stored. + """ + + # Update parameters for new run (i.e., start and stop time) + self._update_namelist_file() + + # Define file name of sampled simulated observations + # Store pattern for finding the right file in write_simulated*, + # because they don't know the integer indices + self._sim_fpattern = "wrf_sampled_%%s_%s.nc" % self.dacycle["time.sample.stamp"] + + # Also construct full filename because pipeline relies on + # integer indices + self.simulated_file = [None] * len(samples) + for i, sample in enumerate(samples): + self.simulated_file[i] = os.path.join( + self.settings["run_dir"], + self._sim_fpattern % sample.get_samples_type()) + + # Prepare the track input file - NOT USED + #self._prepare_wrfinput_track() + + + + def _update_namelist_file(self): + """Update namelist.input file based on dacycle object + + Updated values: + - start time, end time + - restart timestep + - restart flag + + Note: format of namelist.input is slightly different after + running this: + - 19, 19, -> 19, 19 + - 00 -> 0 + - 2. -> 2.0 + - "wrfchemi_d<domain>_<date>" -> "wrfchemi_d<domain>_<date>" + This could be adjusted in f90nml settings, but it doesn't + seem to matter to WRF. + """ + + # Set start and end time in namelist + self._set_namelist_time("start_", self.dacycle["time.sample.start"]) + self._set_namelist_time("end_", self.dacycle["time.sample.end"]) + + # Set restart interval + # Compute it (as integer) + intrvl_timedelta = (self.dacycle["time.sample.end"] + - self.dacycle["time.sample.start"]) + intrvl_mins = intrvl_timedelta.total_seconds()/60. + if np.abs(intrvl_mins-np.floor(intrvl_mins)) > np.finfo(float).eps: + msg = "Cannot safely cast restart_interval in WRF " + \ + "namelist to integer. Without this casting, " + \ + "restart-files might not be created. Stopping." 
+ raise ValueError(msg) + # Write to namelist + self.namelist["time_control"]["restart_interval"] = int(intrvl_mins) + + # Set restart flag + # If this is a restart from a previous cycle, or a previous lag, + # the WRF model should start from restart files + # (time.sample.window is also used by TM5 observation operator to set + # the restart flag, so I guess this is the right value to use...) + if self.dacycle["time.restart"] or self.dacycle["time.sample.window"] != 0: + logging.info("Configuring WRF-Chem to perform restart.") + self.namelist["time_control"]["restart"] = True + else: + # Note: In case you ever want to start the first cycle with + # restart files, change this branch to "pass" + + logging.info("Configuring WRF-Chem to perform cold start.") + + self.namelist["time_control"]["restart"] = False + + # Write namelist changes to file + fp = os.path.join(self.settings["run_dir"], "namelist.input") + fp_original = fp + self.settings["original_save_suffix"] + f90nml.patch(fp_original, self.namelist, fp) + + def _set_namelist_time(self, prefix, date): + """Insert datetime object into namelist dictionary.""" + + if not isinstance(date, dtm.datetime): + logging.error("set_namelist_time: argument date must be a " + \ + "datetime.datetime- object") + + # Incidentally, names of datetime components WRF namelists are + # identical to those in datetime objects. Therefore, we can set + # them by iterating over their names. + time_components = ["year", "month", "day", "hour", "minute", "second"] + for time_component in time_components: + self.namelist["time_control"][prefix + time_component] = \ + self._repeat_per_domain(getattr(date, time_component)) + + + def _repeat_per_domain(self, value): + """Output: list of 'value' repeated max_dom times.""" + + n_domains = self.namelist["domains"]["max_dom"] + return [value] * n_domains + + + # def _prepare_wrfinput_track(self): + # """Prepares wrfinput_track.txt for this run + # + # Original version of track_driver can't deal + # with times outside of simulation period. So + # I'm cutting the original file down here. + # STATUS: NOT USED + # """ + # + # run_dir = self.settings["run_dir"] + # fp = os.path.join(run_dir, "wrfinput_track.txt") + # fp_original = fp + self.settings["original_save_suffix"] + # + # if not os.path.exists(fp_original): + # return + # + # # Read original track + # # Read all as str: have to parse times, and will keep format of + # # locations + # dat = np.loadtxt(fp_original, dtype=str) + # + # # Select lines that are in current lag + # times = [dtm.datetime.strptime(t_chr, "%Y-%m-%d_%H:%M:%S") for t_chr in dat[:, 0]] + # + # time_lag_start = self.dacycle["time.sample.start"] + # time_lag_end = self.dacycle["time.sample.end"] + # + # # Write track file for this run + # # Have to exclude 1st and include last timestep here because that's the + # # way track_driver needs it! + # # This is unlike the other cases (fluxes and wrfout), where + # # I include the start and exclude the end of the lag + # f = open(fp, "w") + # for n in range(dat.shape[0]): + # if times[n] > time_lag_start and times[n] <= time_lag_end: + # f.write(" ".join(dat[n, :]) + "\n") + # f.close() + + + def _load_rc(self, name): + """Read settings from the observation operator's rc-file + + Based on TM5ObservationOperator.load_rc + """ + + self.rcfile = rc.RcFile(name) + self.settings = self.rcfile.values + self.rc_filename = name + + logging.debug("rc-file %s loaded", name) + + def _validate_rc(self): + """Check that some required values are given in the rc-file. 
+ + Based on TM5ObservationOperator.validate_rc + """ + + needed_rc_items = ["source_dir", "run_dir"] + + for key in needed_rc_items: + if key not in self.settings: + msg = "Missing a required value in rc-file : %s" % key + logging.error(msg) + raise IOError(msg) + logging.debug("rc-file has been validated succesfully") + + def _create_fresh_run_dir(self, overwrite): + """ Set up the directory where WRF will run. + + Steps: + - Create target_directory + - From self.settings["source_dir"] to target_directory, + - copy files that need to be modified during runtime + - link files that need not be modified + - Useful to set overwrite=False for resubmitting a + failed run. Note: namelist.input is _always_ copied + regardless of the value of overwrite. + """ + + # Create target_dir + target_dir = self.settings["run_dir"] + create_dirs(target_dir) + + # By default, files are linked from source directory to + # target_dir. Define here which files need to be copied + # instead of linked, i.e., those that need to be modified + # During runtime. + # They are described by patterns that are compared to file + # names via re.match(pattern, filename) + copy_pattern_list = ["^namelist\.input$", + "^wrfchemi_.*", + # "^wrfinput_track\.txt$" # NOT USED + ] + + if self.dacycle.dasystem["do_offset"]: + copy_pattern_list.append("wrfinput_d.*") + copy_pattern_list.append("wrfbdy_d.*") + + # Since I don't know concise code to check for more than one + # pattern without a loop, create one single pattern for all. + copy_full_pattern = "(" + "|".join(copy_pattern_list) + ")" + + # Skip some files + skip_pattern_list = ["^wrfout.*", + "^wrfrst.*", + "namelist\.output", + ".*~$", + "log", + "^rsl.error.*", + "^rsl.out.*", + "sampled_.*", + "met_em.*" + ] + skip_full_pattern = "(" + "|".join(skip_pattern_list) + ")" + + # Get list of all files in source directory + source_dir = self.settings["source_dir"] + source_files = os.listdir(source_dir) + + # Copy/link files to target_dir + logging.info("Populating WRF run directory. This may take a while.") + for filename in source_files: + # Do nothing for files to skip + if re.match(skip_full_pattern, filename): + continue + + # Copy/link files + fp_source = os.path.join(source_dir, filename) + fp_destination = os.path.join(target_dir, filename) + # Skip if the file is already there and overwrite is false + if (os.path.isfile(fp_destination)) and (not overwrite): + continue + if re.match(copy_full_pattern, filename): + # Copy files to copy (with overwrite) + shutil.copy2(fp_source, target_dir) + else: + # Link files to link (with overwrite) + if os.path.exists(fp_destination): + os.remove(fp_destination) + os.symlink(fp_source, fp_destination) + + # create a log dir + create_dirs(os.path.join(target_dir, "log")) + + # Save original namelist.input - ignore overwrite. + # Note: this is useful in case you restart CTDAS with an updated namelist. 
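
For reference, the copy/link/skip decision made in the loop above comes down to two combined regular expressions. A stand-alone sketch of the classification (patterns taken from the lists above, file names invented):

    import re

    copy_pattern = "(" + "|".join(["^namelist\\.input$", "^wrfchemi_.*"]) + ")"
    skip_pattern = "(" + "|".join(["^wrfout.*", "^wrfrst.*", ".*~$"]) + ")"

    def classify(filename):
        """Return how _create_fresh_run_dir would treat this file."""
        if re.match(skip_pattern, filename):
            return "skip"
        if re.match(copy_pattern, filename):
            return "copy"    # modified during runtime, so a real copy is needed
        return "link"        # everything else is only symlinked

    for name in ["namelist.input", "wrfchemi_d01_2016-01-01_00:00:00",
                 "wrfout_d01_2016-01-01_00:00:00", "LANDUSE.TBL"]:
        print(name, "->", classify(name))

Note that wrfinput_d* and wrfbdy_d* join the copy list only when do_offset is set.
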
+ fp_source = os.path.join(source_dir, "namelist.input") + shutil.copy2(fp_source, target_dir) + self._save_original_files([os.path.join(target_dir, "namelist.input")], True) + + # # Save original wrfinput_track.txt (if it exists) - NOT USED + # fp_source = os.path.join(source_dir, "wrfinput_track.txt") + # if os.path.exists(fp_source): + # shutil.copy2(fp_source, target_dir) + # self._save_original_files([os.path.join(target_dir, "wrfinput_track.txt")], True) + + + def _prepare_flux_files(self, overwrite=True): + """Prepare flux input files for WRF-Chem + + - Save original flux files + - Create fields for ensemble members if not yet present + + I want to get rid of saving original files, and the only + practical way to do the latter is in preprocessing because + here it's not parallelized. So either parallelize or remove. + Could replace with _validate_emissions. + """ + + # Flux files for this inversion period + all_files = self.wrfhelper.get_wrf_filenames("wrfchemi*00") + + ti = self.dacycle["time.start"] + days_plus = int(self.dacycle["time.nlag"])*int(self.dacycle["time.cycle"]) + te = self.dacycle["time.finish"] + dtm.timedelta(days_plus) + logging.warning("Processing until: te = " + te.isoformat() + ". Too far?") + pattern_time = "%Y-%m-%d_%H:%M:%S" + len_tstamp = len(pattern_time) + 2 + all_times = [dtm.datetime.strptime(f[-len_tstamp:], pattern_time) + for f in all_files] + wrfchemi_files = [all_files[nf] for nf in range(len(all_files)) + if ti <= all_times[nf] and all_times[nf] <= te] + + # Rename chemistry input files to wrfchemi*.original + self._save_original_files(wrfchemi_files, overwrite) + logging.info("Original flux files saved.") + + # Write placeholder fields for optimized fluxes into wrfchemi files + # -> This takes a lot of time for a large ensemble because it's done + # serially. The only practical way is to do this in preprocessing, + # so that should become the only allowed way. + logging.info("Creating flux fields for ensemble in flux files.") + prefix = self.settings["flux_optim"] + self._create_member_fields(wrfchemi_files, prefix, "", "zeros", False) + logging.info("Flux files prepared.") + + + def _save_original_files(self, files, overwrite=True): + """ Make a copy of files with a predefined suffix + + For development, add the option to skip (because it + takes a while). + files = list + """ + + suffix = self.settings["original_save_suffix"] + for file_ in files: + new_filename = file_ + suffix + if (os.path.isfile(new_filename)) and (not overwrite): + logging.debug("File %s exists, skipping", new_filename) + continue + shutil.copy2(file_, new_filename) + logging.debug("Copied %s to %s", file_, new_filename) + + + def _create_member_fields(self, files, prefix, suffix="", + mode="zeros", overwrite=True): + """ Create fields for each ensemble member for the original prefix+suffix. + + Since this takes a long time, I add the option to not overwrite in + case the fields are already there, i.e. created in preprocessing. + This is actually the only practical way at the moment. Either edit + this function out or parallelize. + + This assumes that original files were previously saved with + _self._save_original_files(). + + mode = "zeros": All zeros (will be used for emissions) + mode = "copy": Copy original (was for boundary- and initial + conditions, but that's not done anymore.) + files = list + Attributes are copied in any case. The member number is added to the + "description" attribute. 
+ + """ + + # For each domain, and each file, write a field of 0's for each + # ensemble member (<prefix>_000, ...) + # Netcdf operations from https://stackoverflow.com/questions/15141563/python-netcdf-making-a-copy-of-all-variables-and-attributes-but-one + + # Specify directory to operate in + run_dir = self.settings["run_dir"] + # suffix for original + orig_suf = self.settings["original_save_suffix"] + # Number of ensemble members + nmembers = int(self.dacycle["da.optimizer.nmembers"]) + # Format of ensemble members + nmember_format = "%03d" + # Name of emission fields to copy = prefix+suffix + var_orig = prefix+suffix + # Create arrays of 0s for complete emission field in all files + have_src = False + for f in files: + # File path + fp = os.path.join(run_dir, f) + # Get variable that you want to emulate from original file + ofp = fp + orig_suf + src = nc.Dataset(ofp, "r") + src_var = src.variables[var_orig] + if mode == "zeros": + if not have_src: + # Only create this once + field_to_write = np.zeros(src_var.shape, dtype=src_var.dtype) + have_src=True + elif mode == "copy": + field_to_write = src_var[:] # always need new source + else: + raise ValueError("Unknown mode: %s" % mode) + + # Open netcdf handle in new file + dst = nc.Dataset(fp, "r+") + + for n in range(nmembers): + nmember_formatted = nmember_format % n + # field name for this member + fn = "".join([prefix, "_", nmember_formatted, suffix]) + # Skip if it should not be overwritten + if (fn in list(dst.variables.keys())) and (not overwrite): + continue + # Create 0-field for this member + dst.createVariable(fn, src_var.dtype, src_var.dimensions) + # Write the data + dst.variables[fn][:] = field_to_write + # Copy variable attributes all at once via dictionary + dst.variables[fn].setncatts(src_var.__dict__) + # Add ensemble member number to description of variable + dst.variables[fn].setncattr( + "description", + dst.variables[fn].getncattr("description") + \ + " ensemble_member_" + nmember_formatted) + src.close() + dst.close() + + + def _validate_input(self): + """Validate that WRF has all the inputs it needs to run + + Mostly only checks existence of files, not their content. + """ + + # In all cases: Validate parameters and emissions. + self._validate_parameters() + #self._validate_emissions() # NOT USED + + # Validate different inputs needed for cold start and restart + if self.namelist["time_control"]["restart"]: + logging.info("Restart run: Validate for restart files,") + self._validate_restart_files() + else: + logging.info("Cold start: Validate initial and boundary " + + "conditions.") + self._validate_boundary_conditions() + self._validate_initial_conditions() + + + + def _validate_parameters(self): + """Checks namelist.input and number of allocated tasks + + namelist.input (WRF's configuration file) can contain + instructions on how many tasks to use. This function + checks whether namelist.input itself exists, and the + number of allocated tasks is compatible with values + in the namelist. + """ + + # Check if WRF configuration file exists (kinda moot, it's + # been read in before... + self._confirm_file_exists("namelist.input") + + # Confirm that the allocated number of tasks are enough + # in case numbers of tasks were explicitly given in the + # namelist. 
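
The check done in _validate_ntasks compares da.resources.ntasks with the total number of tasks requested by the namelist, i.e. the sum of compute tasks and I/O quilting tasks computed in _nprocs_namelist below. The arithmetic, with illustrative namelist values:

    # e.g. nproc_x = 4, nproc_y = 6, nio_groups = 1, nio_tasks_per_group = 4
    nproc_x, nproc_y = 4, 6
    nio_groups, nio_tasks_per_group = 1, 4
    ntasks_namelist = nproc_x * nproc_y + nio_groups * nio_tasks_per_group
    print(ntasks_namelist)   # 28 -> da.resources.ntasks must be at least this
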
+ self._validate_ntasks() + + + def _nprocs_namelist(self): + """Compute how many tasks namelist.input requests""" + + + + # Check variables in namelist that specify the number of tasks + got_x = "nproc_x" in list(self.namelist["domains"].keys()) + got_y = "nproc_y" in list(self.namelist["domains"].keys()) + + got_quilt = "namelist_quilt" in list(self.namelist.keys()) + if got_quilt: + got_iog = "nio_groups" in list(self.namelist["namelist_quilt"].keys()) + got_iot = "nio_tasks_per_group" in list(self.namelist["namelist_quilt"].keys()) + else: + got_iog = False + got_iot = False + + # Compute requested number of compute processes + if got_x or got_y: + if got_x != got_y: + logging.error("nproc_x xor nproc_y specified in WRF " + \ + "namelist, don't know how to deal with that.") + + nproc_x = int(self.namelist["domains"]["nproc_x"]) + nproc_y = int(self.namelist["domains"]["nproc_y"]) + ntasks_compute = nproc_x*nproc_y + else: + ntasks_compute = 0 + + # Compute requested number of IO processes + if got_iog or got_iot: + if got_iog != got_iot: + logging.error("nio_groups xor nio_tasks_per_group " + \ + "specified in WRF namelist, don't " + \ + "know how to deal with that.") + nio_groups = int(self.namelist["namelist_quilt"]["nio_groups"]) + nio_tasks_per_group = int(self.namelist["namelist_quilt"]["nio_tasks_per_group"]) + ntasks_io = nio_groups*nio_tasks_per_group + else: + ntasks_io = 0 + + # The number of requested process is the sum of requested + # compute and io tasks + nprocs = ntasks_compute + ntasks_io + return nprocs + + def _validate_ntasks(self): + """ Confirm that the allocated number of tasks are enough + + In case numbers of tasks were explicitly given in the namelist. + """ + + # Number of tasks as specified in CycleControl (comes from + # rc-file passed to CTDAS) + ntasks_cc = int(self.dacycle["da.resources.ntasks"]) + + # Number of processes requested by namelist parameters + ntasks_namelist = self._nprocs_namelist() + + # Compare the two. + if ntasks_namelist > ntasks_cc: + logging.error("WRF namelist requests more processes " + \ + "than available in the job.") + else: + logging.info("Number of allocated tasks OK for WRF namelist.") + + + def _validate_emissions(self): + """Does nothing + + Should check if emissions files exist for this period. But + this would require diving into how emissions are saved (e.g. + per day, hour, ...), so I'm omitting it. If files are missing, + the run will crash later. + """ + + pass + + def _validate_restart_files(self): + """Does nothing + + Should check whether the restart files for this run exist, + but filenames depend on io_form_restart. 
+ """ + + #ndomains = self.namelist["domains"]["max_dom"] + #time_format = self.dacycle["time.sample.start"].strftime("%Y-%m-%d_%H:%M:%S") + #for n_d in range(1, ndomains + 1): + # restart_file = "wrfrst_d{domain:02d}_".format(domain=n_d) + time_format + # self._confirm_file_exists(restart_file) + + pass + + + def _validate_boundary_conditions(self): + """Check if boundary condition file exists""" + + # Check if boundary conditions file exist + self._confirm_file_exists("wrfbdy_d01") + + def _validate_initial_conditions(self): + """Check if initial conditions files exist""" + + ndomains = self.namelist["domains"]["max_dom"] + for n_d in range(1, ndomains + 1): + input_file = "wrfinput_d{:02d}".format(n_d) + self._confirm_file_exists(input_file) + + def _confirm_file_exists(self, filename): + """Throw an error if <filename> is not present in run_dir""" + + filepath = os.path.join(self.settings["run_dir"], filename) + if not os.path.exists(filepath): + msg = "File %s not found." % filename + logging.error(msg) + raise OSError(msg) + + logging.debug("File %s found.", filename) + + + def _run(self): + """Run the WRF model by submitting wrf.exe to slurm. + + Sources on how to do this: + - https://stackoverflow.com/questions/32594734/slurm-multiprocessing-python-job + - https://pythonspot.com/python-subprocess/ + - submit_tm5_tools.py + + It would be cleaner to replace "run_command" and so on with + calls to the platform module. However, running TM5 is handled + by submit_tm5 and submit_tm5_tools.py instead. On the machine + where the model was developed, only full nodes can be reserved + anyway. So I copy this behavior. + """ + + # Option to run only if you haven't yet for this period + # Useful for debegging everything that comes after running + # the transport model. + + if "no_rerun_wrf" in list(self.settings.keys()): + if self.settings["no_rerun_wrf"].lower() in ["true", "t", "1"]: + if self._has_wrfout(): + logging.info("no_rerun_wrf = True and transport " + \ + "model output already present for " + \ + "run period. Therefore, not rerunning " + \ + "model and using existing output.") + return + elif self.settings["no_rerun_wrf"].lower() in ["false", "f", "0"]: + pass + else: + msg = "Don't understand value " + \ + "'%s' " % self.settings["no_rerun_wrf"] + \ + "for rc-option 'no_rerun_wrf'." + logging.error(msg) + raise ValueError(msg) + + run_dir = self.settings["run_dir"] + + # Change to wrf run directory + cwd = os.getcwd() + os.chdir(run_dir) + + # Build command + # Basic mpirun command given in rc-file + command_ = self.settings["run_command"] + # Get number of tasks + ntasks = int(self.dacycle["da.resources.ntasks"]) + if self.settings["run_ntask_opt"]: + command_ += " " + self.settings["run_ntask_opt"] + str(ntasks) + # Request a runtime. Not that important, actually, it's just a job step. + if self.settings["run_t_opt"]: + command_ += " " + self.settings["run_t_opt"] + self.settings["run_t"] + # Add executable and redirection of output. + command = command_ + " ./wrf.exe >& log/wrf.log" + + # Run WRF model + logging.info("Starting WRF model with command: %s" %command) + p = subprocess.Popen(command.split(), + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + # Wait for process to finish (may not be needed) + p.wait() + logging.info("WRF model run complete.") + + # Check output + retcode = utilities.check_out_err(p) + if retcode != 0: + raise OSError("WRF returned errors. 
Check logs in %s" % run_dir) + + # Switch back to previous working directory + os.chdir(cwd) + + def _has_wrfout(self): + """Check if there are wrfout files for start and end of this lag + + Purpose: Option to not rerun the transport model but go straight + to sampling wrfout files, used during development of the sampler. + """ + + #pylint: disable=unreachable + + run_dir = self.settings["run_dir"] + + t1 = self.dacycle["time.sample.start"] + t2 = self.dacycle["time.sample.end"] + + fpattern = "wrfout_d01_%Y-%m-%d_%H:%M:%S" + fnames = [t.strftime(fpattern) for t in [t1, t2]] + fpaths = [os.path.join(run_dir, fn) for fn in fnames] + files_exist = list(map(os.path.exists, fpaths)) + + return all(files_exist) + + + def _sample_simulated_obs(self, samples): + """Sample simulated observations + + Calls functions that sample 4D modeled mixing ratio fields + at the times and locations of observations. + """ + + for sample in samples: + sample_type = sample.get_samples_type() + if sample_type == "column": + logging.info("Starting _launch_wrfout_column_sampling") + self._launch_wrfout_column_sampling(sample) + logging.info("Finished _launch_wrfout_column_sampling") + elif sample_type == "point": + self._launch_wrfout_point_sampling(sample) + else: + logging.error("Unknown sample type: %s", + sample.get_samples_type()) + + + def _launch_wrfout_point_sampling(self, samples): + """Raises an error that says it hasn't been implemented. + + Should sample WRF output at coordinates of point observations. + Not implemented yet. + """ + + msg = "Point sampling not implemented." + logging.error(msg) + raise NotImplementedError(msg) + + + def _launch_wrfout_column_sampling(self, sample): + """Sample WRF output at coordinates of column observations.""" + + run_dir = self.settings["run_dir"] + + sampling_coords_file = os.path.join( + self.dacycle["dir.input"], + sample.get_samples_type() + \ + "_coordinates_%s.nc" % self.dacycle["time.sample.stamp"]) + + # Reconstruct self.simulated_file[i] + out_file = self._sim_fpattern % sample.get_samples_type() + + # Remove intermediate files from a previous sampling job (might + # still be there if that one fails) + # The file pattern is hardcoded in wrfout_sampler + slicefile_pattern = out_file + ".*.slice" + for f in glob.glob(os.path.join(run_dir, slicefile_pattern)): + os.remove(f) + + # Spawn multiple wrfchem_sampler instances, + # using at most all processes available + nprocs1 = int(self.dacycle["da.resources.ntasks"]) + + # Might not want to use that many processes if there are few + # observations, because of overhead. Set a minimum number of + # observations per process, and reduce the number of + # processes to hit that. + Nobs = len(sample.datalist) + if Nobs == 0: + logging.info("No observations, skipping sampling") + return + + # Might want to increase this, no idea if this is reasonable + nobs_min = 100 + nprocs2 = max(1, int(float(Nobs)/float(nobs_min))) + + # Number of processes to use: + nprocs = min(nprocs1, nprocs2) + + # Make run command + # For a task with 1 processor, specifically request -N1 because + # otherwise slurm apparently sometimes tries to allocate one task to + # more than one node. Or something like that. 
See here: + # https://stackoverflow.com/questions/24056961/running-slurm-script-with-multiple-nodes-launch-job-steps-with-1-task + command_ = self.settings["run_command_1task"] + + # Specifically request memory per process (otherwise the first + # one would clog the node and you end up with serial execution) + if self.settings["run_mem_opt"]: + max_nprocs_per_node = min(int(self.settings["cpus_per_node"]), nprocs) + mb_per_process = np.floor(int(self.settings["mem_per_node"])*1024. / float(1+max_nprocs_per_node)) # The +1 is to leave a bit spare memory + mem_str = str(int(mb_per_process)) + self.settings["mem_unit_str_MB"] + command_ += " " + self.settings["run_mem_opt"] + mem_str + + # Request a runtime. Not that important, actually, it's just a + # job step. + if self.settings["run_t_opt"]: + command_ = command_ \ + + " " + self.settings["run_t_opt"] \ + + " " + self.settings["run_t"] + + # Check if output slice files are already present + # This shouldn't happen, because they are deleted + # a few lines above. But if for some reason (crash) + # they are still here, this might lead to funny behavios. + if nprocs > 1: + output_files = glob.glob(slicefile_pattern) + if len(output_files) > 0: + msg = "Files that match the pattern of the " + \ + "sampler output are already present. Stopping." + logging.error(msg) + raise OSError(msg) + + # Submit processes + procs = list() + for nproc in range(nprocs): + cmd = " ".join([ + command_, + "python ./da/tools/wrfchem/wrfout_sampler.py", + "--nproc %d" % nproc, + "--nprocs %d" % nprocs, + "--sampling_coords_file %s" % sampling_coords_file, + "--run_dir %s" % run_dir, + "--original_save_suffix %s" % self.settings["original_save_suffix"], + "--nmembers %d" % int(self.dacycle["da.optimizer.nmembers"]), + "--tracer_optim %s" % self.settings["tracer_optim"], + "--outfile_prefix %s" % out_file, + "--footprint_samples_dim %d" % int(self.settings['obs.column.footprint_samples_dim']) + ]) + + procs.append(subprocess.Popen(cmd.split(), + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT)) + + logging.info("Started %d sampling process(es).", nprocs) + logging.debug("Command of last process: %s", cmd) + + # Wait for all processes to finish + for n in range(nprocs): + procs[n].wait() + + # Check for errors + retcodes = [] + for n in range(nprocs): + logging.debug("Checking errors in process %d", n) + retcodes.append(utilities.check_out_err(procs[n])) + + if any([r != 0 for r in retcodes]): + raise RuntimeError("At least one sampling process " + \ + "finished with errors.") + + logging.info("All sampling processes finished.") + + # Join output files + logging.info("Joining output files.") + if nprocs > 1: + utilities.cat_ncfiles(run_dir, slicefile_pattern, + "sounding_id", out_file, + in_pattern=True) + + logging.info("WRF column output sampled.") + logging.info("If samples object carried observations, output " + \ + "file written to %s", self.simulated_file) + + + def _save_data(self): + """Saves data from this lag + + This doesn't *have* to do anything. WRF is always started from + the same directory and takes care of the restarts itself, so no + files have to be collected (which is what happens here for TM5). + However, if "save_wrf_step" is set to True in the rc-file, then + all outputs and the namelist are copied to a subfolder for + debugging. 
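+
+        Example (illustrative date only): with save_wrf_step : True in
+        the rc-file and a lag starting on 2016-07-01, the wrfout files
+        inside the lag window, the restart file valid at its end, all
+        rsl.* files and namelist.input are copied to
+        <run_dir>/wrf_lag20160701, following the timestamp logic below.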
+ """ + + # For compatibility: + self.restart_filelist = [] + self.output_filelist = [] + + # + + # Relevant wrf files that change each time wrf is run + if "save_wrf_step" in list(self.settings.keys()): + if self.settings["save_wrf_step"].lower() in ["true", "t", "1"]: + # Keep some files + time_lag_start = self.dacycle["time.sample.start"] + timestamp = time_lag_start.strftime("_lag%Y%m%d") + + path = os.path.join(self.settings["run_dir"], "wrf"+timestamp) + logging.info("Saving wrf files to " + path) + + if not os.path.exists(path): + os.makedirs(path) + + # NOT USED + # wrfout_track + # pattern = os.path.join(self.settings["run_dir"], + # "wrfout_track_*") + # files = glob.glob(pattern) + # for f in files: + # self.wrfhelper.save_file_with_timestamp(f, path, timestamp) + + # Prepare timesteps + ti = self.dacycle["time.sample.start"] + te = self.dacycle["time.sample.end"] + pattern_time = "%Y-%m-%d_%H:%M:%S" + len_tstamp = len(pattern_time) + 2 + + # List wrfout files + file_pattern = "wrfout*" + all_files = self.wrfhelper.get_wrf_filenames(file_pattern) + all_times = [dtm.datetime.strptime(f[-len_tstamp:], pattern_time) + for f in all_files] + + # Save wrfout files that belong to this lag + for nf in range(len(all_files)): + if ti <= all_times[nf] and all_times[nf] <= te: + self.wrfhelper.save_file_with_timestamp(all_files[nf], path, timestamp) + + # Restart files + file_pattern = "wrfrst*" + all_files = self.wrfhelper.get_wrf_filenames(file_pattern) + all_times = [dtm.datetime.strptime(f[-len_tstamp:], pattern_time) + for f in all_files] + for nf in range(len(all_files)): + if all_times[nf] == te: + self.wrfhelper.save_file_with_timestamp(all_files[nf], path, timestamp) + + # rsl files + file_pattern = "rsl.*" + all_files = self.wrfhelper.get_wrf_filenames(file_pattern) + for nf in range(len(all_files)): + self.wrfhelper.save_file_with_timestamp(all_files[nf], path, timestamp) + + # Namelist + in_path = os.path.join(self.settings["run_dir"], "namelist.input") + self.wrfhelper.save_file_with_timestamp(in_path, path, timestamp) + + + # def _save_track_output(self): + # """ + # Save output from track_driver by adding a timestamp. + # STATUS: NOT USED + # """ + # run_dir = self.settings["run_dir"] + # ndomains = self.namelist["domains"]["max_dom"] + # time_lag_start = self.dacycle["time.sample.start"] + # timestamp = time_lag_start.strftime("%Y-%m-%d_%H:%M:%S") + # # for testing purposes, also add system time + # nowstamp = dtm.datetime.now().strftime("%Y-%m-%d_%H:%M:%S") + # timestamp += "_"+nowstamp + # + # for domain in range(1, ndomains+1): + # fn = "wrfout_track_d%02d" % domain + # fp = os.path.join(run_dir, fn) + # if os.path.exists(fp): + # shutil.copy2(fp, fp + "_" + timestamp) + + def _sample_domain_map(self, full_map, domain): + """Sample domain map + + Takes as input a <full_map> with values for domain d01 in + resolution of the finest domain (= format of regions file, + output of vector2grid) and returns values for extent and + resolution of <domain>. 
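+
+        Worked example (assumed namelist values, for illustration only):
+        with two domains, e_we = 51, 31, e_sn = 51, 31,
+        parent_grid_ratio = 1, 3 and i_parent_start = j_parent_start =
+        1, 10, the regions file covers d01 at the resolution of d02, so
+        full_map has shape (150, 150). Domain 1 is then sampled with
+        stride 3 starting at index 0, domain 2 with stride 1 starting
+        at index (10-1)*3 = 27, giving 50x50 and 30x30 cells
+        respectively.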
+ """ + + # Start indices (0-based) + start1 = self._start_in_1(full_map, domain) + start = [s1-1 for s1 in start1] + if any([s<0 for s in start]): + raise ValueError("Something is wrong with start indices") + + # Step sizes + strides = self._strides(full_map, domain) + + # Stop indices: add domain size to start + stop = start[:] + dom = self.namelist["domains"] + # -1 here to have #cells not #edges + stop[0] = start[0] + (dom["e_we"][domain-1]-1)*strides[0] + stop[1] = start[1] + (dom["e_sn"][domain-1]-1)*strides[1] + + # Now sample + # Sampling indices as sequence + xseq = list(range(start[0], stop[0], strides[0])) + yseq = list(range(start[1], stop[1], strides[1])) + # Sampling indices as grid + xx, yy = np.meshgrid(xseq, yseq) + + # Sample: ATTENTION: axis 0 refers to the latitude! + sampled_map = full_map[yy, xx] + + return sampled_map + + def _start_in_1(self, full_map, domain): + """Start index of domain in full_map, 1-based""" + domains = self._parent_domains(domain) + domains.append(domain) + istart = 1 # i refers to x (as e_we) + jstart = 1 # j refers to y (as e_sn) + for ndom in range(len(domains)-1): + this_domain = domains[ndom+1] + this_parent = domains[ndom] + istart += (self.namelist["domains"]["i_parent_start"][this_domain-1]-1) \ + *self._strides(full_map, this_parent)[0] + jstart += (self.namelist["domains"]["j_parent_start"][this_domain-1]-1) \ + *self._strides(full_map, this_parent)[1] + return [istart, jstart] + + def _strides(self, full_map, domain): + """Step size in <full_map> for <domain> + + I.e. the product of the parent grid ratios between + finest domain and <domain> + """ + dom = self.namelist["domains"] + # On which resolution is full_map? stride of domain 1: + # (Attention: index 1 refers to e_we, index 0 refers to e_sn) + stride1_x = full_map.shape[1] / (dom["e_we"][0]-1) + stride1_y = full_map.shape[0] / (dom["e_sn"][0]-1) + # stride = product of parent grid ratios + if domain == 1: + ratio_dom = 1 + else: + domains = self._parent_domains(domain) + domains.append(domain) + ratio_dom = np.array(dom["parent_grid_ratio"])[np.array(domains)-1].prod() + # round() converts to int more safely. Still assumes the + # namelist and regions file are compatible, if not, this + # might return wrong results. + stride_x = int(round(stride1_x/ratio_dom)) + stride_y = int(round(stride1_y/ratio_dom)) + + return (stride_x, stride_y) + + def _parent_domains(self, domain): + """Figure out the domain parents of this domain.""" + + if domain == 1: + # No parents + return [] + else: + # Add parent domain to list + # "-1" because domain numbers start at 1 + parent = self.namelist["domains"]["parent_id"][domain-1] + # Add parents of parent to list + parents_of_parent = self._parent_domains(parent) + parents_of_parent.append(parent) + # Done + return parents_of_parent + + + # Obsolete: + #def _get_ntasks_max(self): + # """Get the number of tasks that can be spawned from this process.""" + # os_environ_ntask_var = self.settings["os_environ_ntask_var"] + # if os_environ_ntask_var in os.environ.keys(): + # # I believe 1 process is taken up by this one, so -1 + # ntasks = int(os.environ[os_environ_ntask_var]) - 1 + # else: + # ntasks = int(self.settings["ntasks_max_default"]) + # msg = "%s not detected in " % os_environ_ntask_var + \ + # "environment variables. 
Setting ntasks=%d" % ntasks + # logging.warning(msg) + # + # return ntasks + + +if __name__ == "__main__": + pass diff --git a/da/pipelines/pipeline_wrfchem.py b/da/pipelines/pipeline_wrfchem.py new file mode 100755 index 00000000..2cb6a2d5 --- /dev/null +++ b/da/pipelines/pipeline_wrfchem.py @@ -0,0 +1,469 @@ +"""CarbonTracker Data Assimilation Shell (CTDAS) Copyright (C) 2017 Wouter Peters. +Users are recommended to contact the developers (wouter.peters@wur.nl) to receive +updates of the code. See also: http://www.carbontracker.eu. + +This program is free software: you can redistribute it and/or modify it under the +terms of the GNU General Public License as published by the Free Software Foundation, +version 3. This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with this +program. If not, see <http://www.gnu.org/licenses/>.""" +#!/usr/bin/env python +# pipeline.py + +""" +.. module:: pipeline +.. moduleauthor:: Wouter Peters + +Revision History: + +2021-07-10 freum Version for wrfchem +This is the wrfchem version, which was started in 2019 and does not yet include +all changes that were made to the original pipeline since then. They can probably +be integrated easily, so that wrfchem can use the standard pipeline. +Differences to original: +- Everything related to "inversion.savestate.dir" is not included here +- Instead of "if sample.get_samples_type() == 'flask': ", samples includes an + empty function write_sample_auxiliary + +File created on 06 Sep 2010. + +The pipeline module holds methods that execute consecutive tasks with each of the objects of the DA system. 
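+
+Typical use (a minimal sketch; how the objects are constructed is left
+to the start script and rc-files and is an assumption here, not part of
+this module):
+
+    from da.pipelines.pipeline_wrfchem import ensemble_smoother_pipeline
+    ensemble_smoother_pipeline(dacycle, platform, dasystem, samples,
+                               statevector, obsoperator, optimizer)
+
+which runs start_job, prepare_state, sample_state, invert, advance and
+save_and_submit for one cycle.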
+ +""" +import logging +import os +import sys +import datetime +import copy + +header = '\n\n *************************************** ' +footer = ' *************************************** \n ' + + +def ensemble_smoother_pipeline(dacycle, platform, dasystem, samples, statevector, obsoperator, optimizer): + """ The main point of entry for the pipeline """ + sys.path.append(os.getcwd()) + + samples = samples if isinstance(samples,list) else [samples] + + logging.info(header + "Initializing current cycle" + footer) + start_job(dacycle, dasystem, platform, statevector, samples, obsoperator) + + prepare_state(dacycle, statevector) + + sample_state(dacycle, samples, statevector, obsoperator) + + invert(dacycle, statevector, optimizer) + + advance(dacycle, samples, statevector, obsoperator) + + save_and_submit(dacycle, statevector) + logging.info("Cycle finished...exiting pipeline") + + + +def forward_pipeline(dacycle, platform, dasystem, samples, statevector, obsoperator): + """ The main point of entry for the pipeline """ + sys.path.append(os.getcwd()) + + samples = samples if isinstance(samples,list) else [samples] + + logging.info(header + "Initializing current cycle" + footer) + start_job(dacycle, dasystem, platform, statevector, samples, obsoperator) + + if 'forward.savestate.exceptsam' in dacycle: + sam = (dacycle['forward.savestate.exceptsam'].upper() in ["TRUE","T","YES","Y"]) + else: + sam = False + + if 'forward.savestate.dir' in dacycle: + fwddir = dacycle['forward.savestate.dir'] + else: + logging.debug("No forward.savestate.dir key found in rc-file, proceeding with self-constructed prior parameters") + fwddir = False + + if 'forward.savestate.legacy' in dacycle: + legacy = (dacycle['forward.savestate.legacy'].upper() in ["TRUE","T","YES","Y"]) + else: + legacy = False + logging.debug("No forward.savestate.legacy key found in rc-file") + + if not fwddir: + # Simply make a prior statevector using the normal method + prepare_state(dacycle, statevector)#LU tutaj zamiast tego raczej to stworzenie nowej kowariancji i ensembli bo pozostale rzeczy sa na gorze i na doel. + else: + # Read prior information from another simulation into the statevector. + # This loads the results from another assimilation experiment into the current statevector + + if sam: + filename = os.path.join(fwddir, dacycle['time.start'].strftime('%Y%m%d'), 'savestate_%s.nc'%dacycle['time.start'].strftime('%Y%m%d')) + #filename = os.path.join(fwddir, dacycle['time.start'].strftime('%Y%m%d'), 'savestate.nc') + statevector.read_from_file_exceptsam(filename, 'prior') + elif not legacy: + filename = os.path.join(fwddir, dacycle['time.start'].strftime('%Y%m%d'), 'savestate_%s.nc'%dacycle['time.start'].strftime('%Y%m%d')) + statevector.read_from_file(filename, 'prior') + else: + filename = os.path.join(fwddir, dacycle['time.start'].strftime('%Y%m%d'), 'savestate.nc') + statevector.read_from_legacy_file(filename, 'prior') + + + # We write this "prior" statevector to the restart directory, so we can later also populate it with the posterior statevector + # Note that we could achieve the same by just copying the wanted forward savestate.nc file to the restart folder of our current + # experiment, but then it would already contain a posterior field as well which we will try to write in save_and_submit. + # This could cause problems. Moreover, this method allows us to read older formatted savestate.nc files (legacy) and write them into + # the current format through the "write_to_file" method. 
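+    # (For example, with time.start = 2016-07-01 the line below writes
+    # <dir.restart>/savestate_20160701.nc; the date is illustrative only.)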
+ + savefilename = os.path.join(dacycle['dir.restart'], 'savestate_%s.nc' % dacycle['time.start'].strftime('%Y%m%d')) + statevector.write_to_file(savefilename, 'prior') + + # Now read optimized fluxes which we will actually use to propagate through the system + + if not fwddir: + # if there is no forward dir specified, we simply run forward with unoptimized prior fluxes in the statevector + logging.info("Running forward with prior savestate from: %s"%savefilename) + + else: + # Read posterior information from another simulation into the statevector. + # This loads the results from another assimilation experiment into the current statevector + + if sam: + statevector.read_from_file_exceptsam(filename, 'opt') + elif not legacy: + statevector.read_from_file(filename, 'opt') + else: + statevector.read_from_legacy_file(filename, 'opt') + + logging.info("Running forward with optimized savestate from: %s"%filename) + + # Finally, we run forward with these parameters + advance(dacycle, samples, statevector, obsoperator) + + # In save_and_submit, the posterior statevector will be added to the savestate.nc file, and it is added to the copy list. + # This way, we have both the prior and posterior data from another run copied into this assimilation, for later analysis. + + save_and_submit(dacycle, statevector) + + logging.info("Cycle finished...exiting pipeline") +#################################################################################################### + +def analysis_pipeline(dacycle, platform, dasystem, samples, statevector): + """ Main entry point for analysis of ctdas results """ + + from da.analysis.cteco2.expand_fluxes import save_weekly_avg_1x1_data, save_weekly_avg_state_data, save_weekly_avg_tc_data, save_weekly_avg_ext_tc_data, save_weekly_avg_agg_data + from da.analysis.cteco2.expand_molefractions import write_mole_fractions + from da.analysis.cteco2.summarize_obs import summarize_obs + from da.analysis.cteco2.time_avg_fluxes import time_avg + + logging.info(header + "Starting analysis" + footer) + + dasystem.validate() + dacycle.dasystem = dasystem + dacycle.daplatform = platform + dacycle.setup() + statevector.setup(dacycle) + + logging.info(header + "Starting mole fractions" + footer) + + write_mole_fractions(dacycle) + summarize_obs(dacycle['dir.analysis']) + + logging.info(header + "Starting weekly averages" + footer) + + save_weekly_avg_1x1_data(dacycle, statevector) + save_weekly_avg_state_data(dacycle, statevector) + save_weekly_avg_tc_data(dacycle, statevector) + save_weekly_avg_ext_tc_data(dacycle) + save_weekly_avg_agg_data(dacycle,region_aggregate='transcom') + save_weekly_avg_agg_data(dacycle,region_aggregate='transcom_extended') + save_weekly_avg_agg_data(dacycle,region_aggregate='olson') + save_weekly_avg_agg_data(dacycle,region_aggregate='olson_extended') + save_weekly_avg_agg_data(dacycle,region_aggregate='country') + + logging.info(header + "Starting monthly and yearly averages" + footer) + + time_avg(dacycle,'flux1x1') + time_avg(dacycle,'transcom') + time_avg(dacycle,'transcom_extended') + time_avg(dacycle,'olson') + time_avg(dacycle,'olson_extended') + time_avg(dacycle,'country') + + logging.info(header + "Finished analysis" + footer) + + + +def archive_pipeline(dacycle, platform, dasystem): + """ Main entry point for archiving of output from one disk/system to another """ + + if not 'task.rsync' in dacycle: + logging.info('rsync task not found, not starting automatic backup...') + return + else: + logging.info('rsync task found, starting automatic 
backup...') + + for task in dacycle['task.rsync'].split(): + sourcedirs = dacycle['task.rsync.%s.sourcedirs'%task] + destdir = dacycle['task.rsync.%s.destinationdir'%task] + + rsyncflags = dacycle['task.rsync.%s.flags'%task] + + # file ID and names + jobid = dacycle['time.end'].strftime('%Y%m%d') + targetdir = os.path.join(dacycle['dir.exec']) + jobfile = os.path.join(targetdir, 'jb.rsync.%s.%s.jb' % (task,jobid) ) + logfile = os.path.join(targetdir, 'jb.rsync.%s.%s.log' % (task,jobid) ) + # Template and commands for job + jobparams = {'jobname':"r.%s" % jobid, 'jobnodes': '1', 'jobtime': '1:00:00', 'joblog': logfile, 'errfile': logfile} + + if platform.ID == 'cartesius': + jobparams['jobqueue'] = 'staging' + + template = platform.get_job_template(jobparams) + for sourcedir in sourcedirs.split(): + execcommand = "\nrsync %s %s %s\n" % (rsyncflags, sourcedir,destdir,) + template += execcommand + + # write and submit + platform.write_job(jobfile, template, jobid) + jobid = platform.submit_job(jobfile, joblog=logfile) + + + +def start_job(dacycle, dasystem, platform, statevector, samples, obsoperator): + """ Set up the job specific directory structure and create an expanded rc-file """ + + dasystem.validate() + dacycle.dasystem = dasystem + dacycle.daplatform = platform + dacycle.setup() + #statevector.dacycle = dacycle # also embed object in statevector so it can access cycle information for I/O etc + #samples.dacycle = dacycle # also embed object in samples object so it can access cycle information for I/O etc + #obsoperator.dacycle = dacycle # also embed object in obsoperator object so it can access cycle information for I/O etc + obsoperator.setup(dacycle) # Setup Observation Operator + statevector.setup(dacycle) + + +def prepare_state(dacycle, statevector): + """ Set up the input data for the forward model: obs and parameters/fluxes""" + + # We now have an empty statevector object that we need to populate with data. If this is a continuation from a previous cycle, we can read + # the previous statevector values from a NetCDF file in the restart directory. If this is the first cycle, we need to populate the statevector + # with new values for each week. 
After we have constructed the statevector, it will be propagated by one cycle length so it is ready to be used + # in the current cycle + + logging.info(header + "starting prepare_state" + footer) + + if not dacycle['time.restart']: + + # Fill each week from n=1 to n=nlag with a new ensemble + for n in range(statevector.nlag): + date = dacycle['time.start'] + datetime.timedelta(days=(n + 0.5) * int(dacycle['time.cycle'])) + cov = statevector.get_covariance(date, dacycle) + statevector.make_new_ensemble(n, cov) + + else: + # Read the statevector data from file + #saved_sv = os.path.join(dacycle['dir.restart.current'], 'savestate.nc') + saved_sv = os.path.join(dacycle['dir.restart'], 'savestate_%s.nc' % dacycle['da.restart.tstamp'].strftime('%Y%m%d')) + statevector.read_from_file(saved_sv) # by default will read "opt"(imized) variables, and then propagate + + # Now propagate the ensemble by one cycle to prepare for the current cycle + statevector.propagate(dacycle) + + # Finally, also write the statevector to a file so that we can always access the a-priori information + current_sv = os.path.join(dacycle['dir.restart'], 'savestate_%s.nc' % dacycle['time.start'].strftime('%Y%m%d')) + statevector.write_to_file(current_sv, 'prior') # write prior info + + + +def sample_state(dacycle, samples, statevector, obsoperator): + """ Sample the filter state for the inversion """ + + # Before a forecast step, save all the data to a save/tmp directory so we can later recover it before the propagation step. + # This is especially important for: + # (i) The transport model restart data which holds the background mole fractions. This is needed to run the model one cycle forward + # (ii) The random numbers (or the seed for the random number generator) so we can recreate the ensembles if needed + + #status = dacycle.MoveSaveData(io_option='store',save_option='partial',filter=[]) + #msg = "All restart data have been copied to the save/tmp directory for future use" ; logging.debug(msg) + logging.info(header + "starting sample_state" + footer) + nlag = int(dacycle['time.nlag']) + logging.info("Sampling model will be run over %d cycles" % nlag) + + obsoperator.get_initial_data() + + for lag in range(nlag): + logging.info(header + ".....Ensemble Kalman Filter at lag %d" % (lag + 1)) + + ############# Perform the actual sampling loop ##################### + + sample_step(dacycle, samples, statevector, obsoperator, lag) + + logging.debug("statevector now carries %d samples" % statevector.nobs) + + + +def sample_step(dacycle, samples, statevector, obsoperator, lag, advance=False): + """ Perform all actions needed to sample one cycle """ + + # First set up the information for time start and time end of this sample + dacycle.set_sample_times(lag) + + startdate = dacycle['time.sample.start'] + enddate = dacycle['time.sample.end'] + dacycle['time.sample.window'] = lag + dacycle['time.sample.stamp'] = "%s_%s" % (startdate.strftime("%Y%m%d%H"), enddate.strftime("%Y%m%d%H")) + + logging.info("New simulation interval set : ") + logging.info(" start date : %s " % startdate.strftime('%F %H:%M')) + logging.info(" end date : %s " % enddate.strftime('%F %H:%M')) + logging.info(" file stamp: %s " % dacycle['time.sample.stamp']) + + + # Implement something that writes the ensemble member parameter info to file, or manipulates them further into the + # type of info needed in your transport model + + statevector.write_members_to_file(lag, dacycle['dir.input'], obsoperator=obsoperator) + + for sample in samples: + + 
sample.setup(dacycle) + + # Read observations + perform observation selection + sample.add_observations() + + # Add model-data mismatch to all samples, this *might* use output from the ensemble in the future?? + sample.add_model_data_mismatch(dacycle.dasystem['obs.sites.rc']) + + sampling_coords_file = os.path.join(dacycle['dir.input'], sample.get_samples_type()+'_coordinates_%s.nc' % dacycle['time.sample.stamp']) + logging.debug('sampling_coords_file = %s' % sampling_coords_file) + sample.write_sample_coords(sampling_coords_file) + + # Write filename to dacycle, and to output collection list + dacycle['ObsOperator.inputfile.'+sample.get_samples_type()] = sampling_coords_file + + del sample + + # Run the observation operator + obsoperator.run_forecast_model(samples) + + + # Read forecast model samples that were written to NetCDF files by each member. Add them to the exisiting + # Observation object for each sample loop. This data fill be written to file in the output folder for each sample cycle. + + # We retrieve all model samples from one output file written by the ObsOperator. If the ObsOperator creates + # one file per member, some logic needs to be included to merge all files!!! + + for i in range(len(samples)): + if os.path.exists(dacycle['ObsOperator.inputfile.'+samples[i].get_samples_type()]): + samples[i].add_simulations(obsoperator.simulated_file[i]) + logging.debug("Simulated values read, continuing pipeline") + else: + logging.warning("No simulations added, because sample input file does not exist") + + # Now add the observations that need to be assimilated to the statevector. + # Note that obs will only be added to the statevector if either this is the first step (restart=False), or lag==nlag + # This is to make sure that the first step of the system uses all observations available, while the subsequent + # steps only optimize against the data at the front (lag==nlag) of the filter. 
This way, each observation is used only + # (and at least) once # in the assimilation + + + if not advance: + logging.debug('time.restart = %s' %dacycle['time.restart']) + if dacycle['time.restart'] == False or lag == int(dacycle['time.nlag']) - 1: + statevector.obs_to_assimilate += (copy.deepcopy(samples),) + for sample in samples: + statevector.nobs += sample.getlength() + del sample + logging.info('nobs = %i' %statevector.nobs) + logging.debug("Added samples from the observation operator to the assimilated obs list in the statevector") + + else: + statevector.obs_to_assimilate += (None,) + + + +def invert(dacycle, statevector, optimizer): + """ Perform the inverse calculation """ + logging.info(header + "starting invert" + footer) + + if statevector.nobs <= 1: #== 0: + logging.info('List with observations to assimilate is empty, skipping invert step and continuing without statevector update...') + return + + dims = (int(dacycle['time.nlag']), + int(dacycle['da.optimizer.nmembers']), + int(dacycle.dasystem['nparameters']), + statevector.nobs) + + if 'opt.algorithm' not in dacycle.dasystem: + logging.info("There was no minimum least squares algorithm specified in the DA System rc file (key : opt.algorithm)") + logging.info("...using serial algorithm as default...") + optimizer.set_algorithm('Serial') + elif dacycle.dasystem['opt.algorithm'] == 'serial': + logging.info("Using the serial minimum least squares algorithm to solve ENKF equations") + optimizer.set_algorithm('Serial') + elif dacycle.dasystem['opt.algorithm'] == 'bulk': + logging.info("Using the bulk minimum least squares algorithm to solve ENKF equations") + optimizer.set_algorithm('Bulk') + + optimizer.setup(dims) + optimizer.state_to_matrix(statevector) + + diagnostics_file = os.path.join(dacycle['dir.output'], 'optimizer.%s.nc' % dacycle['time.start'].strftime('%Y%m%d')) + + optimizer.write_diagnostics(diagnostics_file, 'prior') + optimizer.set_localization(dacycle['da.system.localization']) + + if optimizer.algorithm == 'Serial': + optimizer.serial_minimum_least_squares() + else: + optimizer.bulk_minimum_least_squares() + + optimizer.matrix_to_state(statevector) + optimizer.write_diagnostics(diagnostics_file, 'optimized') + + + +def advance(dacycle, samples, statevector, obsoperator): + """ Advance the filter state to the next step """ + + # This is the advance of the modeled CO2 state. 
Optionally, routines can be added to advance the state vector (mean+covariance) + + # Then, restore model state from the start of the filter + logging.info(header + "starting advance" + footer) + logging.info("Sampling model will be run over 1 cycle") + + obsoperator.get_initial_data() + + sample_step(dacycle, samples, statevector, obsoperator, 0, True) + + dacycle.restart_filelist.extend(obsoperator.restart_filelist) + dacycle.output_filelist.extend(obsoperator.output_filelist) + logging.debug("Appended ObsOperator restart and output file lists to dacycle for collection ") + + for sample in samples: + dacycle.output_filelist.append(dacycle['ObsOperator.inputfile.'+sample.get_samples_type()]) + logging.debug("Appended Observation filename to dacycle for collection: %s"%(dacycle['ObsOperator.inputfile.'+sample.get_samples_type()])) + + sampling_coords_file = os.path.join(dacycle['dir.input'], sample.get_samples_type()+'_coordinates_%s.nc' % dacycle['time.sample.stamp']) + if os.path.exists(sampling_coords_file): + outfile = os.path.join(dacycle['dir.output'], sample.get_samples_type() + '_auxiliary_%s.nc' % dacycle['time.sample.stamp']) + sample.write_sample_auxiliary(outfile) + else: logging.warning("Sample auxiliary output not written, because input file does not exist (no samples found in obspack)") + del sample + + + +def save_and_submit(dacycle, statevector): + """ Save the model state and submit the next job """ + logging.info(header + "starting save_and_submit" + footer) + + filename = os.path.join(dacycle['dir.restart'], 'savestate_%s.nc' % dacycle['time.start'].strftime('%Y%m%d')) + statevector.write_to_file(filename, 'opt') + + dacycle.output_filelist.append(filename) + dacycle.finalize() diff --git a/da/rc/wrfchem/dasystem_wrfchem.rc b/da/rc/wrfchem/dasystem_wrfchem.rc new file mode 100644 index 00000000..fc85dcf7 --- /dev/null +++ b/da/rc/wrfchem/dasystem_wrfchem.rc @@ -0,0 +1,77 @@ +! CarbonTracker Data Assimilation Shell (CTDAS) Copyright (C) 2017 Wouter Peters. +! Users are recommended to contact the developers (wouter.peters@wur.nl) to receive +! updates of the code. See also: http://www.carbontracker.eu. +! +! This program is free software: you can redistribute it and/or modify it under the +! terms of the GNU General Public License as published by the Free Software Foundation, +! version 3. This program is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +! FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. +! +! You should have received a copy of the GNU General Public License along with this +! program. If not, see <http://www.gnu.org/licenses/>. + +!!! Info for the CarbonTracker data assimilation system + +!datadir : /projects/0/ctdas/input/ctdas_2012/ + + +! For ObsPack +! For WRF, this is only a dummy until sampling point observations is +! implemented +!obspack.input.id : obspack_co2_1_GLOBALVIEWplus_v2.1_2016-09-02 +!obspack.input.dir : ${datadir}/obspacks/${obspack.input.id} +!obs.sites.rc : ${obspack.input.dir}/summary/sites_weights_Dec2016.rc +obs.sites.rc : dummy + +! For column data +! sample.in.ctdas is obsolete, should be removed +sample.in.ctdas : False +! obs.column.rc here only contains "obs.rejection.threshold : <N>" +obs.column.rc : /path/to/xco2_weights.rc +obs.column.input.dir : /dir/with/column/observation/files +obs.column.ncfile : column_observation_file_pattern_<YYYYMMDD>.nc + +! 
Selection criteria based on variables stored in data file +obs.column.selection.variables : quality_flag +obs.column.selection.criteria : == 0 + +! Any value but "parametrization" defaults to model uncertainty of 1 +! (in the units of the observations, so e.g. 1 ppm for CO2) +mdm.calculation : default + +! Covariances +! Scaling factor factor sigma, no covariances +! A value of 1 means 100% prior flux uncertainty. +! For one flux process: +! sigma_scale : 1 +! For multiple flux processes (see n_emis_proc below): +sigma_scale_1 : 0.3 +sigma_scale_2 : 0.5 +sigma_scale_3 : 0.5 + +! Uncomment sigma_offset for optimizing a constant offset of WRF initial +! and boundary conditions. Unit as in observations, so e.g. ppm for CO2. +! This is only allowed in runs with only one cycle, because in +! subsequent cycles, the initial conditions couldn't be optimized in a +! consistent way. +! sigma_offset : 0.5 + +! Seed for numpy random number generator? +!random.seed : 4385 +!random.seed.init: ${datadir}/randomseedinit.pickle +random.seed.init: /path/to/randomseedinit.pickle + + +!!!!!!!!!!!!!!!!!! STATEVECTOR !!!!!!!!!!!!!!!!!! +! nparameters = number of flux parameters plus 1 in case an offset is +! optimized +! Example: +! For 25 state vector elements and 3 processes (e.g. GPP, respiration, +! anthropogenic emissions), set nparameters to 75. If, in addition, an +! initial/boundary condition offset is optimized by setting +! sigma_offset, set nparameters to 76. + +nparameters : 75 +n_emis_proc : 3 +regionsfile : /path/to/regionsfile.nc diff --git a/da/rc/wrfchem/obsoper_wrfchem.rc b/da/rc/wrfchem/obsoper_wrfchem.rc new file mode 100644 index 00000000..d0cb1cc9 --- /dev/null +++ b/da/rc/wrfchem/obsoper_wrfchem.rc @@ -0,0 +1,141 @@ +! CarbonTracker Data Assimilation Shell (CTDAS) Copyright (C) 2017 Wouter Peters. +! Users are recommended to contact the developers (wouter.peters@wur.nl) to receive +! updates of the code. See also: http://www.carbontracker.eu. +! +! This program is free software: you can redistribute it and/or modify it under the +! terms of the GNU General Public License as published by the Free Software Foundation, +! version 3. This program is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +! FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. +! +! You should have received a copy of the GNU General Public License along with this +! program. If not, see <http://www.gnu.org/licenses/>. + + +! source_dir +! Directory like WRF/run/ containing everything that is necessary to run WRF, +! having already run real.exe: +! - wrf.exe +! - namelist.input +! - wrfbdy_d01 (already containing values for each ensemble member) +! - wrfinput_dXX (already containing values for each ensemble member) +! - wrfchemi_* (containing at the prior fluxes in field flux_optim and +! flux_fix (if defined), and fields for ensemble members) + +source_dir : /dir/like/WRF/run/with/all/inputs/ + +! run_dir +! The directory where all WRF runs will be performed. +! Files from source_dir will be copied/linked here as needed. +run_dir : /dir/for/running/WRF/within/CTDAS/ + +! tracer and flux to optimize (as in ensemble tracer definitions in WRF, but +! without the _NNN suffix for the member number) +tracer_optim : CO2 +flux_optim : E_CO2 + +! Fixed flux to add to fluxes to optimize (comment if there are none) +! flux_fix : E_FIX + +! obs.column.footprint_samples_dim (do not change) +! 
For high resolution simulations, the satellite footprint can be bigger +! than one model pixel. If this option is 1, the model atmosphere is +! sampled at the center coordinates of the observation. If this option +! is > 1, the model atmosphere is sampled at multiple points within the +! footprint corners and the result is averaged. The sampling points are +! aranged as an n x n rectangle. +! Example: if obs.column.footprint_samples_dim=3, the model is sampled +! at 3*3=9 points in the footprint area. +! Must be set to 1. Code for larger values is in preparation. +obs.column.footprint_samples_dim : 1 + + +! Option to not run WRF in case output files are already present +! for the current time period. This is a development option that +! is useful for testing something in CTDAS after the transport +! model is run. +! True/False (default if not set: False) +! no_rerun_wrf : False + +! Option to save a copy of wrfout*, wrfrst* and rsl* files each time +! WRF is run. This is another option for development/debugging and not +! for production. The purpose is to rerun CTDAS without actually running the +! transport model, because WRF usually takes a long time to run. It can also +! be used to check wrfout contents of a cycle, because they get overwritten +! by subsequent steps. I use it to debug the optimization step without +! rerunning wrf for the da step for each try. So far this recovery step +! needs to be done manually, though. So for a new run I copy the wrfout +! files back to the WRF run directory, and set no_rerun_wrf (above) to +! True. Then I start CTDAS and it will see that the wrfout for the period +! already exist and skip running WRF. Note that this will also happen in +! the advance step, because wrfout files for its period already exist. If +! you want CTDAS to do a proper advance step, you need to wait for the +! optimization to start, then while it is still running move the wrfout files +! out of the run directory again. Then when the advance step starts, it will +! rerun WRF because it doesn't find wrfout files. +save_wrf_step : False + +! WRF-Chem and sampling its output are implemented as subprocesses (job +! steps). All remaining options in this file deal with starting them. +! These options and definitions could be made part of the platform +! object in a future update. However, in the original CTDAS, running the +! transport model TM5 is also controlled by TM5-specific options. +! The options here work with the SLURM scheduler on the Dutch national +! supercomputer cartesius. + +! The processes are started with the python module subprocess, which +! takes a shell command as option. The shell command for WRF is: +! run_command + " " (+ run_ntask_opt + ntasks) (+ " " + run_t_opt + run_t) + " ./wrf.exe >& log/wrf.log" +! The shell command for python scripts is: +! run_command_1task + " " (+ run_mem_opt + <mem_str> + mem_unit_str_MB) (+ " " + run_t_opt + run_t) + " python <python skript and options>" +! Variables enclosed by "<>" are computed online. The others are set +! below. Options in brackets are only appended to the command string if +! they are not empty in the settings below. + +! Command to start external process +run_command : srun + +! Some python scripts are launched that require only 1 task. +! For this case, the scheduler SLURM requires to specifically request +! -N1 because otherwise slurm apparently sometimes tries to allocate +! one task to more than one node. See here: +! 
https://stackoverflow.com/questions/24056961/running-slurm-script-with-multiple-nodes-launch-job-steps-with-1-task +run_command_1task : srun -n1 -N1 + +! Define how to pass options to processes started with run_command or +! run_command_1task +! Name of the option of run_command that defines requested memory +! (total, not per cpu). If this is empty, memory limit will not be +! specified +run_mem_opt : --mem= + +! The operator computes memory in MB, might have to add a string +mem_unit_str_MB : M + +! Name of the option of run_command that defines requested wallclock time +! If this is empty, wallclock limit will not be specified. +run_t_opt : -t + +! Name of the option of run_command that defines number of requested tasks +! If this is empty, number of tasks will not be specified +run_ntask_opt : -n + +! Time that a subprocess should request (this is probably not important +! because it's run within the same resource allocation as the parent - +! not tested working without though) +run_t : 0:20:00 + +! Number of tasks that can run on one node +! This is used for computing how much memory to allocate per process of +! calls wrfout_sampler. +cpus_per_node : 24 + +! Available memory per node +! mem per node: On cartesius technically 64, but at least in interactive +! shells on cartesius a bit of a buffer is needed. The value here was +! determined with calls like "srun --mem=61G -n1 sleep 1" inside an +! interactive shell. +mem_per_node : 60 + + + diff --git a/da/statevectors/statevector_wrfchem.py b/da/statevectors/statevector_wrfchem.py new file mode 100644 index 00000000..6236b22e --- /dev/null +++ b/da/statevectors/statevector_wrfchem.py @@ -0,0 +1,332 @@ +"""CarbonTracker Data Assimilation Shell (CTDAS) Copyright (C) 2017 Wouter Peters. +Users are recommended to contact the developers (wouter.peters@wur.nl) to receive +updates of the code. See also: http://www.carbontracker.eu. + +This program is free software: you can redistribute it and/or modify it under the +terms of the GNU General Public License as published by the Free Software Foundation, +version 3. This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with this +program. If not, see <http://www.gnu.org/licenses/>.""" +#!/usr/bin/env python +# ct_statevector_tools.py + +""" +.. module:: statevector +.. moduleauthor:: Friedemann Reum + +Revision History: +File created 2019. + +This module is an extension to any CTDAS statevector that overwrites +write_members_to_file to produce output in the format required by WRF-Chem. +Thus, it can inherit from any other +state vector. In this reference implementation, it inherits from +the baseclass statevector. + * :class:`~da.baseclasses.statevector.StateVector` + * :class:`~da.baseclasses.statevector.EnsembleMember` + +As usual, specific implementations of StateVector objects are done through inheritance form these baseclasses. An example of designing +your own baseclass StateVector we refer to :ref:`tut_chapter5`. + +.. autoclass:: da.baseclasses.statevector.StateVector + +.. 
autoclass:: da.baseclasses.statevector.EnsembleMember + +""" + +import os +import logging +import numpy as np +from datetime import timedelta +import da.tools.io4 as io +from da.statevectors.statevector_baseclass import StateVector, EnsembleMember + +identifier = 'Statevector for WRFChem' +version = '0.0' + +################### Begin Class WRFChemStateVector ################### +class WRFChemStateVector(StateVector): + """ + This module is an extension to any CTDAS statevector that overwrites + write_members_to_file to produce output in the format required by WRF-Chem. + Thus, it can inherit from any other state vector. In the current + implementation, it inherits from CO2GriddedStateVector. + + .. automethod:: da.baseclasses.statevector.StateVector.write_members_to_file + + """ + + def __init__(self): + self.ID = identifier + self.version = version + logging.info('Statevector object initialized: %s (%s)' % + (self.ID, self.version)) + + def setup(self, dacycle): + """ + Copied from baseclasses.statevector and + - Replaced transcom region with a dummy. + - Added do_offset + - Added overlaying processes + """ + + self.nlag = int(dacycle['time.nlag']) + self.nmembers = int(dacycle['da.optimizer.nmembers']) + self.nparams = int(dacycle.dasystem['nparameters']) + self.do_offset = "sigma_offset" in list(dacycle.dasystem.keys()) + # >>> Overlaying processes: + self.n_emis_proc = int(dacycle.dasystem['n_emis_proc']) + # Get number of flux parameters per process + nparams_proc = float(self.nparams - int(self.do_offset)) / float(self.n_emis_proc) + if int(nparams_proc)==nparams_proc: + self.nparams_proc = int(nparams_proc) + else: + raise ValueError("Number of emission processes doesn't divide number of flux parameters") + # <<< Overlaying processes + + self.nobs = 0 + + self.obs_to_assimilate = () # empty containter to hold observations to assimilate later on + + # These list objects hold the data for each time step of lag in the system. Note that the ensembles for each time step consist + # of lists of EnsembleMember objects, we define member 0 as the mean of the distribution and n=1,...,nmembers as the spread. + + self.ensemble_members = list(range(self.nlag)) + + for n in range(self.nlag): + self.ensemble_members[n] = [] + + + # This specifies the file to read with the gridded mask at 1x1 degrees. Each gridbox holds a number that specifies the parametermember + # that maps onto it. From this map, a dictionary is created that allows a reverse look-up so that we can map parameters to a grid. 
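+        # (Illustration: if a cell of the regions file holds the value 3,
+        # its flattened index ends up in griddict[3] below, i.e. one
+        # scaling parameter per emission process acts on all cells
+        # labelled 3. Applying the scaling to fluxes is handled by the
+        # WRF-Chem observation operator, not here.)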
+ + mapfile = os.path.join(dacycle.dasystem['regionsfile']) + ncf = io.ct_read(mapfile, 'read') + self.gridmap = ncf.get_variable('regions').astype(int) + # >>> edit freum: Not using transcom regions mapping in regional setup with wrfchem + #self.tcmap = ncf.get_variable('transcom_regions') + # <<< edit freum + ncf.close() + + #logging.debug("A TransCom map on 1x1 degree was read from file %s" % dacycle.dasystem['regionsfile']) + #logging.debug("A parameter map on 1x1 degree was read from file %s" % dacycle.dasystem['regionsfile']) + logging.debug("A parameter map was read from file %s" % dacycle.dasystem['regionsfile']) + + # Create a dictionary for state <-> gridded map conversions + + nparams = self.gridmap.max() + + # >>> do_offset + # Check that nparams is compatible with the one + # given in dasystem rc-file + + # if nparams + int(self.do_offset) != self.nparams: + if nparams*self.n_emis_proc + int(self.do_offset) != self.nparams: # overlaying parameters + raise ValueError("Number of parameters in regionsfile incompatible with nparameters in dasystem-rcfile.") + # <<< do_offset + + self.griddict = {} + for r in range(1, int(nparams) + 1): + sel = np.nonzero(self.gridmap.flatten() == r) + if len(sel[0]) > 0: + self.griddict[r] = sel + + logging.debug("A dictionary to map grids to states and vice versa was created") + + # Create a matrix for state <-> TransCom conversions + # >>> edit freum + # Regional simulations (wrfchem): only dummy tcmatrix. Would only be used in analysis pipeline, which I don't run for wrfchem. + + self.tcmatrix = np.zeros((self.nparams, 23), 'float') + + # for r in range(1, self.nparams + 1): + # sel = np.nonzero(self.gridmap.flatten() == r) + # if len(sel[0]) < 1: + # continue + # else: + # n_tc = set(self.tcmap.flatten().take(sel[0])) + # if len(n_tc) > 1: + # logging.error("Parameter %d seems to map to multiple TransCom regions (%s), I do not know how to handle this" % (r, n_tc)) + # raise ValueError + # self.tcmatrix[r - 1, int(n_tc.pop()) - 1] = 1.0 + + # logging.debug("A matrix to map states to TransCom regions and vice versa was created") + logging.debug("A dummy matrix to map states to TransCom regions and vice versa was created") + # <<< edit freum + + # Create a mask for species/unknowns + + self.make_species_mask() + + def make_new_ensemble(self, lag, covariancematrix=None): + """ + Copied from baseclass statevector, but switched to + fresh initialization of each new ensemble (instead of + with previous lags) and handling of self.do_offset. + + """ + + if covariancematrix is None: + covariancematrix = np.identity(self.nparams) + + # Make a cholesky decomposition of the covariance matrix + + + try: + _, s, _ = np.linalg.svd(covariancematrix) + except: + s = np.linalg.svd(covariancematrix, full_matrices=1, compute_uv=0) #Cartesius fix + dof = np.sum(s) ** 2 / sum(s ** 2) + C = np.linalg.cholesky(covariancematrix) + + logging.debug('Cholesky decomposition has succeeded ') + logging.info('Appr. degrees of freedom in covariance matrix is %s' % (int(dof))) + + + # Create mean values + + newmean = np.ones(self.nparams, float) # standard value for a new time step is 1.0 + + # >>> do_offset + if self.do_offset: + newmean[-1] = 0. 
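+            # (The offset parameter is additive on the initial/boundary
+            # conditions, so its prior mean is 0, unlike the flux scaling
+            # factors whose prior mean is 1.)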
+ # <<< do_offset + + # If this is not the start of the filter, average previous two optimized steps into the mix + + # freum vvvv + + # Switch off the initialization with previously optimized state vector + # for my project + #if lag == self.nlag - 1 and self.nlag >= 3: + # newmean += self.ensemble_members[lag - 1][0].param_values + \ + # self.ensemble_members[lag - 2][0].param_values + # newmean = newmean / 3.0 + + # freum ^^^^ + + # Create the first ensemble member with a deviation of 0.0 and add to list + + newmember = EnsembleMember(0) + newmember.param_values = newmean.flatten() # no deviations + self.ensemble_members[lag].append(newmember) + + # Create members 1:nmembers and add to ensemble_members list + + for member in range(1, self.nmembers): + rands = np.random.randn(self.nparams) + + newmember = EnsembleMember(member) + newmember.param_values = np.dot(C, rands) + newmean + self.ensemble_members[lag].append(newmember) + + logging.debug('%d new ensemble members were added to the state vector # %d' % (self.nmembers, (lag + 1))) + + + def write_members_to_file(self, lag, outdir, endswith='.nc', obsoperator=None): + """ + :param: lag: Which lag step of the filter to write, must lie in range [1,...,nlag] + :param: outdir: Directory where to write files + :param: obsoperator: observationoperator object + :param: endswith: Optional label to add to the filename, default is simply .nc + :rtype: None + + Write ensemble member information to file for transport model. + If observation operator is None or its ID is anything except + "wrfchem", the method of the baseclass is called. If the + observation operator ID is "wrfchem", the method of the + observation operator is called. The reason to implement it + there was originally to avoid teaching the statevector + object about WRF domains. + """ + + # Decide what to call based on obsoperator + if obsoperator is None: + # logging.info("Calling statevector._write_members_to_file_standard") + # self._write_members_to_file_standard(lag, outdir, endswith) + logging.info("Calling super().write_members_to_file") + super().write_members_to_file(lag, outdir, endswith, obsoperator) + else: + if obsoperator.ID=='wrfchem': + logging.info("Calling obsoperator.write_members_to_file") + obsoperator.write_members_to_file(lag, outdir, self) + else: # call the standard method + # logging.info("Calling statevector._write_members_to_file_standard") + # self._write_members_to_file_standard(lag, outdir, endswith) + logging.info("Calling super().write_members_to_file") + super().write_members_to_file(lag, outdir, endswith, obsoperator) + + + # def _write_members_to_file_standard(self, lag, outdir, endswith='.nc'): + # """ + # Copy of baseclass write_members_to_file. Documented in baseclass + # statevector. 
+ # Obsolete, replaced with calling super().write_members_to_file() + # """ + # + # members = self.ensemble_members[lag] + # + # for mem in members: + # filename = os.path.join(outdir, 'parameters.%03d%s' % (mem.membernumber, endswith)) + # ncf = io.CT_CDF(filename, method='create') + # dimparams = ncf.add_params_dim(self.nparams) + # dimgrid = ncf.add_latlon_dim() + # + # data = mem.param_values + # + # savedict = io.std_savedict.copy() + # savedict['name'] = "parametervalues" + # savedict['long_name'] = "parameter_values_for_member_%d" % mem.membernumber + # savedict['units'] = "unitless" + # savedict['dims'] = dimparams + # savedict['values'] = data + # savedict['comment'] = 'These are parameter values to use for member %d' % mem.membernumber + # ncf.add_data(savedict) + # + # griddata = self.vector2grid(vectordata=data) + # + # savedict = io.std_savedict.copy() + # savedict['name'] = "parametermap" + # savedict['long_name'] = "parametermap_for_member_%d" % mem.membernumber + # savedict['units'] = "unitless" + # savedict['dims'] = dimgrid + # savedict['values'] = griddata.tolist() + # savedict['comment'] = 'These are gridded parameter values to use for member %d' % mem.membernumber + # ncf.add_data(savedict) + # + # ncf.close() + # + # logging.debug('Successfully wrote data from ensemble member %d to file (%s) ' % (mem.membernumber, filename)) + + def get_covariance(self, date, dacycle): + """ + Return a prior statevector covariance matrix + Here, the covariance matrix is diagonal and the values + are the squares of the "sigma" parameters in the + dasystem rc file. + """ + + if self.n_emis_proc==1: + sigma = float(dacycle.dasystem['sigma_scale']) + covariancematrix = np.identity(self.nparams) * sigma**2 + else: + # overlaying parameters: + covariancematrix = np.identity(self.nparams) + for nproc in range(1, self.n_emis_proc+1): + sigma_nproc = float(dacycle.dasystem['sigma_scale_' + str(nproc)]) + ind = list(range((nproc-1)*self.nparams_proc, nproc*self.nparams_proc)) + covariancematrix[ind, ind] = sigma_nproc**2 + if self.do_offset: + covariancematrix[-1, -1] = float(dacycle.dasystem["sigma_offset"])**2 + + + return covariancematrix + +################### End Class StateVector ################### + +if __name__ == "__main__": + pass + diff --git a/da/tools/wrfchem/__init__.py b/da/tools/wrfchem/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/da/tools/wrfchem/utilities.py b/da/tools/wrfchem/utilities.py new file mode 100644 index 00000000..157149ff --- /dev/null +++ b/da/tools/wrfchem/utilities.py @@ -0,0 +1,319 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" + +Created on Wed Sep 18 16:03:02 2019 + +@author: friedemann +""" + +import os +import glob +import logging +import subprocess +import tempfile +import copy +import netCDF4 as nc +import numpy as np + +class utilities(object): + """ + Collection of utilities for wrfchem observation operator + that do not depend on other CTDAS modules + """ + + def __init__(self): + pass + + @staticmethod + def get_slicing_ids(N, nproc, nprocs): + """ + Purpose + ------- + For parallel processing, figure out which samples to process + by this process. + + Parameters + ---------- + N : int + Length to slice + nproc : int + id of this process (0... nprocs-1) + nprocs : int + Number of processes that work on the task. + + Output + ------ + Slicing indices id0, id1 + Usage + ----- + ..code-block:: python + + id0, id1 = get_slicing_ids(N, nproc, nprocs) + field[id0:id1, ...] 
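+
+        Example
+        -------
+        For N = 10 and nprocs = 3, processes 0, 1 and 2 get the slices
+        [0:3], [3:6] and [6:10] of the observation array.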
+ """ + + f0 = float(nproc)/float(nprocs) + id0 = int(np.floor(f0*N)) + + f1 = float(nproc+1)/float(nprocs) + id1 = int(np.floor(f1*N)) + + if id0==id1: + raise ValueError("id0==id1. Probably too many processes.") + return id0, id1 + + + + @classmethod + def cat_ncfiles(cls, path, in_arg, cat_dim, out_file, in_pattern=False, rm_original=True): + """ + Combine output of all processes into 1 file + If in_pattern, a pattern is provided instead of a file list. + This has the advantage that it can be interpreted by the shell, + because there are problems with long argument lists. + + This calls ncrcat from the nco library. If nco is not available, + rewrite this function. Note: I first tried to do this with + "cdo cat", but it messed up sounding_id + (see https://code.mpimet.mpg.de/boards/1/topics/908) + """ + + # To preserve dimension names, we start from one of the existing + # slice files instead of a new file. + + # Do this in path to avoid long command line arguments and history + # entries in outfile. + cwd = os.getcwd() + os.chdir(path) + + if in_pattern: + if not isinstance(in_arg, str): + raise TypeError("in_arg must be a string if in_pattern is True.") + file_pattern = in_arg + in_files = glob.glob(file_pattern) + else: + if isinstance(in_arg, list): + raise TypeError("in_arg must be a list if in_pattern is False.") + in_files = in_arg + + if len(in_files) == 0: + logging.error("Nothing to do.") + # Change back to previous directory + os.chdir(cwd) + return + + # Sorting is important! + in_files.sort() + + # ncrcat needs total number of soundings, count + Nobs = 0 + for f in in_files: + ncf = nc.Dataset(f, "r") + Nobs += len(ncf.dimensions[cat_dim]) + ncf.close() + + # Cat files + cmd_ = "ncrcat -h -O -d " + cat_dim + ",0,%d"%(Nobs-1) + if in_pattern: + cmd = cmd_ + " " + file_pattern + " " + out_file + # If PIPE is used here, it gets clogged, and the process + # stops without error message (see also + # https://thraxil.org/users/anders/posts/2008/03/13/Subprocess-Hanging-PIPE-is-your-enemy/) + # Hence, piping the output to a temporary file. + proc = subprocess.Popen(cmd, shell=True, + stdout=tempfile.TemporaryFile(), + stderr=tempfile.TemporaryFile()) + else: + cmdsplt = cmd_.split() + in_files + [out_file] + proc = subprocess.Popen(cmdsplt, stdout=tempfile.TemporaryFile(), stderr=tempfile.TemporaryFile()) + cmd = " ".join(cmdsplt) + + proc.wait() + + # This is probably useless since the output is piped to a + # tempfile. + retcode = cls.check_out_err(proc) + + if retcode != 0: + msg = "Something went wrong in the sampling. Command: " + cmd + logging.error(msg) + raise OSError(msg) + + # Delete slice files + if rm_original: + logging.info("Deleting slice files.") + for f in in_files: + os.remove(f) + + logging.info("Sampled WRF output written to file.") + + # Change back to previous directory + os.chdir(cwd) + + + @staticmethod + def check_out_err(process): + """Displays stdout and stderr, returns returncode of the + process. + """ + + # Get process messages + out, err = process.communicate() + + # Print output + def to_str(str_or_bytestr): + """If argument is of type str, return argument. If + argument is of type bytes, return decoded str""" + if isinstance(str_or_bytestr, str): + return str_or_bytestr + elif isinstance(str_or_bytestr, bytes): + return str(str_or_bytestr, 'utf-8') + else: + msg = "str_or_bytestr is " + str(type(str_or_bytestr)) + \ + ", should be str or bytestr." 
+ raise TypeError(msg) + + logging.debug("Subprocess output:") + if out is None: + logging.debug("No output.") + elif isinstance(out, list): + for line in out: + logging.debug(to_str(line.rstrip())) + else: + logging.debug(to_str(out.rstrip())) + + # Handle errors + if process.returncode != 0: + logging.error("subprocess error") + logging.error("Returncode: %s", str(process.returncode)) + logging.error("Message, if any:") + if not err is None: + for line in err: + logging.error(line.rstrip()) + + return process.returncode + + @classmethod + def get_index_groups(cls, *args): + """ + Input: + numpy arrays with 1 dimension or lists, all same length + Output: + Dictionary of lists of indices that have the same + combination of input values. + """ + + try: + # If pandas is available, it makes a pandas DataFrame and + # uses its groupby-function. + import pandas as pd + + args_array = np.array(args).transpose() + df = pd.DataFrame(args_array) + groups = df.groupby(list(range(len(args)))).indices + + except ImportError: + # If pandas is not available, use an own implementation of groupby. + # Recursive implementation. It's fast. + args_array = np.array(args).transpose() + groups = cls._group(args_array) + + return groups + + @classmethod + def _group(cls, a): + """ + Reimplementation of pandas.DataFrame.groupby.indices because + py 2.7 on cartesius isn't compatible with pandas. + Unlike the pandas function, this always uses all columns of the + input array. + + Parameters + ---------- + a : numpy.ndarray (2D) + Array of indices. Each row is a combination of indices. + + Returns + ------- + groups : dict + The keys are the unique combinations of indices (rows of a), + the values are the indices of the rows of a equal the key. + """ + + # This is a recursive function: It makes groups according to the + # first columnm, then calls itself with the remaining columns. + # Some index juggling. + + # Group according to first column + UI = list(set(a[:, 0])) + groups0 = dict() + for ui in UI: + # Key must be a tuple + groups0[(ui, )] = [i for i, x in enumerate(a[:, 0]) if x == ui] + + if a.shape[1] == 1: + # If the array only has one column, we're done + return groups0 + else: + # If the array has more than one column, we group those. + groups = dict() + for ui in UI: + # Group according to the remaining columns + subgroups_ui = cls._group(a[groups0[(ui, )], 1:]) + # Now the index juggling: Add the keys together and + # locate values in the original array. + for key in list(subgroups_ui.keys()): + # Get indices of bigger array + subgroups_ui[key] = [groups0[(ui, )][n] for n in subgroups_ui[key]] + # Add the keys together + groups[(ui, ) + key] = subgroups_ui[key] + + return groups + + + @staticmethod + def apply_by_group(func, array, groups, grouped_args=None, *args, **kwargs): + """ + Apply function 'func' to a numpy array by groups of indices. + 'groups' can be a list of lists or a dictionary with lists as + values. + + If 'array' has more than 1 dimension, the indices in 'groups' + are for the first axis. + + If 'grouped_args' is not None, its members are added to + 'kwargs' after slicing. + + *args and **kwargs are passed through to 'func'. + + Example: + apply_by_group(np.mean, np.array([0., 1., 2.]), [[0, 1], [2]]) + Output: + array([0.5, 2. 
]) + """ + + shape_in = array.shape + shape_out = list(shape_in) + shape_out[0] = len(groups) + array_out = np.ndarray(shape_out, dtype=array.dtype) + + if type(groups) == list: + # Make a dictionary + groups = {n: groups[n] for n in range(len(groups))} + + if not grouped_args is None: + kwargs0 = copy.deepcopy(kwargs) + for n in range(len(groups)): + k = list(groups.keys())[n] + + # Add additional arguments that need to be grouped to kwargs + if not grouped_args is None: + kwargs = copy.deepcopy(kwargs0) + for ka, v in grouped_args.items(): + kwargs[ka] = v[groups[k], ...] + + array_out[n, ...] = np.apply_along_axis(func, 0, array[groups[k], ...], *args, **kwargs) + + return array_out + diff --git a/da/tools/wrfchem/wrfchem_helper.py b/da/tools/wrfchem/wrfchem_helper.py new file mode 100644 index 00000000..b4b39929 --- /dev/null +++ b/da/tools/wrfchem/wrfchem_helper.py @@ -0,0 +1,990 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Created on Mon Jul 22 15:03:02 2019 + +This class contains helper functions mainly for sampling WRF-Chem, but a +few are also needed by the WRF-Chem column observation operator. + +@author: friedemann +""" + +# Instructions for pylint: +# pylint: disable=too-many-instance-attributes +# pylint: disable=W0201 +# pylint: disable=C0301 +# pylint: disable=E1136 +# pylint: disable=E1101 + + +import os +import shutil +import re +import glob +import bisect +import copy +import numpy as np +import netCDF4 as nc +import datetime as dt +import wrf +import f90nml +import pickle + +# CTDAS modules +import da.tools.io4 as io +from da.tools.wrfchem.utilities import utilities + + +class WRFChemHelper(object): + """Contains helper functions for sampling WRF-Chem""" + def __init__(self, settings): + self.settings = settings + + def validate_settings(self, needed_items=[]): + """ + This is based on WRFChemOO._validate_rc + """ + + if len(needed_items)==0: + return + + for key in needed_items: + if key not in self.settings: + msg = "Missing a required value in settings: %s" % key + raise IOError(msg) + + @staticmethod + def read_namelist(dirname): + """Read run settings from namelist.input file in dirname""" + nml_file = os.path.join(dirname, "namelist.input") + namelist = f90nml.read(nml_file) + # Some values are lists when running with nested domains, but + # not when running with only one domain. Ensure here those + # values are always lists. + list_vars = ["e_we", + "e_sn", + "parent_id", + "parent_grid_ratio", + "i_parent_start", + "j_parent_start" + ] + for v in list_vars: + if not isinstance(namelist["domains"][v], list): + namelist["domains"][v] = [namelist["domains"][v]] + + return namelist + + @staticmethod + def get_pressure_boundaries_paxis(p_axis, p_surf): + """ + Arguments + --------- + p_axis (:class:`array-like`) + Pressure at mid points of layers + p_surf (:class:`numeric`) + Surface pressure + Output + ------ + Pressure at layer boundaries + """ + + pb = np.array([float("nan")]*(len(p_axis)+1)) + pb[0] = p_surf + + for nl in range(len(pb)-1): + pb[nl+1] = pb[nl] + 2*(p_axis[nl] - pb[nl]) + + return pb + + @staticmethod + def get_pressure_boundaries_znw(znw, p_surf, p_top): + """ + Arguments + --------- + ZNW (:class:`ndarray`) + Eta coordinates of z-staggered WRF grid. For each + observation (2D) + p_surf (:class:`ndarray`) + Surface pressure (1D) + p_top (:class:`ndarray`) + Top-of-atmosphere pressure (1D) + Output + ------ + Pressure at layer boundaries + + CAVEATS + ------- + Maybe I should rather use P_HYD? Well, Butler et al. 
2018 + (https://www.geosci-model-dev-discuss.net/gmd-2018-342/) used + znu and surface pressure to compute "WRF midpoint layer + pressure". + + For WRF it would be more consistent to interpolate to levels. + See also comments in code. + """ + + return znw*(p_surf-p_top) + p_top + + + @staticmethod + def get_int_coefs(pb_ret, pb_mod, level_def): + """ + Computes a coefficients matrix to transfer a model profile onto + a retrieval pressure axis. + + If level_def=="layer_average", this assumes that profiles are + constant in each layer of the retrieval, bound by the pressure + boundaries pb_ret. In this case, the WRF model layer is treated + in the same way, and coefficients integrate over the assumed + constant model layers. This works with non-staggered WRF + variables (on "theta" points). However, this is actually not how + WRF is defined, and the implementation should be changed to + z-staggered variables. Details for this change are in a comment + at the beginning of the code. + + If level_def=="pressure_boundary" (IMPLEMENTATION IN PROGRESS), + assumes that profiles, kernel and pwf are defined at pressure + boundaries that don't have a thickness (this is how OCO-2 data + are defined, for example). In this case, the coefficients + linearly interpolate adjacent model level points. This is + incompatible with the treatment of WRF in the above-described + layer-average assumption, but is closer to how WRF is actually + defined. The exception is that pb_mod is still constructed and + non-staggered variables are not defined at psurf. This can only + be fixed by switching to z-staggered variables. + + In cases where retrieval surface pressure is higher than model + surface pressure, and in cases where retrieval top pressure is + lower than model top pressure, the model profile will be + extrapolated with constant tracer mixing ratios. In cases where + retrieval surface pressure is lower than model surface pressure, + and in cases where retrieval top pressure is higher than model + top pressure, only the parts of the model column that fall + within the retrieval presure boundaries are sampled. + + Arguments + --------- + pb_ret (:class:`array_like`) + Pressure boundaries of the retrieval column + pb_mod (:class:`array_like`) + Pressure boundaries of the model column + level_def (:class:`string`) + "layer_average" or "pressure_boundary" (IMPLEMENTATION IN + PROGRESS). Refers to the retrieval profile. + + Note 2021-09-13: Inspected code for pressure_boundary. + Should be correct. Interpolates linearly between two model + levels. + + + Returns + ------- + coefs (:class:`array_like`) + Integration coefficient matrix. Each row sums to 1. + + Usage + ----- + .. code-block:: python + + import numpy as np + pb_ret = np.linspace(900., 50., 5) + pb_mod = np.linspace(1013., 50., 7) + model_profile = 1. - np.linspace(0., 1., len(pb_mod)-1)**3 + coefs = get_int_coefs(pb_ret, pb_mod, "layer_average") + retrieval_profile = np.matmul(coefs, model_profile) + """ + + if level_def == "layer_average": + # This code assumes that WRF variables are constant in + # layers, but they are defined on levels. This can be seen + # for example by asking wrf.interplevel for the value of a + # variable that is defined on the mass grid ("theta points") + # at a pressure slightly higher than the pressure on its + # grid (wrf.getvar(ncf, "p")), it returns nan. So There is + # no extrapolation. There are no layers. There are only + # levels. 
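# A minimal, self-contained sketch of the "layer_average" weighting described
# in the comment above: the weight of model layer j in retrieval layer i is the
# pressure overlap of the two intervals, normalised so each row sums to 1. The
# function name (overlap_coefs) and both pressure grids are hypothetical; the
# index bookkeeping further below does the same thing, with the surface/top
# extension applied first.

import numpy as np

def overlap_coefs(pb_ret, pb_mod):
    """Row-normalised pressure-overlap matrix; both axes ordered surface -> top."""
    pb_mod = pb_mod.copy()
    pb_mod[0] = max(pb_mod[0], pb_ret[0])        # extend lowest model layer downward
    pb_mod[-1] = min(pb_mod[-1], pb_ret[-1])     # extend top model layer upward
    coefs = np.zeros((len(pb_ret) - 1, len(pb_mod) - 1))
    for i in range(len(pb_ret) - 1):
        for j in range(len(pb_mod) - 1):
            hi = min(pb_ret[i], pb_mod[j])           # higher-pressure edge of overlap
            lo = max(pb_ret[i + 1], pb_mod[j + 1])   # lower-pressure edge of overlap
            coefs[i, j] = max(hi - lo, 0.0)
        coefs[i, :] /= coefs[i, :].sum()
    return coefs

pb_ret = np.linspace(900., 50., 5)
pb_mod = np.linspace(1013., 50., 7)
print(overlap_coefs(pb_ret, pb_mod).sum(axis=1))     # [1. 1. 1. 1.]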
+ # In addition, this page here: + # https://www.openwfm.org/wiki/How_to_interpret_WRF_variables + # says that to find values at theta-points of a variable + # living on u-points, you interpolate linearly. That's the + # other way around from what I would do if I want to go from + # theta to staggered. + # WRF4.0 user guide: + # - ungrib can interpolate linearly in p or log p + # - real.exe comes with an extrap_type namelist option, that + # extrapolates constantly BELOW GROUND. + # This would mean the correct way would be to integrate over + # a piecewise-linear function. It also means that I really + # want the value at surface level, so I'd need the CO2 + # fields on the Z-staggered grid ("w-points")! Interpolate + # the vertical in p with wrf.interp1d, example: + # wrf.interp1d(np.array(rh.isel(south_north=1, west_east=0)), + # np.array(p.isel(south_north=1, west_east=0)), + # np.array(988, 970)) + # (wrf.interp1d gives the same results as wrf.interplevel, + # but the latter just doesn't want to work with single + # columns (32,1,1), it wants a dim>1 in the horizontal + # directions) + # So basically, I can keep using pb_ret and pb_mod, but it + # would be more accurate to do the piecewise-linear + # interpolation and the output matrix will have 1 more + # value in each dimension. + + # Calculate integration weights by weighting with layer + # thickness. This assumes that both axes are ordered + # psurf to ptop. + coefs = np.ndarray(shape=(len(pb_ret)-1, len(pb_mod)-1)) + coefs[:] = 0. + + # Extend the model pressure grid if retrieval encompasses + # more. + pb_mod_tmp = copy.deepcopy(pb_mod) + + # In case the retrieval pressure is higher than the model + # surface pressure, extend the lowest model layer. + if pb_mod_tmp[0] < pb_ret[0]: + pb_mod_tmp[0] = pb_ret[0] + + # In case the model doesn't extend as far as the retrieval, + # extend the upper model layer upwards. + if pb_mod_tmp[-1] > pb_ret[-1]: + pb_mod_tmp[-1] = pb_ret[-1] + + # For each retrieval layer, this loop computes which + # proportion falls into each model layer. + for nret in range(len(pb_ret)-1): + + # 1st model pressure boundary index = the one before the + # first boundary with lower pressure than high-pressure + # retrieval layer boundary. + model_lower = pb_mod_tmp < pb_ret[nret] + id_model_lower = model_lower.nonzero()[0] + id_min = id_model_lower[0]-1 + + # Last model pressure boundary index = the last one with + # higher pressure than low-pressure retrieval layer + # boundary. + model_higher = pb_mod_tmp > pb_ret[nret+1] + + id_model_higher = model_higher.nonzero()[0] + + if len(id_model_higher) == 0: + #id_max = id_min + raise ValueError("This shouldn't happen. Debug.") + else: + id_max = id_model_higher[-1] + + # By the way, in case there is no model level with + # higher pressure than the next retrieval level, + # id_max must be the same as id_min. + + # For each model layer, find out how much of it makes up this + # retrieval layer + for nmod in range(id_min, id_max+1): + if (nmod == id_min) & (nmod != id_max): + # Part of 1st model layer that falls within + # retrieval layer + coefs[nret, nmod] = pb_ret[nret] - pb_mod_tmp[nmod+1] + elif (nmod != id_min) & (nmod == id_max): + # Part of last model layer that falls within + # retrieval layer + coefs[nret, nmod] = pb_mod_tmp[nmod] - pb_ret[nret+1] + elif (nmod == id_min) & (nmod == id_max): + # id_min = id_max, i.e. 
model layer encompasses + # retrieval layer + coefs[nret, nmod] = pb_ret[nret] - pb_ret[nret+1] + else: + # Retrieval layer encompasses model layer + coefs[nret, nmod] = pb_mod_tmp[nmod] - pb_mod_tmp[nmod+1] + + coefs[nret, :] = coefs[nret, :]/sum(coefs[nret, :]) + + # I tested the code with many cases, but I'm only 99.9% sure + # it works for all input. Hence a test here that the + # coefficients sum to 1 and dump the data if not. + sum_ = np.abs(coefs.sum(1) - 1) + if np.any(sum_ > 2.*np.finfo(sum_.dtype).eps): + dump = dict(pb_ret=pb_ret, + pb_mod=pb_mod, + level_def=level_def) + fp = "int_coefs_dump.pkl" + with open(fp, "w") as f: + pickle.dump(dump, f, 0) + + msg_fmt = "Something doesn't sum to 1. Arguments dumped to: %s" + raise ValueError(msg_fmt % fp) + + elif level_def=="pressure_boundary": + #msg = "level_def is pressure_boundary. Implementation not complete." + ##logging.error(msg) + #raise ValueError(msg) + # Note 2021-09-13: Inspected the code. Should be correct. + + # Go back to pressure midpoints for model... + # Change this line to p_mod = pb_mod for z-staggered + # variables + p_mod = pb_mod[1:] - 0.5*np.diff(pb_mod) + + coefs = np.ndarray(shape=(len(pb_ret), len(pb_mod)-1)) + coefs[:] = 0. + + # For each retrieval pressure level, compute linear + # interpolation coefficients + for nret in range(len(pb_ret)): + nmod_list = (p_mod < pb_ret[nret]).nonzero()[0] + if(len(nmod_list)>0): + nmod = nmod_list[0] - 1 + if nmod==-1: + # Constant extrapolation at surface + nmod = 0 + coef = 1. + else: + # Normal case: + coef = (pb_ret[nret]-p_mod[nmod+1])/(p_mod[nmod]-p_mod[nmod+1]) + else: + # Constant extrapolation at atmosphere top + nmod = len(p_mod)-2 + coef=0. + + coefs[nret, nmod] = coef + coefs[nret, nmod+1] = 1.-coef + + else: + msg = "Unknown level_def: " + level_def + raise ValueError(msg) + + return coefs + + @staticmethod + def get_pressure_weighting_function(pressure_boundaries, rule): + """ + Compute pressure weighting function according to 'rule'. + Valid rules are: + - simple (=layer thickness) + - connor2008 (not implemented) + """ + if rule == 'simple': + pwf = np.abs(np.diff(pressure_boundaries)/np.ptp(pressure_boundaries)) + else: + raise NotImplementedError("Rule %s not implemented" % rule) + + return pwf + + def locate_domain(self, lat, lon): + """ + Input + ----- + lat: Single values or lists or np.arrays + lon: Single values or lists or np.arrays + + Output + ------ + - xy-coordinates in finest domain that contains the coordinates + - finest domain + + NOTE + ---- + self.open_wrf_location_files() must be run first + """ + + if not hasattr(self, "_loc_files"): + raise RuntimeError("Must call open_wrf_loc_files first.") + + # Work with arrays internally. + lat = np.array(lat, ndmin=1) + lon = np.array(lon, ndmin=1) + + # Get coordinates of the observation in all domains + ndomains = self.namelist["domains"]["max_dom"] + + # Get domain sizes in xy on mass (=unstaggered) grid (hence the -1) + dom_size_x = np.array(self.namelist['domains']['e_we'], dtype=float, ndmin=1) - 1 + dom_size_y = np.array(self.namelist['domains']['e_sn'], dtype=float, ndmin=1) - 1 + + # Initialize output + xy = np.zeros((len(lat), 2)) + xy[:] = np.nan + finest_domain = np.zeros((len(lat), ), int) + + # Since wrf.ll_to_xy takes very long, I save a bit of time here + # by starting in the finest domain and processing only + # observations that weren't previously found. 
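# A minimal sketch of the finest-domain search implemented in the loop below,
# reduced to plain lat/lon boxes so it runs without WRF input files. The boxes,
# points and the inside() helper are hypothetical; in the method itself
# wrf.ll_to_xy() plus the half-grid-cell test play that role.

import numpy as np

domain_boxes = [(40., 60., 0., 20.),      # d01 (coarse): lat_min, lat_max, lon_min, lon_max
                (48., 54., 6., 12.)]      # d02 (nest)

def inside(box, lat, lon):
    return box[0] <= lat <= box[1] and box[2] <= lon <= box[3]

lats = np.array([51.5, 45.0, 70.0])
lons = np.array([9.0, 15.0, 9.0])
finest_domain = np.zeros(len(lats), int)          # 0 = outside all domains

for n in range(len(domain_boxes) - 1, -1, -1):    # finest nest first
    todo = np.where(finest_domain == 0)[0]        # only points not yet assigned
    for i in todo:
        if inside(domain_boxes[n], lats[i], lons[i]):
            finest_domain[i] = n + 1              # domains are numbered from 1
print(finest_domain)                              # [2 1 0]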
+ for n in range(ndomains-1, -1, -1): + # Get xy for this domain + # Only process what you haven't processed + sel = np.where(finest_domain == 0)[0] + + # In case all domains where set + if len(sel)==0: + break + + x, y = wrf.ll_to_xy(wrfin=self._loc_files[n], + latitude=lat[sel], + longitude=lon[sel], + meta=False, + as_int=False) + + + # For each domain, check if the observation is inside the + # domain extent + # I put the edges on -0.5 and e_we/e_sn - 0.5. + # Reason is that wrf.ll_to_xy(..., as_int = True) returns + # -0.2 as 0, so it's inside the domain. + # Also, for e_we=99, e_sn = 92, ref_lon=9, ref_lat=51.5: + # wrf.ll_to_xy(wrfin=self._loc_files[0], latitude=51.5, longitude=9., meta=False, as_int=False) + # array([48.49996124, 45.00003737]) + # This means that int indices denote exactly the mass-staggered point. + + # To be able to iterate over x and y in case they're scalars: + x = np.array(x, ndmin=1) + y = np.array(y, ndmin=1) + + # Test: inside domain? + # The -1 here are because of 0-based indices, and the -/+ 0.5 are + # because that's halfway to the next mass-staggered point and + # I treat the WRF grid as boxes. + x_in = [(-0.5 <= x_) and (x_ <= dom_size_x[n] - 1 + 0.5) for x_ in x] + y_in = [(-0.5 <= y_) and (y_ <= dom_size_y[n] - 1 + 0.5) for y_ in y] + + # Save domain, x and y at these locations + in_this_dom = np.where(np.logical_and(x_in, y_in))[0] + finest_domain[sel[in_this_dom]] = n + 1 + # If domain = 1, set _all_ xy to see where they end up + if n==0: + xy[sel, 0] = x + xy[sel, 1] = y + else: + xy[sel[in_this_dom], 0] = x[in_this_dom] + xy[sel[in_this_dom], 1] = y[in_this_dom] + + + # Return the indices and domain + return xy, finest_domain + + def get_groups_space_time(self, dat, time_bins, only_in=False): + """ + Returns a dictionary of lists of indices of observations in dat, + where keys are tuples of (time bin, x bin, y bin and wrf + domain), and values are indices of the observations that fall + within this bin. + + Arguments + --------- + + dat : list + List containing measurement locations: + date (same type as time_bins), latitude, longitude + time_bins : list + List of boundaries of time bins (I use datetime.datetime + objects) + only_in : boolean + If only_in is True, groups that fall outside the time_bins + and domain are not returned. + + """ + + # Time groups + id_t = np.array([bisect.bisect_right(time_bins, dat_date) + for dat_date in dat['date']], + int) + + # Spatial groups (indices and domain) + id_xy_f, dom = self.locate_domain(dat['latitude'], dat['longitude']) + + id_xy = np.round(id_xy_f).astype(int) + + + + # Version of only_in with sel: might be faster if sel is short - + # I don't know! But the results for my test case where identical + # (meaning 'domain' looked correct and identical) for both + # options for only_in. + + # if only_in: + # # Yes, 0<id_t is correct: in bisect_right, it means the + # # value is below the lowest sequence value. + # time_in = np.logical_and(0<id_t, id_t<len(time_bins)) + # space_in = dom != 0 + # sel = np.where(np.logical_and(time_in, space_in))[0] + # + # else: + # sel = range(len(id_t)) + # + # indices = get_index_groups(id_t[sel], id_xy[sel, 0], id_xy[sel, 1], dom[sel]) + # + # # Now have to account for sel again! + # if only_in: + # for k, v in indices.iteritems(): + # indices[k] = sel[v] + + # Version of only_in without sel: might be faster if sel is + # long - I don't know! 
But the results for my test case where + # identical (meaning 'domain' looked correct and identical) + + # Indices for all groups: + indices = utilities.get_index_groups(id_t, id_xy[:, 0], id_xy[:, 1], dom) + + # Throw the out-of-domain ones out here already + if only_in: + # Remove the index groups where domain is 0 (= outside of + # domain). Need to iterate over a list, because with an + # iterator, python complains that the dictionary changed + # size during iteration. + for k in list(indices.keys()): + if k[3] == 0: + del indices[k] + # Equivalent to the above 3 lines, don't know what's faster: + # groups = {k: v for k, v in groups.iteritems() if k[3] != 0} + + return indices + + + @staticmethod + def times_in_wrf_file(ncf): + """ + Returns the times in netCDF4.Dataset ncf as datetime object + """ + times_nc = ncf.variables["Times"] + times_chr = [] + for nt in range(times_nc.shape[0]): + # freum 2021-07-11: with the migration to python3, need to + # replace the string - conversion. Hope "utf-8" works + # always. + # times_chr.append(times_nc[nt, :].tostring()) + times_chr.append(str(times_nc[nt, :], "utf-8")) + + times_dtm = [dt.datetime.strptime(t_chr, "%Y-%m-%d_%H:%M:%S") + for t_chr in times_chr] + + return times_dtm + + def wrf_times(self, file_list): + """Read all times in a list of wrf files + + Output + ------ + - 1D-array containing all times + - 1D-array containing start indices of each file + """ + + times = list() + start_indices = np.ndarray((len(file_list), ), int) + for nf in range(len(file_list)): + ncf = nc.Dataset(file_list[nf]) + times_this = self.times_in_wrf_file(ncf) + start_indices[nf] = len(times) + times += times_this + ncf.close() + + return times, start_indices + + def open_wrf_location_files(self): + """ + Keep a description of a small wrf file for each domain in memory + to locate observations. + Appends _loc_file to self. + + Note: Should be edited out of the code. + """ + + ndomains = self.namelist["domains"]["max_dom"] + path = self.settings["run_dir"] + pattern = "wrfinput_d%02d" + self._loc_files = list() + for nd in range(1, ndomains+1): + fp = os.path.join(path, pattern % nd) + self._loc_files.append(nc.Dataset(fp, "r")) + + def close_wrf_location_files(self): + """See _open_wrf_location_files""" + for loc_file in self._loc_files: + loc_file.close() + + def wrf_filename_times(self, prefix): + """Get timestamps in wrf file names in run directory.""" + + # List all filenames + files = self.get_wrf_filenames(prefix + "*") + # Only use d01 files, pattern should be the same for all domains + pattern = os.path.join(self.settings["run_dir"], prefix) + files = [f for f in files if re.search(pattern, f)] + # Extract timestamp from filename + # Format is %Y-%m-%d_%H:%M:%S at the end of the filename + pattern_time = "%Y-%m-%d_%H:%M:%S" + len_tstamp = len(pattern_time) + 2 + times = [dt.datetime.strptime(f[-len_tstamp:], pattern_time) + for f in files] + + return times + + def get_wrf_filenames(self, glob_pattern): + """ + Gets the filenames in self.settings["run_dir"] that follow + glob_pattern, excluding those that end with + self.settings["original_save_suffix"] + """ + path = self.settings["run_dir"] + # All files... 
+ wfiles = glob.glob(os.path.join(path, glob_pattern)) + # All originals + orig_suf = self.settings["original_save_suffix"] + opattern = glob_pattern + orig_suf + ofiles = glob.glob(os.path.join(path, opattern)) + + # All files except all originals: + files = [x for x in wfiles if x not in ofiles] + + # I need this sorted too often to not do it here. + files = np.sort(files).tolist() + return files + + + def sample_total_columns(self, dat, loc, fields_list): + """ + Sample total_columns of fields_list in WRF output in + self.settings["run_dir"] at the location id_xy in domain, id_t + in all wrfout-times. Files and indices therein are recognized + by id_t and file_time_start_indices. + All quantities needed for computing total columns from profiles + are in dat (kernel, prior, ...). + + Arguments + --------- + dat (:class:`list`) + Result of wrfhelper.read_sampling_coords. Used here: prior, + prior_profile, kernel, psurf, pressure_axis, [, pwf] + If psurf or any of pressure_axis are nan, wrf's own + surface pressure is used and pressure_axis constructed + from this and the number of levels in the averaging kernel. + This allows sampling with synthetic data that don't have + pressure information. This only works with level_def + "layer_average". + If pwf is not present or nan, a simple one is created, for + level_def "layer_average". + loc (:class:`dict`) + A dictionary with all location-related input for sampling, + computed in wrfout_sampler. Keys: + id_xy, domain: Domain coordinates + id_t: Timestep (continous throughout all files) + frac_t: Interpolation coeficient between id_t and id_t+1: + t_obs = frac_t*t[id_t] + (1-frac_t)*t[id_t+1]) + file_start_time_indices: Time index at which a new wrfout + file starts + files: names of wrfout files. + fields_list (:class:`list`) + The fields to sample total columns from. + + Output + ------ + sampled_columns (:class:`array`) + A 2D-array of sampled columns. + Shape: (len(dat["prior"]), len(fields_list)) + """ + + # Initialize output + tc = np.ndarray(shape=(len(dat["prior"]), len(fields_list)), dtype=float) + tc[:] = float("nan") + + # Process by domain + UD = list(set(loc["domain"])) + for dom in UD: + idd = np.nonzero(loc["domain"] == dom)[0] + # Process by id_t + UT = list(set(loc["id_t"][idd])) + for time_id in UT: + # Coordinates to process + idt = idd[np.nonzero(loc["id_t"][idd] == time_id)[0]] + # Get tracer ensemble profiles + profiles = self._read_and_intrp_v(loc, fields_list, time_id, idt) + # List, len=len(fields_list), shape of each: (len(idt),nz) + # Get pressure axis: + #paxis = self.read_and_intrp(wh_names, id_ts, frac_t, id_xy, "P_HYD")/1e2 # Pa -> hPa + psurf = self._read_and_intrp_v(loc, ["PSFC"], time_id, idt)[0]/1.e2 # Pa -> hPa + # Shape: (len(idt),) + ptop = float(self.namelist["domains"]["p_top_requested"])/1.e2 + # Shape: (len(idt),) + znw = self._read_and_intrp_v(loc, ["ZNW"], time_id, idt)[0] + #Shape:(len(idt),nz) + + # DONE reading from file. + # Here it starts to make sense to loop over individual observations + for nidt in range(len(idt)): + nobs = idt[nidt] + # Construct model pressure layer boundaries + pb_mod = self.get_pressure_boundaries_znw(znw[nidt, :], psurf[nidt], ptop) + + if (np.diff(pb_mod) >= 0).any(): + msg = ("Model pressure boundaries for observation %d " + \ + "are not monotonically decreasing! 
Investigate.") % nobs + raise ValueError(msg) + + # Construct retrieval pressure layer boundaries + if dat["level_def"][nobs] == "layer_average": + if np.any(np.isnan(dat["pressure_levels"][nobs])) \ + or np.isnan(dat["psurf"][nobs]): + # Code for synthetic data without a pressure axis, + # but with an averaging kernel: + # Use wrf's surface and top pressure + nlayers = len(dat["averaging_kernel"][nobs]) + pb_ret = np.linspace(psurf[nidt], ptop, nlayers+1) + else: + pb_ret = self.get_pressure_boundaries_paxis( + dat["pressure_levels"][nobs], + dat["psurf"][nobs]) + elif dat["level_def"][nobs] == "pressure_boundary": + if np.any(np.isnan(dat["pressure_levels"][nobs])): + # Code for synthetic data without a pressure axis, + # but with an averaging kernel: + # Use wrf's surface and top pressure + nlevels = len(dat["averaging_kernel"][nobs]) + pb_ret = np.linspace(psurf[nidt], ptop, nlevels) + else: + pb_ret = dat["pressure_levels"][nobs] + + if (np.diff(pb_ret) >= 0).any(): + msg = ("Retrieval pressure boundaries for " + \ + "observation %d are not monotonically " + \ + "decreasing! Investigate.") % nobs + raise ValueError(msg) + + # Get vertical integration coefficients (i.e. to + # "interpolate" from model to retrieval grid) + coef_matrix = self.get_int_coefs(pb_ret, pb_mod, dat["level_def"][nobs]) + + # Model retrieval with averaging kernel and prior profile + if "pressure_weighting_function" in list(dat.keys()): + pwf = dat["pressure_weighting_function"][nobs] + if (not "pressure_weighting_function" in list(dat.keys())) or np.any(np.isnan(pwf)): + # Construct pressure weighting function from + # pressure boundaries + pwf = self.get_pressure_weighting_function(pb_ret, rule="simple") + + # Compute pressure-weighted averaging kernel + avpw = pwf*dat["averaging_kernel"][nobs] + + # Get prior + prior_col = dat["prior"][nobs] + prior_profile = dat["prior_profile"][nobs] + if np.isnan(prior_col): # compute prior + prior_col = np.dot(pwf, prior_profile) + + # Compute total columns + for nf in range(len(fields_list)): + # Integrate model profile + profile_intrp = np.matmul(coef_matrix, profiles[nf][nidt, :]) + + # Model retrieval + tc[nobs, nf] = prior_col + np.dot(avpw, profile_intrp - prior_profile) + + # Test phase: save pb_ret, pb_mod, coef_matrix, + # one profile for manual checking + + # dat_save = dict(pb_ret=pb_ret, + # pb_mod=pb_mod, + # coef_matrix=coef_matrix, + # ens_profile=ens_profiles[0], + # profile_intrp=profile_intrp, + # id=dat.id) + # + #out = open("model_profile_%d.pkl"%dat.id, "w") + #cPickle.dump(dat_save, out, 0) + # Average over footprint + if self.settings["footprint_samples_dim"] > 1: + indices = utilities.get_index_groups(dat["sounding_id"]) + + # Make sure that this is correct: i know the number of indices + lens = [len(group) for group in list(indices.values())] + correct_len = self.settings["footprint_samples_dim"]**2 + if np.any([len_ != correct_len for len_ in set(lens)]): + raise ValueError("Not all footprints have %d samples" %correct_len) + # Ok, paranoid mode, also confirm that the indices are what I + # think they are: consecutive numbers + ranges = [np.ptp(group) for group in list(indices.values())] + if np.any([ptp != correct_len for ptp in set(ranges)]): + raise ValueError("Not all footprints have consecutive samples") + + tc_original = copy.deepcopy(tc) + tc = utilities.apply_by_group(np.average, tc_original, indices) + + return tc + + @staticmethod + def _read_and_intrp_v(loc, fields_list, time_id, idp): + """ + Helper function for sample_total_columns. 
+ read_and_intrp, but vectorized. + Reads in fields and interpolates + them linearly in time. + + Arguments + ---------- + loc (:class:`dict`) + Passed through from sample_total_columns, see there. + fields_list (:class:`list` of :class:`str`) + List of netcdf-variables to process. + time_id (:class:`int`) + Time index referring to all files in loc to read + idp (:class:`array` of :class:`int`) + Indices for id_xy, domain and frac_t in loc (i.e. + observations) to process. + + Output + ------ + List of temporally interpolated fields, one entry per member of + fields_list. + """ + + var_intrp_l = list() + + # Check we were really called with observations for just one domain + domains = set(loc["domain"][idp]) + if len(domains) > 1: + raise ValueError("I can only operate on idp with identical domains.") + dom = domains.pop() + + # Select input files + id_file0 = bisect.bisect_right(loc["file_start_time_indices"][dom], time_id) - 1 + id_file1 = bisect.bisect_right(loc["file_start_time_indices"][dom], time_id+1) - 1 + if id_file0 < 0 or id_file1 < 0: + raise ValueError("This shouldn't happen.") + + # Get time id in file + id_t_file0 = time_id - loc["file_start_time_indices"][dom][id_file0] + id_t_file1 = time_id+1 - loc["file_start_time_indices"][dom][id_file1] + + # Open files + nc0 = nc.Dataset(loc["files"][dom][id_file0], "r") + nc1 = nc.Dataset(loc["files"][dom][id_file1], "r") + # Per field to sample + for field in fields_list: + # Read input file + field0 = wrf.getvar(wrfin=nc0, + varname=field, + timeidx=id_t_file0, + squeeze=False, + meta=False) + + field1 = wrf.getvar(wrfin=nc1, + varname=field, + timeidx=id_t_file1, + squeeze=False, + meta=False) + + if len(field0.shape) == 4: + # Sample field at timesteps before and after observation + # They are ordered nt x nz x ny x nx + # var0 will have shape (len(idp),len(profile)) + var0 = field0[0, :, loc["id_xy"][idp, 1], loc["id_xy"][idp, 0]] + var1 = field1[0, :, loc["id_xy"][idp, 1], loc["id_xy"][idp, 0]] + # Repeat frac_t for profile size + frac_t_ = np.array(loc["frac_t"][idp]).reshape((len(idp), 1)).repeat(var0.shape[1], 1) + elif len(field0.shape) == 3: + # var0 will have shape (len(idp),) + var0 = field0[0, loc["id_xy"][idp, 1], loc["id_xy"][idp, 0]] + var1 = field1[0, loc["id_xy"][idp, 1], loc["id_xy"][idp, 0]] + frac_t_ = np.array(loc["frac_t"][idp]) + elif len(field0.shape) == 2: + # var0 will have shape (len(idp),len(profile)) + # This is for ZNW, which is saved as (time_coordinate, + # vertical_coordinate) + var0 = field0[[0]*len(idp), :] + var1 = field1[[0]*len(idp), :] + frac_t_ = np.array(loc["frac_t"][idp]).reshape((len(idp), 1)).repeat(var0.shape[1], 1) + else: + raise ValueError("Can't deal with field with %d dimensions." % len(field0.shape)) + + # Interpolate in time + var_intrp_l.append(var0*frac_t_ + var1*(1. 
- frac_t_)) + + nc0.close() + nc1.close() + + return var_intrp_l + + @staticmethod + def read_sampling_coords(sampling_coords_file, id0=None, id1=None): + """Read in samples""" + + ncf = nc.Dataset(sampling_coords_file, "r") + if id0 is None: + id0 = 0 + if id1 is None: + id1 = len(ncf.dimensions['soundings']) + + dat = dict( + sounding_id=np.array(ncf.variables["sounding_id"][id0:id1]), + date=ncf.variables["date"][id0:id1], + latitude=np.array(ncf.variables["latitude"][id0:id1]), + longitude=np.array(ncf.variables["longitude"][id0:id1]), + latc_0=np.array(ncf.variables["latc_0"][id0:id1]), + latc_1=np.array(ncf.variables["latc_1"][id0:id1]), + latc_2=np.array(ncf.variables["latc_2"][id0:id1]), + latc_3=np.array(ncf.variables["latc_3"][id0:id1]), + lonc_0=np.array(ncf.variables["lonc_0"][id0:id1]), + lonc_1=np.array(ncf.variables["lonc_1"][id0:id1]), + lonc_2=np.array(ncf.variables["lonc_2"][id0:id1]), + lonc_3=np.array(ncf.variables["lonc_3"][id0:id1]), + prior=np.array(ncf.variables["prior"][id0:id1]), + prior_profile=np.array(ncf.variables["prior_profile"][id0:id1,]), + averaging_kernel=np.array(ncf.variables["averaging_kernel"][id0:id1]), + pressure_levels=np.array(ncf.variables["pressure_levels"][id0:id1]), + pressure_weighting_function=np.array(ncf.variables["pressure_weighting_function"][id0:id1]), + level_def=ncf.variables["level_def"][id0:id1], + psurf=np.array(ncf.variables["psurf"][id0:id1]) + ) + + ncf.close() + + # Convert level_def from it's weird nc format to string + dat["level_def"] = nc.chartostring(dat["level_def"]) + + # Convert date to datetime object + dat["time"] = [dt.datetime(*x) for x in dat["date"]] + + return dat + + @staticmethod + def write_simulated_columns(obs_id, simulated, nmembers, outfile): + """Write simulated observations to file.""" + + # Output format: see obs_xco2_fr + + f = io.CT_CDF(outfile, method="create") + + dimid = f.createDimension("sounding_id", size=None) + dimid = ("sounding_id",) + savedict = io.std_savedict.copy() + savedict["name"] = "sounding_id" + savedict["dtype"] = "int64" + savedict["long_name"] = "Unique_Dataset_observation_index_number" + savedict["units"] = "" + savedict["dims"] = dimid + savedict["comment"] = "Format as in input" + savedict["values"] = obs_id.tolist() + f.add_data(savedict, nsets=0) + + dimmember = f.createDimension("nmembers", size=nmembers) + dimmember = ("nmembers",) + savedict = io.std_savedict.copy() + savedict["name"] = "column_modeled" + savedict["dtype"] = "float" + savedict["long_name"] = "Simulated total column" + savedict["units"] = "??" 
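# Layout of the file being written here: an unlimited "sounding_id" dimension
# and a fixed "nmembers" dimension; "column_modeled" is stored with shape
# (sounding_id, nmembers), i.e. one simulated column per observation and
# ensemble member, next to the copied sounding_id values.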
+ savedict["dims"] = dimid + dimmember + savedict["comment"] = "Simulated model value created by WRFChemOO" + savedict["values"] = simulated.tolist() + f.add_data(savedict, nsets=0) + + f.close() + + @staticmethod + def save_file_with_timestamp(file_path, out_dir, suffix=""): + """ Saves a file to with a timestamp""" + nowstamp = dt.datetime.now().strftime("_%Y-%m-%d_%H:%M:%S") + new_name = os.path.basename(file_path) + suffix + nowstamp + new_path = os.path.join(out_dir, new_name) + shutil.copy2(file_path, new_path) + + +if __name__ == "__main__": + pass diff --git a/da/tools/wrfchem/wrfout_sampler.py b/da/tools/wrfchem/wrfout_sampler.py new file mode 100644 index 00000000..03bc1fae --- /dev/null +++ b/da/tools/wrfchem/wrfout_sampler.py @@ -0,0 +1,246 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Created on Mon Jul 22 15:07:13 2019 + +@author: friedemann +""" + +# Samples wrfchem history files (wrfout*) for CTDAS + +# This is called as external executable from CTDAS +# to allow simple parallelization +# +# Usage: + +# wrfout_sampler.py --arg1 val1 --arg2 val2 ... +# Arguments: See parser in code below + +import os +import sys +#import itertools +#import bisect +import copy +import numpy as np +import netCDF4 as nc + +# Import some CTDAS tools +pd = os.path.pardir +inc_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), + pd, pd, pd) +inc_path = os.path.abspath(inc_path) +sys.path.append(inc_path) +from da.tools.wrfchem.wrfchem_helper import WRFChemHelper +from da.tools.wrfchem.utilities import utilities +import argparse + +########## Parse options +parser = argparse.ArgumentParser() +parser.add_argument("--nproc", type=int, + help="ID of this sampling process (0 ... nprocs-1)") +parser.add_argument("--nprocs", type=int, + help="Number of sampling processes") +parser.add_argument("--sampling_coords_file", type=str, + help="File with sampling coordinates as created " + \ + "by CTDAS column samples object") +parser.add_argument("--run_dir", type=str, + help="Directory with wrfout files") +parser.add_argument("--original_save_suffix", type=str, + help="Just leave this on .original") +parser.add_argument("--nmembers", type=int, + help="Number of tracer ensemble members") +parser.add_argument("--tracer_optim", type=str, + help="Tracer that was optimized (e.g. CO2 for " + \ + "ensemble members CO2_000 etc.)") +parser.add_argument("--outfile_prefix", type=str, + help="One process: output file. More processes: " + \ + "output file is <outfile_prefix>.<nproc>.slice") +parser.add_argument("--footprint_samples_dim", type=int, + help="Sample column footprint at n x n points") + +args = parser.parse_args() +settings = copy.deepcopy(vars(args)) + +# Start (stupid) logging - should be updated +wd = os.getcwd() +try: + os.makedirs("log") +except OSError: # Case when directory already exists. Will look nicer in python 3... + pass + +logfile = os.path.join(wd, "log/wrfout_sampler." 
+ str(settings['nproc']) + ".log") + +os.system("touch " + logfile) +os.system("rm " + logfile) +os.system("echo 'Process " + str(settings['nproc']) + " of " + str(settings['nprocs']) + ": start' >> " + logfile) +os.system("date >> " + logfile) + +########## Initialize wrfhelper +wrfhelper = WRFChemHelper(settings) +wrfhelper.validate_settings(['sampling_coords_file', + 'run_dir', + 'nproc', + 'nprocs', + 'original_save_suffix', # necessary for selecting filename + 'nmembers', # special case 0: sample 'tracer_optim' + 'tracer_optim', + 'outfile_prefix', + 'footprint_samples_dim']) + +cwd = os.getcwd() +os.chdir(wrfhelper.settings['run_dir']) + +#wrfhelper.namelist = wrfhelper.read_namelist(wrfhelper.settings['run_dir']) +wrfhelper.namelist = wrfhelper.read_namelist(".") + + +########## Figure out which samples to process +# Get number of samples +ncf = nc.Dataset(settings['sampling_coords_file'], "r") +nsamples = len(ncf.dimensions['soundings']) +ncf.close() + +id0, id1 = utilities.get_slicing_ids(nsamples, settings['nproc'], settings['nprocs']) + +os.system("echo 'id0=" + str(id0) + "' >> " + logfile) +os.system("echo 'id1=" + str(id1) + "' >> " + logfile) + +########## Read samples from coord file +dat = wrfhelper.read_sampling_coords(settings['sampling_coords_file'], id0, id1) + +os.system("echo 'Data read, len=" + str(len(dat['sounding_id'])) + "' >> " + logfile) + + +########## Locate samples in wrf domains + +# Take care of special case without ensemble +nmembers = settings['nmembers'] +if nmembers == 0: + # Special case: sample 'tracer_optim', don't add member suffix + member_names = [settings['tracer_optim']] + nmembers = 1 +else: + member_names = [settings['tracer_optim'] + "_%03d" % nm for nm in range(nmembers)] + + +# Keep a description of a small wrf file for each domain in memory to +# locate observations. +# This concept is probably obsolete - doesn't save time, and +# locate_domain is parallelized anyway +wrfhelper.open_wrf_location_files() + +if settings["footprint_samples_dim"]==1: + # Locate all observations in space + # This function Wouldn't work for moving nests. + id_xy_f, domain = wrfhelper.locate_domain(dat['latitude'], dat['longitude']) + # Assume box averages and don't interpolate horizontally + id_xy = np.round(id_xy_f).astype(int) +else: + # Return the whole thing (needed in wrfhelper.sample_total_columns) + raise NotImplementedError("To do: wrfhelper.get_footprint_sampling_points") + dat_fs = wrfhelper.get_footprint_sampling_points(dat) + # Locate (free) + id_xy_f_free, domain_free = wrfhelper.locate_domain(dat_fs['latitude'], dat_fs['longitude']) + id_xy_free = np.round(id_xy_f_free).astype(int) + # Determine max domain + domain_fs = None + raise NotImplementedError("To do: domain_fs") + # Sample again with domain restriction - no need to return it again + id_xy_f, _ = wrfhelper.locate_domain(dat_fs['latitude'], dat_fs['longitude'], domain_fs) + id_xy = np.round(id_xy_f).astype(int) + # Thin out domain_fs to pass it to determination of id_t and frac_t below + domain = domain_fs[::settings["footprint_samples_dim"]] + +wrfhelper.close_wrf_location_files() + +# Locate in time: Which file, time index, and temporal interpolation +# factor. +# MAYBE make this a function. See which quantities I need later. +id_t = np.zeros_like(domain) +frac_t = np.ndarray(id_t.shape, float) +frac_t[:] = float("nan") +# Add a little flexibility by doing this per domain - namelists allow +# different output frequencies per domain. 
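# A minimal sketch of the time bracketing and interpolation weight computed in
# the loop below (and used in WRFChemHelper._read_and_intrp_v): frac_t is the
# weight of the *earlier* output time, so value(t_obs) = frac_t*value(t0) +
# (1 - frac_t)*value(t1). The timestamps here are hypothetical.

import bisect
import datetime as dt

wrfout_times_example = [dt.datetime(2019, 7, 22, h) for h in range(0, 7)]  # hourly output
t_obs = dt.datetime(2019, 7, 22, 2, 45)

i = bisect.bisect_right(wrfout_times_example, t_obs) - 1   # index of t0 <= t_obs
t0, t1 = wrfout_times_example[i], wrfout_times_example[i + 1]
frac_t = (t1 - t_obs).total_seconds() / (t1 - t0).total_seconds()
print(i, frac_t)   # 2 0.25 -> 25% weight on the 02:00 field, 75% on 03:00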
+wrfout_files = dict() +wrfout_times = dict() +wrfout_start_time_ids = dict() + +UD = list(set(domain)) +for dom in UD: + os.system("echo 'Processing domain " + str(dom) + "' >> " + logfile) + idd = np.where(domain == dom)[0] + # Get full time vector + wrfout_files[dom] = wrfhelper.get_wrf_filenames("wrfout_d%02d_*00" % dom) + wrfout_times[dom], wrfout_start_time_ids[dom] = wrfhelper.wrf_times(wrfout_files[dom]) + + # time id + for idd_ in idd: + # Look where it sorts in + tmp = [i + for i in range(len(wrfout_times[dom])-1) + if wrfout_times[dom][i] <= dat['time'][idd_] \ + and dat['time'][idd_] < wrfout_times[dom][i+1]] + # Catch the case that the observation took place exactly at the + # last timestep + if len(tmp) == 1: + id_t[idd_] = tmp[0] + time0 = wrfout_times[dom][id_t[idd_]] + time1 = wrfout_times[dom][id_t[idd_]+1] + frac_t[idd_] = (time1 - dat['time'][idd_]).total_seconds() / (time1 - time0).total_seconds() + else: # len must be 0 in this case + if len(tmp) > 1: + os.system("echo 'wat' >> " + logfile) + raise ValueError("wat") + if dat['time'][idd_] == wrfout_times[dom][-1]: + id_t[idd_] = len(wrfout_times[dom])-1 + frac_t[idd_] = 1 + else: + msg = "Sample %d, sounding_id %s: outside of simulated time."%(idd_, dat['sounding_id'][idd_]) + os.system("echo '" + msg + "' >> " + logfile) + raise ValueError(msg) + + +# Now read the data +# Input: id_xy, dom, id_t, wrfout_start_time_ids, fract_t +# Output: sampled columns +# All input related to location: +if settings["footprint_samples_dim"]>1: + domain = domain_fs + id_t = np.repeat(id_t, settings["footprint_samples_dim"]) + frac_t = np.repeat(frac_t, settings["footprint_samples_dim"]) + +loc_input = dict(id_xy=id_xy, domain=domain, + id_t=id_t, frac_t=frac_t, + files=wrfout_files, file_start_time_indices=wrfout_start_time_ids) + +ens_sim = wrfhelper.sample_total_columns(dat, loc_input, member_names) + +# Write results to file +obs_ids = dat['sounding_id'] +# Remove simulations that are nan (=not in domain) +if ens_sim.shape[0] > 0: + valid = np.apply_along_axis(lambda arr: not np.any(np.isnan(arr)), 1, ens_sim) + obs_ids_write = obs_ids[valid] + ens_sim_write = ens_sim[valid, :] +else: + obs_ids_write = obs_ids + ens_sim_write = ens_sim + +if settings['nprocs'] == 1: + outfile = settings['outfile_prefix'] +else: + # Create output files with the appendix ".<nproc>.slice" + # Format <nproc> so that they can later be easily sorted. + len_nproc = int(np.floor(np.log10(settings['nprocs']))) + 1 + outfile = settings['outfile_prefix'] + (".%0" + str(len_nproc) + "d.slice") % settings['nproc'] + +os.system("echo 'Writing output file '" + os.path.join(wrfhelper.settings['run_dir'], outfile) + " >> " + logfile) + +wrfhelper.write_simulated_columns(obs_id=obs_ids_write, + simulated=ens_sim_write, + nmembers=nmembers, + outfile=outfile) + +os.chdir(cwd) + +os.system("echo 'Done' >> " + logfile) -- GitLab
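# A minimal sketch of the per-process output naming used in wrfout_sampler.py
# above, and of how the resulting slice files are meant to be combined again
# with utilities.cat_ncfiles (which wraps ncrcat). The prefix "samples.nc" and
# the directory variable run_dir are hypothetical.

import numpy as np

nprocs = 16
prefix = "samples.nc"
len_nproc = int(np.floor(np.log10(nprocs))) + 1     # digits needed for nproc
slice_files = [prefix + (".%0" + str(len_nproc) + "d.slice") % n for n in range(nprocs)]
print(slice_files[0], slice_files[-1])   # samples.nc.00.slice samples.nc.15.slice

# The slices would then be concatenated along the unlimited dimension, e.g.:
# utilities.cat_ncfiles(path=run_dir, in_arg=slice_files, cat_dim="sounding_id",
#                       out_file=prefix, in_pattern=False)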