diff --git a/da/observations/obs_column_wrfchem.py b/da/observations/obs_column_wrfchem.py
new file mode 100644
index 0000000000000000000000000000000000000000..38fea18ba1ea85e96bbc28dec10fde4b8bd9fa42
--- /dev/null
+++ b/da/observations/obs_column_wrfchem.py
@@ -0,0 +1,566 @@
+"""CarbonTracker Data Assimilation Shell (CTDAS) Copyright (C) 2017 Wouter Peters.
+Users are recommended to contact the developers (wouter.peters@wur.nl) to receive
+updates of the code. See also: http://www.carbontracker.eu.
+
+This program is free software: you can redistribute it and/or modify it under the
+terms of the GNU General Public License as published by the Free Software Foundation,
+version 3. This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with this
+program. If not, see <http://www.gnu.org/licenses/>."""
+#!/usr/bin/env python
+# obs.py
+
+"""
+Author : liesbeth, freum
+
+Revision History:
+File created June 2019.
+
+On this version (freum):
+- Handles column observations. Based on obs_column_xco2.py from mid-2019.
+- Since then, both versions have developed further. Since this also affects the file
+  format, I leave it as is for now for backwards compatibility for users of the
+  original python 2 code.
+
+Most important revisions in this file from original:
+- Added pressure weighting function
+- Added optional footprint corner coordinates to improve sampling of models with
+  resolutions finer than the satellite footprint. This functionality is not validated.
+
+Changes to (original) obs_column_xco2.py not yet implemented here as of 2021-07-10:
+- def __init__: New arguments h=0.0 (pressure weighting function), moderr=0.0, loc_L=0
+- Update to model-data-mismatch approach
+- Approach to localization implemented here
+- def add_observations(self, advance=False): new argument "advance"
+- dimlayers
+- self.additional_variables
+
+"""
+import os
+import sys
+import logging
+
+import datetime as dtm
+import numpy as np
+#from string import strip
+from numpy import array, logical_and, sqrt
+sys.path.append(os.getcwd())
+sys.path.append('../../')
+
+identifier = 'CarbonTracker total column-averaged mole fractions'
+version = '0.0'
+
+from da.observations.obs_baseclass import Observations
+import da.tools.io4 as io
+import da.tools.rc as rc
+
+
+################### Begin Class TotalColumnSample ###################
+
+class TotalColumnSample(object):
+    """
+    Holds the data that defines a total column sample in the data assimilation framework. So far, this includes all
+    attributes listed below in the __init__ method. One can add more data types, or create new
+    objects for specific projects.
+    This file may contain OCO-2 specific parts...
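+
+    A minimal construction sketch (all values below are hypothetical and only
+    illustrate the call signature of __init__; in practice samples are created
+    by TotalColumnObservations.add_observations):
+
+        sample = TotalColumnSample(idx=2019060112345678, codex="OCO-2_v9",
+                                   xdate=dtm.datetime(2019, 6, 1, 12, 30),
+                                   obs=410.5, lat=52.0, lon=4.9, mdm=0.8)
+        sample.simulated = 411.1   # filled later via add_simulations()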
+ """ + + def __init__(self, idx, codex, xdate, obs=0.0, simulated=0.0, lat=-999., lon=-999., mdm=None, prior=0.0, prior_profile=0.0, av_kernel=0.0, pressure=0.0, \ + ##### freum vvvv + pressure_weighting_function=None, + ##### freum ^^^^ + level_def = "pressure_boundary", psurf = float('nan'), resid=0.0, hphr=0.0, flag=0, species='co2', sdev=0.0, \ + ##### freum vvvv + latc_0=None, latc_1=None, latc_2=None, latc_3=None, lonc_0=None, lonc_1=None, lonc_2=None, lonc_3=None \ + ##### freum ^^^^ + ): + self.id = idx # Sounding ID + self.code = codex # Retrieval ID + self.xdate = xdate # Date of obs + self.obs = obs # Value observed + self.simulated = simulated # Value simulated by model, fillvalue = -9999 + self.lat = lat # Sample lat + self.lon = lon # Sample lon + ##### freum vvvv + self.latc_0 = latc_0 # Sample latitude corner + self.latc_1 = latc_1 # Sample latitude corner + self.latc_2 = latc_2 # Sample latitude corner + self.latc_3 = latc_3 # Sample latitude corner + self.lonc_0 = lonc_0 # Sample longitude corner + self.lonc_1 = lonc_1 # Sample longitude corner + self.lonc_2 = lonc_2 # Sample longitude corner + self.lonc_3 = lonc_3 # Sample longitude corner + ##### freum ^^^^ + self.mdm = mdm # Model data mismatch + self.prior = prior # A priori column value used in retrieval + self.prior_profile = prior_profile # A priori profile used in retrieval + self.av_kernel = av_kernel # Averaging kernel + self.pressure = pressure # Pressure levels of retrieval + # freum vvvv + self.pressure_weighting_function = pressure_weighting_function # Pressure weighting function + # freum ^^^^ + self.level_def = level_def # Are prior and averaging kernel defined as layer averages? + self.psurf = psurf # Surface pressure (only needed if level_def is "layer_average") + self.loc_L = int(0) # freum 2021-07-13: insert this dummy value so the code runs with the current version of CTDAS. *Should* not affect results if localizetype == "CT2007" as in all my runs. 
However, replace this file with the standard observation file, obs_column_xco2.py + + self.resid = resid # Mole fraction residuals + self.hphr = hphr # Mole fraction prior uncertainty from fluxes and (HPH) and model data mismatch (R) + self.may_localize = True # Whether sample may be localized in optimizer + self.may_reject = True # Whether sample may be rejected if outside threshold + self.flag = flag # Flag + self.sdev = sdev # standard deviation of ensemble + self.species = species.strip() + + +################### End Class TotalColumnSample ################### + + +################### Begin Class TotalColumnObservations ################### + +class TotalColumnObservations(Observations): + """ An object that holds data + methods and attributes needed to manipulate column samples + """ + + def setup(self, dacycle): + + self.startdate = dacycle['time.sample.start'] + self.enddate = dacycle['time.sample.end'] + + # Path to the input data (daily files) + sat_files = dacycle.dasystem['obs.column.ncfile'].split(',') + sat_dirs = dacycle.dasystem['obs.column.input.dir'].split(',') + + self.sat_dirs = [] + self.sat_files = [] + for i in range(len(sat_dirs)): + if not os.path.exists(sat_dirs[i].strip()): + msg = 'Could not find the required satellite input directory (%s) ' % sat_dirs[i] + logging.error(msg) + raise IOError(msg) + else: + self.sat_dirs.append(sat_dirs[i].strip()) + self.sat_files.append(sat_files[i].strip()) + del i + + # Get observation selection criteria (if present): + if 'obs.column.selection.variables' in dacycle.dasystem.keys() and 'obs.column.selection.criteria' in dacycle.dasystem.keys(): + self.selection_vars = dacycle.dasystem['obs.column.selection.variables'].split(',') + self.selection_criteria = dacycle.dasystem['obs.column.selection.criteria'].split(',') + logging.debug('Data selection criteria found: %s, %s' %(self.selection_vars, self.selection_criteria)) + else: + self.selection_vars = [] + self.selection_criteria = [] + logging.info('No data observation selection criteria found, using all observations in file.') + + # Model data mismatch approach + self.mdm_calculation = dacycle.dasystem.get('mdm.calculation') + logging.debug('mdm.calculation = %s' %self.mdm_calculation) + if not self.mdm_calculation in ['parametrization','empirical','no_transport_error']: + logging.warning('No valid model data mismatch method found. Valid options are \'parametrization\' and \'empirical\'. ' + \ + 'Using a constant estimate for the model uncertainty of 1ppm everywhere.') + else: + logging.info('Model data mismatch approach = %s' %self.mdm_calculation) + + # Path to file with observation error settings for column observations + # Currently the same settings for all assimilated retrieval products: should this be one file per product? 
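+        # For orientation, a sketch of the dasystem (system rc) entries that
+        # setup() and add_observations() rely on; the values shown here are
+        # hypothetical examples, not defaults:
+        #   obs.column.input.dir           : /path/to/column/obs
+        #   obs.column.ncfile              : oco2_<YYYYMMDD>.nc
+        #   obs.column.selection.variables : quality_flag
+        #   obs.column.selection.criteria  : ==0
+        #   mdm.calculation                : parametrization
+        #   obs.column.rc                  : observation_errors_column.rc
+        #   sample.in.ctdas                : False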
+ if not os.path.exists(dacycle.dasystem['obs.column.rc']): + msg = 'Could not find the required column observation .rc input file (%s) ' % dacycle.dasystem['obs.column.rc'] + logging.error(msg) + raise IOError(msg) + else: + self.obs_file = (dacycle.dasystem['obs.column.rc']) + + self.datalist = [] + + # Switch to indicate whether simulated column samples are read from obsOperator output, + # or whether the sampling is done within CTDAS (in obsOperator class) + self.sample_in_ctdas = dacycle.dasystem['sample.in.ctdas'] if 'sample.in.ctdas' in dacycle.dasystem.keys() else False + logging.debug('sample.in.ctdas = %s' % self.sample_in_ctdas) + + + + def get_samples_type(self): + return 'column' + + + + def add_observations(self): + """ Reading of total column observations, and selection of observations that will be sampled and assimilated. + + """ + + # Read observations from daily input files + for i in range(len(self.sat_dirs)): + + logging.info('Reading observations from %s' %os.path.join(self.sat_dirs[i],self.sat_files[i])) + + infile0 = os.path.join(self.sat_dirs[i], self.sat_files[i]) + ndays = 0 + + while self.startdate+dtm.timedelta(days=ndays) <= self.enddate: + + infile = infile0.replace("<YYYYMMDD>",(self.startdate+dtm.timedelta(days=ndays)).strftime("%Y%m%d")) + + if os.path.exists(infile): + + logging.info("Reading observations for %s" % (self.startdate+dtm.timedelta(days=ndays)).strftime("%Y%m%d")) + len_init = len(self.datalist) + + # get index of observations that satisfy selection criteria (based on variable names and values in system rc file, if present) + ncf = io.ct_read(infile, 'read') + if self.selection_vars: + selvars = [] + for j in self.selection_vars: + selvars.append(ncf.get_variable(j.strip())) + del j + criteria = [] + for j in range(len(self.selection_vars)): + criteria.append(eval('selvars[j]'+self.selection_criteria[j])) + del j + #criteria = [eval('selvars[i]'+self.selection_criteria[i]) for i in range(len(self.selection_vars))] + subselect = np.logical_and.reduce(criteria).nonzero()[0] + else: + subselect = np.arange(ncf.get_variable('sounding_id').size) + + # retrieval attributes + code = ncf.get_attribute('retrieval_id') + level_def = ncf.get_attribute('level_def') + + # only read good quality observations + ids = ncf.get_variable('sounding_id').take(subselect, axis=0) + lats = ncf.get_variable('latitude').take(subselect, axis=0) + lons = ncf.get_variable('longitude').take(subselect, axis=0) + obs = ncf.get_variable('obs').take(subselect, axis=0) + unc = ncf.get_variable('uncertainty').take(subselect, axis=0) + dates = ncf.get_variable('date').take(subselect, axis=0) + dates = array([dtm.datetime(*d) for d in dates]) + av_kernel = ncf.get_variable('averaging_kernel').take(subselect, axis=0) + prior_profile = ncf.get_variable('prior_profile').take(subselect, axis=0) + pressure = ncf.get_variable('pressure_levels').take(subselect, axis=0) + + prior = ncf.get_variable('prior').take(subselect, axis=0) + + ##### freum vvvv + pwf = ncf.get_variable('pressure_weighting_function').take(subselect, axis=0) + + # Additional variable surface pressure in case the profiles are defined as layer averages + if level_def == "layer_average": + psurf = ncf.get_variable('surface_pressure').take(subselect, axis=0) + else: + psurf = [float('nan')]*len(ids) + + # Optional: footprint corners + latc = dict( + latc_0=[float('nan')]*len(ids), + latc_1=[float('nan')]*len(ids), + latc_2=[float('nan')]*len(ids), + latc_3=[float('nan')]*len(ids)) + lonc = dict( + 
lonc_0=[float('nan')]*len(ids), + lonc_1=[float('nan')]*len(ids), + lonc_2=[float('nan')]*len(ids), + lonc_3=[float('nan')]*len(ids)) + # If one footprint corner variable is there, assume + # all are there. That's the only case that makes sense + if 'latc_0' in list(ncf.variables.keys()): + latc['latc_0'] = ncf.get_variable('latc_0').take(subselect, axis=0) + latc['latc_1'] = ncf.get_variable('latc_1').take(subselect, axis=0) + latc['latc_2'] = ncf.get_variable('latc_2').take(subselect, axis=0) + latc['latc_3'] = ncf.get_variable('latc_3').take(subselect, axis=0) + lonc['lonc_0'] = ncf.get_variable('lonc_0').take(subselect, axis=0) + lonc['lonc_1'] = ncf.get_variable('lonc_1').take(subselect, axis=0) + lonc['lonc_2'] = ncf.get_variable('lonc_2').take(subselect, axis=0) + lonc['lonc_3'] = ncf.get_variable('lonc_3').take(subselect, axis=0) + ###### freum ^^^^ + + ncf.close() + + # Add samples to datalist + # Note that the mdm is initialized here equal to the measurement uncertainty. This value is used in add_model_data_mismatch to calculate the mdm including model error + for n in range(len(ids)): + # Check for every sounding if time is between start and end time (relevant for first and last days of window) + if self.startdate <= dates[n] <= self.enddate: + self.datalist.append(TotalColumnSample(ids[n], code, dates[n], obs[n], None, lats[n], lons[n], unc[n], prior[n], prior_profile[n,:], \ + av_kernel=av_kernel[n,:], pressure=pressure[n,:], pressure_weighting_function=pwf[n,:],level_def=level_def,psurf=psurf[n], + ##### freum vvvv + latc_0=latc['latc_0'][n], latc_1=latc['latc_1'][n], latc_2=latc['latc_2'][n], latc_3=latc['latc_3'][n], + lonc_0=lonc['lonc_0'][n], lonc_1=lonc['lonc_1'][n], lonc_2=lonc['lonc_2'][n], lonc_3=lonc['lonc_3'][n] + ##### freum ^^^^ + )) + + logging.debug("Added %d observations to the Data list" % (len(self.datalist)-len_init)) + + ndays += 1 + + del i + + if len(self.datalist) > 0: + logging.info("Observations list now holds %d values" % len(self.datalist)) + else: + logging.info("No observations found for sampling window") + + + + def add_model_data_mismatch(self, filename=None, advance=False): + """ This function is empty: model data mismatch calculation is done during sampling in observation operator (TM5) to enhance computational efficiency + (i.e. to prevent reading all soundings twice and writing large additional files) + + """ + obs_data = rc.read(self.obs_file) + self.rejection_threshold = int(obs_data['obs.rejection.threshold']) + + # At this point mdm is set to the measurement uncertainty only, added in the add_observations function. + # Here this value is used to set the combined mdm by adding an estimate for the model uncertainty as a sum of squares. + if len(self.datalist) <= 1: return #== 0: return + for obs in self.datalist: + # parametrization as used by Frederic Chevallier + if self.mdm_calculation == 'parametrization': + obs.mdm = ( obs.mdm*obs.mdm + (0.8*np.exp((90.0+obs.lat)/300.0))**2 )**0.5 + # empirical approach of Andy Jacobson, TO BE IMPLEMENTED + # elif self.mdm_calculation == 'empirical': + # obs.mdm = ... 
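+            # For reference, the 'parametrization' branch above adds a model
+            # error term of 0.8*exp((90+lat)/300) ppm in quadrature: roughly
+            # 0.8 ppm at the South Pole, 1.08 ppm at the equator and 1.46 ppm
+            # at the North Pole (approximate values), i.e. high northern
+            # latitudes are deweighted.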
+ elif self.mdm_calculation == 'no_transport_error': + pass + else: # assume general model uncertainty of 1 ppm (arbitrary value) + obs.mdm = ( obs.mdm*obs.mdm + 1**2 )**0.5 + del obs + + meanmdm = np.average(np.array( [obs.mdm for obs in self.datalist] )) + logging.debug('Mean MDM = %s' %meanmdm) + + + + def add_simulations(self, filename, silent=False): + """ Adds observed and model simulated column values to the mole fraction objects + This function includes the add_observations and add_model_data_mismatch functionality for the sake of computational efficiency + + """ + + if self.sample_in_ctdas: + logging.debug("CODE TO ADD SIMULATED SAMPLES TO DATALIST TO BE ADDED") + + else: + # read simulated samples from file + if not os.path.exists(filename): + msg = "Sample output filename for observations could not be found : %s" % filename + logging.error(msg) + logging.error("Did the sampling step succeed?") + logging.error("...exiting") + raise IOError(msg) + + ncf = io.ct_read(filename, method='read') + ids = ncf.get_variable('sounding_id') + simulated = ncf.get_variable('column_modeled') + ncf.close() + logging.info("Successfully read data from model sample file (%s)" % filename) + + obs_ids = self.getvalues('id').tolist() + + missing_samples = [] + + # Match read simulated samples with observations in datalist + logging.info("Adding %i simulated samples to the data list..." % len(ids)) + for i in range(len(ids)): + # Assume samples are in same order in both datalist and file with simulated samples... + if ids[i] == obs_ids[i]: + self.datalist[i].simulated = simulated[i] + # If not, find index of current sample + elif ids[i] in obs_ids: + index = obs_ids.index(ids[i]) + # Only add simulated value to datalist if sample has not been filled before. Otherwise: exiting + if self.datalist[index].simulated is not None: + msg = 'Simulated and observed samples not in same order, and duplicate sample IDs found.' + logging.error(msg) + raise IOError(msg) + else: + self.datalist[index].simulated = simulated[i] + else: + logging.debug('added %s to missing_samples, obs id = %s' %(ids[i],obs_ids[i])) + missing_samples.append(ids[i]) + del i + + if not silent and missing_samples != []: + logging.warning('%i Model samples were found that did not match any ID in the observation list. Skipping them...' % len(missing_samples)) + + # if number of simulated samples < observations: remove observations without samples + if len(simulated) < len(self.datalist): + test = len(self.datalist) - len(simulated) + logging.warning('%i Observations were not sampled, removing them from datalist...' % test) + for index in reversed(list(range(len(self.datalist)))): + if self.datalist[index].simulated is None: + del self.datalist[index] + del index + + logging.debug("%d simulated values were added to the data list" % (len(ids) - len(missing_samples))) + + + + def write_sample_coords(self, obsinputfile): + """ + Write empty sample_coords_file if soundings are present in time interval, just such that general pipeline code does not have to be changed... 
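+
+        Sketch of the netCDF layout written below (the savedict calls further
+        down are the authoritative list):
+
+            dimensions: soundings, epoch_dimension(7), levels, char(20)
+            variables : sounding_id, date, latitude, longitude, prior,
+                        prior_profile, averaging_kernel, pressure_levels,
+                        pressure_weighting_function, latc_0..latc_3,
+                        lonc_0..lonc_3, level_def, psurf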
+ """ + + if self.sample_in_ctdas: + return + + if len(self.datalist) <= 1: #== 0: + logging.info("No observations found for this time period, no obs file written") + return + + # write data required by observation operator for sampling to file + f = io.CT_CDF(obsinputfile, method='create') + logging.debug('Creating new observations file for ObservationOperator (%s)' % obsinputfile) + + dimsoundings = f.add_dim('soundings', len(self.datalist)) + dimdate = f.add_dim('epoch_dimension', 7) + dimchar = f.add_dim('char', 20) + if len(self.datalist) == 1: + dimlevels = f.add_dim('levels', len(self.getvalues('pressure'))) + # freum: inserted but commented Liesbeth's new code for layers for reference, + # but I handle them differently. + # if len(self.getvalues('av_kernel')) != len(self.getvalues('pressure')): + # dimlayers = f.add_dim('layers',len(self.getvalues('av_kernel'))) + # layers = True + # else: layers = False + else: + dimlevels = f.add_dim('levels', self.getvalues('pressure').shape[1]) + # if self.getvalues('av_kernel').shape[1] != self.getvalues('pressure').shape[1]: + # dimlayers = f.add_dim('layers', self.getvalues('pressure').shape[1] - 1) + # layers = True + # else: layers = False + + savedict = io.std_savedict.copy() + savedict['dtype'] = "int64" + savedict['name'] = "sounding_id" + savedict['dims'] = dimsoundings + savedict['values'] = self.getvalues('id').tolist() + f.add_data(savedict) + + data = [[d.year, d.month, d.day, d.hour, d.minute, d.second, d.microsecond] for d in self.getvalues('xdate') ] + savedict = io.std_savedict.copy() + savedict['dtype'] = "int" + savedict['name'] = "date" + savedict['dims'] = dimsoundings + dimdate + savedict['values'] = data + f.add_data(savedict) + + savedict = io.std_savedict.copy() + savedict['name'] = "latitude" + savedict['dims'] = dimsoundings + savedict['values'] = self.getvalues('lat').tolist() + f.add_data(savedict) + + savedict = io.std_savedict.copy() + savedict['name'] = "longitude" + savedict['dims'] = dimsoundings + savedict['values'] = self.getvalues('lon').tolist() + f.add_data(savedict) + + savedict = io.std_savedict.copy() + savedict['name'] = "prior" + savedict['dims'] = dimsoundings + savedict['values'] = self.getvalues('prior').tolist() + f.add_data(savedict) + + savedict = io.std_savedict.copy() + savedict['name'] = "prior_profile" + savedict['dims'] = dimsoundings + dimlevels + savedict['values'] = self.getvalues('prior_profile').tolist() + f.add_data(savedict) + + savedict = io.std_savedict.copy() + savedict['name'] = "averaging_kernel" + savedict['dims'] = dimsoundings + dimlevels + savedict['values'] = self.getvalues('av_kernel').tolist() + f.add_data(savedict) + + savedict = io.std_savedict.copy() + savedict['name'] = "pressure_levels" + savedict['dims'] = dimsoundings + dimlevels + savedict['values'] = self.getvalues('pressure').tolist() + f.add_data(savedict) + + # freum vvvv + savedict = io.std_savedict.copy() + savedict['name'] = "pressure_weighting_function" + savedict['dims'] = dimsoundings + dimlevels + savedict['values'] = self.getvalues('pressure_weighting_function').tolist() + f.add_data(savedict) + + savedict = io.std_savedict.copy() + savedict['name'] = "latc_0" + savedict['dims'] = dimsoundings + savedict['values'] = self.getvalues('latc_0').tolist() + f.add_data(savedict) + + savedict = io.std_savedict.copy() + savedict['name'] = "latc_1" + savedict['dims'] = dimsoundings + savedict['values'] = self.getvalues('latc_1').tolist() + f.add_data(savedict) + + savedict = io.std_savedict.copy() + 
savedict['name'] = "latc_2" + savedict['dims'] = dimsoundings + savedict['values'] = self.getvalues('latc_2').tolist() + f.add_data(savedict) + + savedict = io.std_savedict.copy() + savedict['name'] = "latc_3" + savedict['dims'] = dimsoundings + savedict['values'] = self.getvalues('latc_3').tolist() + f.add_data(savedict) + + savedict = io.std_savedict.copy() + savedict['name'] = "lonc_0" + savedict['dims'] = dimsoundings + savedict['values'] = self.getvalues('lonc_0').tolist() + f.add_data(savedict) + + savedict = io.std_savedict.copy() + savedict['name'] = "lonc_1" + savedict['dims'] = dimsoundings + savedict['values'] = self.getvalues('lonc_1').tolist() + f.add_data(savedict) + + savedict = io.std_savedict.copy() + savedict['name'] = "lonc_2" + savedict['dims'] = dimsoundings + savedict['values'] = self.getvalues('lonc_2').tolist() + f.add_data(savedict) + + savedict = io.std_savedict.copy() + savedict['name'] = "lonc_3" + savedict['dims'] = dimsoundings + savedict['values'] = self.getvalues('lonc_3').tolist() + f.add_data(savedict) + + # freum ^^^^ + + savedict = io.std_savedict.copy() + savedict['dtype'] = "char" + savedict['name'] = "level_def" + savedict['dims'] = dimsoundings + dimchar + savedict['values'] = self.getvalues('level_def').tolist() + f.add_data(savedict) + + savedict = io.std_savedict.copy() + savedict['name'] = "psurf" + savedict['dims'] = dimsoundings + savedict['values'] = self.getvalues('psurf').tolist() + f.add_data(savedict) + + f.close() + + + + + +################### End Class TotalColumnObservations ################### + + +if __name__ == "__main__": + pass diff --git a/da/obsoperators/observationoperator_wrfchem.py b/da/obsoperators/observationoperator_wrfchem.py new file mode 100755 index 0000000000000000000000000000000000000000..a2e9bc2fe9ee0ab453d0fd8cea3d644d50394ae1 --- /dev/null +++ b/da/obsoperators/observationoperator_wrfchem.py @@ -0,0 +1,1400 @@ +"""CarbonTracker Data Assimilation Shell (CTDAS) Copyright (C) 2017 Wouter Peters. +Users are recommended to contact the developers (wouter.peters@wur.nl) to receive +updates of the code. See also: http://www.carbontracker.eu. + +This program is free software: you can redistribute it and/or modify it under the +terms of the GNU General Public License as published by the Free Software Foundation, +version 3. This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with this +program. If not, see <http://www.gnu.org/licenses/>. + +Author : Friedemann Reum + +Revision History: +File created on 26 March 2019. + +This module holds functions needed to use the WRF-Chem model within CTDAS. It +is based on the TM5 observation operator by Wouter Peters. + +The input is an rc file that defines a source directory and a run directory. +The source directory is a directory in which wrf.exe can be called +(e.g. WRFV3/run/). I.e., it assumes that WRF-Chem was compiled, all +preprocessing was done and the outputs linked to the run +directory, and the correct namelist.input is there. namelist.input will +be modified in the run directory (e.g. start- and end times adapted to +each cycle). +""" + +# Instructions for pylint (e.g. 
for code analysis in Spyder): +# pylint: disable=too-many-instance-attributes +# pylint: disable=W0201 +# pylint: disable=C0301 + +# Python libraries +import os +import logging +import shutil +import subprocess +import glob +import re +import datetime as dtm +import numpy as np +import netCDF4 as nc +import f90nml + +# CTDAS modules +import da.tools.rc as rc +from da.tools.general import create_dirs +from da.obsoperators.observationoperator_baseclass import ObservationOperator +from da.tools.wrfchem.wrfchem_helper import WRFChemHelper +from da.tools.wrfchem.utilities import utilities + +identifier = "wrfchem" +version = "1.0" + +class WRFChemOO(ObservationOperator): + """This class holds methods and variables that are needed to run the WRF-Chem model.""" + + ########## Public methods + + def __init__(self, rc_filename, dacycle=None): + """Initialize WRFChemOO using settings in rc_filename.""" + + # Identify yourself + self.ID = identifier + self.version = version + + # Load settings + self._load_rc(rc_filename) + self._validate_rc() + + # A hardcoded setting (doesnt make sense to allow user to change + # it) + self.settings["original_save_suffix"] = ".original" + + # Instantiate a WRFChemHelper object + self.wrfhelper = WRFChemHelper(self.settings) + self.wrfhelper.validate_settings(["run_dir"]) + + logging.info("Observation Operator initialized: %s (%s)", + self.ID, self.version) + + + def setup(self, dacycle): + """Prepare the observation operator for running. + + This includes reading settings and preparing the run directory. + """ + + # Attach the CycleControl object (used for exchanging + # information between classes) + self.dacycle = dacycle + + # Infer whether initial and boundary conditions will be + # optimized. Only a constant offset can be optimized, + # hence the name. + self.dacycle.dasystem["do_offset"] = \ + "sigma_offset" in list(self.dacycle.dasystem.keys()) + + # Set up the directory where WRF is to be run. + # Only do this if you are the first data assimilation cycle. The + # WRF run directory is reused in subsequent runs to save both + # space and time. + if self.dacycle["time.restart"] is False: + logging.info("This is the first data assimilation cycle.") + + # Not overwriting existing files, so that if a run is + # started again (e.g. when it fails), the existing files can + # be used. This makes sense because setting up the directory + # can take a long time because WRF input files for ensemble + # runs are large. + overwrite = False + + self._create_fresh_run_dir(overwrite) + + logging.info("Fresh run dir for WRFChem created: %s", + self.settings["run_dir"]) + + # Read WRF namelist (= configuration file) + self.namelist = self.wrfhelper.read_namelist(self.settings["run_dir"]) + + # Also tell wrfhelper about the namelist. + self.wrfhelper.namelist = self.namelist + + # Some setup tasks only for the first data assimilation cycle. + if self.dacycle["time.restart"] is False: + # Prepare fluxes files for modification by optimizer + self._prepare_flux_files(overwrite) + + # In case initial/boundary conditions are optimized, save + # original wrfinput and wrfbdy files. They serve as + # sources for prior ic/bc. 
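+            # (do_offset was set above from the presence of a "sigma_offset"
+            # key in the system rc file, e.g. a hypothetical entry
+            # "sigma_offset : 0.5"; only in that case are the unmodified
+            # ic/bc files needed as the prior source for the offset.)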
+ if self.dacycle.dasystem["do_offset"]: + wrfinput = self.wrfhelper.get_wrf_filenames("wrfinput_d*") + wrfbdy = self.wrfhelper.get_wrf_filenames("wrfbdy_d01") + self._save_original_files(wrfinput + wrfbdy, overwrite) + logging.info("Original initial- and boundary " + \ + "conditions files saved.") + + logging.info("Setup done.") + + + def get_initial_data(self): + """Nothing to do, all preparations moved to _prepare_run""" + pass + + + def run_forecast_model(self, samples=None): + """Run the transport model and sample simulated observations""" + + self._prepare_run(samples) + self._validate_input() + self._run() + self._sample_simulated_obs(samples) + self._save_data() + + + def write_members_to_file(self, lag, outdir, statevector):#,endswith=".nc"): + """Write ensemble member information to file for WRFChem + + This writes fluxes in wrfchemi* files for one lag. This is based + on the statevector object, which is passed as argument, the + regionsfile and the namelist (for dealing with nests). + + In case an offset is optimized, wrfinput and wrfbdy files are + also modified accordingly. + + "outdir" is not used. + """ + + # Some abbreviations: + + # Data to write + members = statevector.ensemble_members[lag] + add_flux_fix = "flux_fix" in list(self.settings.keys()) + # Number of flux parameters per flux process. + # freum 2021-07-11: Use statevector.nparams_proc. This here + # would have been wrong if an offset was included and multiple + # emission processes used. + # nparams = int(self.dacycle.dasystem["nparameters"]) + # nparams_proc = nparams/int(self.dacycle.dasystem["n_emis_proc"]) + nparams_proc = statevector.nparams_proc + # Number of flux processes + n_emis_proc = int(self.dacycle.dasystem["n_emis_proc"]) + # WRF configuration + run_dir = self.settings["run_dir"] + n_domains = self.namelist["domains"]["max_dom"] + nmember_format = "%03d" + + + # Get member parameter maps for each domain + param_map_mem_dom = list() + for nproc in range(n_emis_proc): + # New parameter map list for this flux process + param_map_mem_dom.append(list()) + for mem in members: + # Get full parameter map for this member + param_map_mem = statevector.vector2grid( + mem.param_values[(nparams_proc*nproc):(nparams_proc*(nproc+1))]) + # Get domain-specific parameter maps from full map + nm = mem.membernumber + param_map_mem_dom[nproc].append(list()) + for domain in range(1, n_domains+1): + param_map_mem_dom[nproc][nm].append(self._sample_domain_map(param_map_mem, domain)) + + # Write fluxes for each domain: + + # Times that correspond to this lag + time_lag_start = self.dacycle["time.sample.start"] + time_lag_end = self.dacycle["time.sample.end"] + + for domain in range(1, n_domains+1): # WRF domain numbers start at 1 + + # Get the start times of wrfchemi files that belong to this lag + # First, get all times + file_times = self.wrfhelper.wrf_filename_times("wrfchemi_d%02d_"%domain) + file_times.sort() + # Then filter for the current lag. + file_times_lag = [file_times[i] for i in range(len(file_times)) + if + file_times[i] >= time_lag_start + and file_times[i] < time_lag_end + ] + + # Loop over flux files + for file_time in file_times_lag: + # Get file path + fn = "".join(["wrfchemi_d%02d_" % domain, + file_time.strftime("%Y-%m-%d_%H:%M:%S")]) + fp = os.path.join(run_dir, fn) + + # Path to original file + ofp = fp + self.settings["original_save_suffix"] + + # Which times in file to modify? + # Warning: I always use flux files with only one time + # index, so this part isn't tested. 
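+                # Illustration with hypothetical hourly flux times: if the
+                # file holds [00:00, 01:00, 02:00, 03:00] and the lag spans
+                # 01:00 <= t < 03:00, the filtering below yields
+                # indices = [1, 2], i.e. the time slice 1:3 is rewritten.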
+ + # First, get all times + src = nc.Dataset(ofp, "r") + flux_times = self.wrfhelper.times_in_wrf_file(src) + # Then filter for the current lag + indices = [i for i in range(len(flux_times)) + if + flux_times[i] >= time_lag_start + and flux_times[i] < time_lag_end + ] + # And finally make indices for slicing + # This assumes that flux times are monotonically + # increasing, so check that. + if np.any(np.diff(indices) != 1): + raise ValueError("Times in flux file " + \ + "'%s' " % ofp + \ + "are not monotonically increasing") + i0 = indices[0] + i1 = indices[-1] + 1 + + logging.debug("Writing fluxes to file %s, " % fp + \ + "time index slice %d:%d" %(i0, i1)) + + # Open destination file + dst = nc.Dataset(fp, "r+") + + # Read prior fluxes + prefix = self.settings["flux_optim"] + src_field = list() + for nproc in range(n_emis_proc): + if n_emis_proc==1: + src_varname = prefix + else: + src_varname = prefix + "_proc_" + str(nproc+1) + src_var = src.variables[src_varname] + src_field.append(src_var[i0:i1, :, :, :]) + + # Read fixed fluxes (if required) + if add_flux_fix: + fix_var = src.variables[self.settings["flux_fix"]] + fix_field = fix_var[i0:i1, :, :, :] + + # Write fluxes for every ensemble member + for mem in members: + + # Test: + # For testing the orienation of param_map_domain and + # src_field, make the bottom right corner 1. + # param_map_domain[0, -1] = 1 + # src_field[0, 0, 0, -1] = 1 # This was before n_emis_proc + + # Multiply prior fluxes with statevector map + field_state = np.zeros_like(src_field[0]) + nm = mem.membernumber + for nproc in range(n_emis_proc): + for nt in range(field_state.shape[0]): + for nz in range(field_state.shape[1]): + field_state[nt, nz, :, :] = ( + field_state[nt, nz, :, :] + + src_field[nproc][nt, nz, :, :] + * param_map_mem_dom[nproc][nm][domain-1][:] + ) + + # For testing whether the domain map is correct: + # field_state[nt, nz, :, :] = param_map_domain[:] + + # Add fixed fluxes + if add_flux_fix: + field_state += fix_field + + # Write to file handle (written to file when + # dst.close() is called) + varname = prefix + "_" + nmember_format % nm + dst.variables[varname][i0:i1, :, :, :] = field_state[:] + + src.close() + dst.close() + + # For debugging, copy dst dataset + #path = os.path.join(self.dacycle["dir.da_run"], "output") + #time_lag_start = self.dacycle["time.sample.start"] + #timestamp = time_lag_start.strftime("_lag%Y%m%d") + #self.wrfhelper.save_file_with_timestamp(fp, path, timestamp) + + # Add offset to wrfinput and wrfbdy + # Note: Everything but a constant offset would impact _BTXE + # and so on in wrfbdy. So only implementing a constant offset. + if self.dacycle.dasystem["do_offset"]: + if self.dacycle["time.restart"]: + logging.warning( + "do_offset = True and time.restart = True. Is " + \ + "this an advance step? If not, you are " + \ + "attempting to optimize initial and boundary " + \ + "conditions in a run with more than one cycle. " + \ + "This would produce an inconsistent result, " + \ + "because initial conditions also impact " + \ + "observations in previous cycles, but this " + \ + "isn't accounted for. Skipping." 
+ ) + else: + # Write wrfbdy + fp = os.path.join(self.settings["run_dir"], "wrfbdy_d01") + ofp = fp + self.settings["original_save_suffix"] + src = nc.Dataset(ofp, "r") + dst = nc.Dataset(fp, "r+") + quantities = ["BXS", "BXE", "BYS", "BYE"] + for mem in members: + offset_value = mem.param_values[-1] + var_names = [self.settings["tracer_optim"] + \ + "_" + nmember_format % mem.membernumber + \ + "_" + q for q in quantities] + for var_name in var_names: + var_src = src.variables[var_name][:] + dst.variables[var_name][:] = var_src + offset_value + src.close() + dst.close() + + # Write wrfinput + for domain in range(1, n_domains+1): + fp = os.path.join(self.settings["run_dir"], + "wrfinput_d%02d" % domain) + ofp = fp + self.settings["original_save_suffix"] + src = nc.Dataset(ofp, "r") + dst = nc.Dataset(fp, "r+") + for mem in members: + offset_value = mem.param_values[-1] + var_name = self.settings["tracer_optim"] + \ + "_" + nmember_format % mem.membernumber + var_src = src.variables[var_name][:] + dst.variables[var_name][:] = var_src + offset_value + src.close() + dst.close() + + logging.info("write_members_to_file done.") + + + ########## Private methods + + def _prepare_run(self, samples): + """Prepare a forward model WRF-Chem run. + + - Update the WRF's configuration file (namelist.input) + - Define file name where sampled simulated observations + are stored. + """ + + # Update parameters for new run (i.e., start and stop time) + self._update_namelist_file() + + # Define file name of sampled simulated observations + # Store pattern for finding the right file in write_simulated*, + # because they don't know the integer indices + self._sim_fpattern = "wrf_sampled_%%s_%s.nc" % self.dacycle["time.sample.stamp"] + + # Also construct full filename because pipeline relies on + # integer indices + self.simulated_file = [None] * len(samples) + for i, sample in enumerate(samples): + self.simulated_file[i] = os.path.join( + self.settings["run_dir"], + self._sim_fpattern % sample.get_samples_type()) + + # Prepare the track input file - NOT USED + #self._prepare_wrfinput_track() + + + + def _update_namelist_file(self): + """Update namelist.input file based on dacycle object + + Updated values: + - start time, end time + - restart timestep + - restart flag + + Note: format of namelist.input is slightly different after + running this: + - 19, 19, -> 19, 19 + - 00 -> 0 + - 2. -> 2.0 + - "wrfchemi_d<domain>_<date>" -> "wrfchemi_d<domain>_<date>" + This could be adjusted in f90nml settings, but it doesn't + seem to matter to WRF. + """ + + # Set start and end time in namelist + self._set_namelist_time("start_", self.dacycle["time.sample.start"]) + self._set_namelist_time("end_", self.dacycle["time.sample.end"]) + + # Set restart interval + # Compute it (as integer) + intrvl_timedelta = (self.dacycle["time.sample.end"] + - self.dacycle["time.sample.start"]) + intrvl_mins = intrvl_timedelta.total_seconds()/60. + if np.abs(intrvl_mins-np.floor(intrvl_mins)) > np.finfo(float).eps: + msg = "Cannot safely cast restart_interval in WRF " + \ + "namelist to integer. Without this casting, " + \ + "restart-files might not be created. Stopping." 
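+            # (For scale: a hypothetical 7-day sampling window gives
+            # 7*24*60 = 10080 minutes, which casts cleanly; the error is
+            # only raised for windows that are not whole minutes.)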
+ raise ValueError(msg) + # Write to namelist + self.namelist["time_control"]["restart_interval"] = int(intrvl_mins) + + # Set restart flag + # If this is a restart from a previous cycle, or a previous lag, + # the WRF model should start from restart files + # (time.sample.window is also used by TM5 observation operator to set + # the restart flag, so I guess this is the right value to use...) + if self.dacycle["time.restart"] or self.dacycle["time.sample.window"] != 0: + logging.info("Configuring WRF-Chem to perform restart.") + self.namelist["time_control"]["restart"] = True + else: + # Note: In case you ever want to start the first cycle with + # restart files, change this branch to "pass" + + logging.info("Configuring WRF-Chem to perform cold start.") + + self.namelist["time_control"]["restart"] = False + + # Write namelist changes to file + fp = os.path.join(self.settings["run_dir"], "namelist.input") + fp_original = fp + self.settings["original_save_suffix"] + f90nml.patch(fp_original, self.namelist, fp) + + def _set_namelist_time(self, prefix, date): + """Insert datetime object into namelist dictionary.""" + + if not isinstance(date, dtm.datetime): + logging.error("set_namelist_time: argument date must be a " + \ + "datetime.datetime- object") + + # Incidentally, names of datetime components WRF namelists are + # identical to those in datetime objects. Therefore, we can set + # them by iterating over their names. + time_components = ["year", "month", "day", "hour", "minute", "second"] + for time_component in time_components: + self.namelist["time_control"][prefix + time_component] = \ + self._repeat_per_domain(getattr(date, time_component)) + + + def _repeat_per_domain(self, value): + """Output: list of 'value' repeated max_dom times.""" + + n_domains = self.namelist["domains"]["max_dom"] + return [value] * n_domains + + + # def _prepare_wrfinput_track(self): + # """Prepares wrfinput_track.txt for this run + # + # Original version of track_driver can't deal + # with times outside of simulation period. So + # I'm cutting the original file down here. + # STATUS: NOT USED + # """ + # + # run_dir = self.settings["run_dir"] + # fp = os.path.join(run_dir, "wrfinput_track.txt") + # fp_original = fp + self.settings["original_save_suffix"] + # + # if not os.path.exists(fp_original): + # return + # + # # Read original track + # # Read all as str: have to parse times, and will keep format of + # # locations + # dat = np.loadtxt(fp_original, dtype=str) + # + # # Select lines that are in current lag + # times = [dtm.datetime.strptime(t_chr, "%Y-%m-%d_%H:%M:%S") for t_chr in dat[:, 0]] + # + # time_lag_start = self.dacycle["time.sample.start"] + # time_lag_end = self.dacycle["time.sample.end"] + # + # # Write track file for this run + # # Have to exclude 1st and include last timestep here because that's the + # # way track_driver needs it! + # # This is unlike the other cases (fluxes and wrfout), where + # # I include the start and exclude the end of the lag + # f = open(fp, "w") + # for n in range(dat.shape[0]): + # if times[n] > time_lag_start and times[n] <= time_lag_end: + # f.write(" ".join(dat[n, :]) + "\n") + # f.close() + + + def _load_rc(self, name): + """Read settings from the observation operator's rc-file + + Based on TM5ObservationOperator.load_rc + """ + + self.rcfile = rc.RcFile(name) + self.settings = self.rcfile.values + self.rc_filename = name + + logging.debug("rc-file %s loaded", name) + + def _validate_rc(self): + """Check that some required values are given in the rc-file. 
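+
+        A minimal rc-file sketch with the required keys (paths are
+        placeholders; the format is the "key : value" style read by
+        da.tools.rc):
+
+            source_dir : /path/to/WRFV3/run
+            run_dir    : /path/to/ctdas_wrf_run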
+ + Based on TM5ObservationOperator.validate_rc + """ + + needed_rc_items = ["source_dir", "run_dir"] + + for key in needed_rc_items: + if key not in self.settings: + msg = "Missing a required value in rc-file : %s" % key + logging.error(msg) + raise IOError(msg) + logging.debug("rc-file has been validated succesfully") + + def _create_fresh_run_dir(self, overwrite): + """ Set up the directory where WRF will run. + + Steps: + - Create target_directory + - From self.settings["source_dir"] to target_directory, + - copy files that need to be modified during runtime + - link files that need not be modified + - Useful to set overwrite=False for resubmitting a + failed run. Note: namelist.input is _always_ copied + regardless of the value of overwrite. + """ + + # Create target_dir + target_dir = self.settings["run_dir"] + create_dirs(target_dir) + + # By default, files are linked from source directory to + # target_dir. Define here which files need to be copied + # instead of linked, i.e., those that need to be modified + # During runtime. + # They are described by patterns that are compared to file + # names via re.match(pattern, filename) + copy_pattern_list = ["^namelist\.input$", + "^wrfchemi_.*", + # "^wrfinput_track\.txt$" # NOT USED + ] + + if self.dacycle.dasystem["do_offset"]: + copy_pattern_list.append("wrfinput_d.*") + copy_pattern_list.append("wrfbdy_d.*") + + # Since I don't know concise code to check for more than one + # pattern without a loop, create one single pattern for all. + copy_full_pattern = "(" + "|".join(copy_pattern_list) + ")" + + # Skip some files + skip_pattern_list = ["^wrfout.*", + "^wrfrst.*", + "namelist\.output", + ".*~$", + "log", + "^rsl.error.*", + "^rsl.out.*", + "sampled_.*", + "met_em.*" + ] + skip_full_pattern = "(" + "|".join(skip_pattern_list) + ")" + + # Get list of all files in source directory + source_dir = self.settings["source_dir"] + source_files = os.listdir(source_dir) + + # Copy/link files to target_dir + logging.info("Populating WRF run directory. This may take a while.") + for filename in source_files: + # Do nothing for files to skip + if re.match(skip_full_pattern, filename): + continue + + # Copy/link files + fp_source = os.path.join(source_dir, filename) + fp_destination = os.path.join(target_dir, filename) + # Skip if the file is already there and overwrite is false + if (os.path.isfile(fp_destination)) and (not overwrite): + continue + if re.match(copy_full_pattern, filename): + # Copy files to copy (with overwrite) + shutil.copy2(fp_source, target_dir) + else: + # Link files to link (with overwrite) + if os.path.exists(fp_destination): + os.remove(fp_destination) + os.symlink(fp_source, fp_destination) + + # create a log dir + create_dirs(os.path.join(target_dir, "log")) + + # Save original namelist.input - ignore overwrite. + # Note: this is useful in case you restart CTDAS with an updated namelist. 
+ fp_source = os.path.join(source_dir, "namelist.input") + shutil.copy2(fp_source, target_dir) + self._save_original_files([os.path.join(target_dir, "namelist.input")], True) + + # # Save original wrfinput_track.txt (if it exists) - NOT USED + # fp_source = os.path.join(source_dir, "wrfinput_track.txt") + # if os.path.exists(fp_source): + # shutil.copy2(fp_source, target_dir) + # self._save_original_files([os.path.join(target_dir, "wrfinput_track.txt")], True) + + + def _prepare_flux_files(self, overwrite=True): + """Prepare flux input files for WRF-Chem + + - Save original flux files + - Create fields for ensemble members if not yet present + + I want to get rid of saving original files, and the only + practical way to do the latter is in preprocessing because + here it's not parallelized. So either parallelize or remove. + Could replace with _validate_emissions. + """ + + # Flux files for this inversion period + all_files = self.wrfhelper.get_wrf_filenames("wrfchemi*00") + + ti = self.dacycle["time.start"] + days_plus = int(self.dacycle["time.nlag"])*int(self.dacycle["time.cycle"]) + te = self.dacycle["time.finish"] + dtm.timedelta(days_plus) + logging.warning("Processing until: te = " + te.isoformat() + ". Too far?") + pattern_time = "%Y-%m-%d_%H:%M:%S" + len_tstamp = len(pattern_time) + 2 + all_times = [dtm.datetime.strptime(f[-len_tstamp:], pattern_time) + for f in all_files] + wrfchemi_files = [all_files[nf] for nf in range(len(all_files)) + if ti <= all_times[nf] and all_times[nf] <= te] + + # Rename chemistry input files to wrfchemi*.original + self._save_original_files(wrfchemi_files, overwrite) + logging.info("Original flux files saved.") + + # Write placeholder fields for optimized fluxes into wrfchemi files + # -> This takes a lot of time for a large ensemble because it's done + # serially. The only practical way is to do this in preprocessing, + # so that should become the only allowed way. + logging.info("Creating flux fields for ensemble in flux files.") + prefix = self.settings["flux_optim"] + self._create_member_fields(wrfchemi_files, prefix, "", "zeros", False) + logging.info("Flux files prepared.") + + + def _save_original_files(self, files, overwrite=True): + """ Make a copy of files with a predefined suffix + + For development, add the option to skip (because it + takes a while). + files = list + """ + + suffix = self.settings["original_save_suffix"] + for file_ in files: + new_filename = file_ + suffix + if (os.path.isfile(new_filename)) and (not overwrite): + logging.debug("File %s exists, skipping", new_filename) + continue + shutil.copy2(file_, new_filename) + logging.debug("Copied %s to %s", file_, new_filename) + + + def _create_member_fields(self, files, prefix, suffix="", + mode="zeros", overwrite=True): + """ Create fields for each ensemble member for the original prefix+suffix. + + Since this takes a long time, I add the option to not overwrite in + case the fields are already there, i.e. created in preprocessing. + This is actually the only practical way at the moment. Either edit + this function out or parallelize. + + This assumes that original files were previously saved with + _self._save_original_files(). + + mode = "zeros": All zeros (will be used for emissions) + mode = "copy": Copy original (was for boundary- and initial + conditions, but that's not done anymore.) + files = list + Attributes are copied in any case. The member number is added to the + "description" attribute. 
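+
+        Naming sketch: with prefix "E_CO2" (hypothetical), an empty suffix and
+        three members, the variables E_CO2_000, E_CO2_001 and E_CO2_002 are
+        created alongside the original E_CO2 field.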
+ + """ + + # For each domain, and each file, write a field of 0's for each + # ensemble member (<prefix>_000, ...) + # Netcdf operations from https://stackoverflow.com/questions/15141563/python-netcdf-making-a-copy-of-all-variables-and-attributes-but-one + + # Specify directory to operate in + run_dir = self.settings["run_dir"] + # suffix for original + orig_suf = self.settings["original_save_suffix"] + # Number of ensemble members + nmembers = int(self.dacycle["da.optimizer.nmembers"]) + # Format of ensemble members + nmember_format = "%03d" + # Name of emission fields to copy = prefix+suffix + var_orig = prefix+suffix + # Create arrays of 0s for complete emission field in all files + have_src = False + for f in files: + # File path + fp = os.path.join(run_dir, f) + # Get variable that you want to emulate from original file + ofp = fp + orig_suf + src = nc.Dataset(ofp, "r") + src_var = src.variables[var_orig] + if mode == "zeros": + if not have_src: + # Only create this once + field_to_write = np.zeros(src_var.shape, dtype=src_var.dtype) + have_src=True + elif mode == "copy": + field_to_write = src_var[:] # always need new source + else: + raise ValueError("Unknown mode: %s" % mode) + + # Open netcdf handle in new file + dst = nc.Dataset(fp, "r+") + + for n in range(nmembers): + nmember_formatted = nmember_format % n + # field name for this member + fn = "".join([prefix, "_", nmember_formatted, suffix]) + # Skip if it should not be overwritten + if (fn in list(dst.variables.keys())) and (not overwrite): + continue + # Create 0-field for this member + dst.createVariable(fn, src_var.dtype, src_var.dimensions) + # Write the data + dst.variables[fn][:] = field_to_write + # Copy variable attributes all at once via dictionary + dst.variables[fn].setncatts(src_var.__dict__) + # Add ensemble member number to description of variable + dst.variables[fn].setncattr( + "description", + dst.variables[fn].getncattr("description") + \ + " ensemble_member_" + nmember_formatted) + src.close() + dst.close() + + + def _validate_input(self): + """Validate that WRF has all the inputs it needs to run + + Mostly only checks existence of files, not their content. + """ + + # In all cases: Validate parameters and emissions. + self._validate_parameters() + #self._validate_emissions() # NOT USED + + # Validate different inputs needed for cold start and restart + if self.namelist["time_control"]["restart"]: + logging.info("Restart run: Validate for restart files,") + self._validate_restart_files() + else: + logging.info("Cold start: Validate initial and boundary " + + "conditions.") + self._validate_boundary_conditions() + self._validate_initial_conditions() + + + + def _validate_parameters(self): + """Checks namelist.input and number of allocated tasks + + namelist.input (WRF's configuration file) can contain + instructions on how many tasks to use. This function + checks whether namelist.input itself exists, and the + number of allocated tasks is compatible with values + in the namelist. + """ + + # Check if WRF configuration file exists (kinda moot, it's + # been read in before... + self._confirm_file_exists("namelist.input") + + # Confirm that the allocated number of tasks are enough + # in case numbers of tasks were explicitly given in the + # namelist. 
+ self._validate_ntasks() + + + def _nprocs_namelist(self): + """Compute how many tasks namelist.input requests""" + + + + # Check variables in namelist that specify the number of tasks + got_x = "nproc_x" in list(self.namelist["domains"].keys()) + got_y = "nproc_y" in list(self.namelist["domains"].keys()) + + got_quilt = "namelist_quilt" in list(self.namelist.keys()) + if got_quilt: + got_iog = "nio_groups" in list(self.namelist["namelist_quilt"].keys()) + got_iot = "nio_tasks_per_group" in list(self.namelist["namelist_quilt"].keys()) + else: + got_iog = False + got_iot = False + + # Compute requested number of compute processes + if got_x or got_y: + if got_x != got_y: + logging.error("nproc_x xor nproc_y specified in WRF " + \ + "namelist, don't know how to deal with that.") + + nproc_x = int(self.namelist["domains"]["nproc_x"]) + nproc_y = int(self.namelist["domains"]["nproc_y"]) + ntasks_compute = nproc_x*nproc_y + else: + ntasks_compute = 0 + + # Compute requested number of IO processes + if got_iog or got_iot: + if got_iog != got_iot: + logging.error("nio_groups xor nio_tasks_per_group " + \ + "specified in WRF namelist, don't " + \ + "know how to deal with that.") + nio_groups = int(self.namelist["namelist_quilt"]["nio_groups"]) + nio_tasks_per_group = int(self.namelist["namelist_quilt"]["nio_tasks_per_group"]) + ntasks_io = nio_groups*nio_tasks_per_group + else: + ntasks_io = 0 + + # The number of requested process is the sum of requested + # compute and io tasks + nprocs = ntasks_compute + ntasks_io + return nprocs + + def _validate_ntasks(self): + """ Confirm that the allocated number of tasks are enough + + In case numbers of tasks were explicitly given in the namelist. + """ + + # Number of tasks as specified in CycleControl (comes from + # rc-file passed to CTDAS) + ntasks_cc = int(self.dacycle["da.resources.ntasks"]) + + # Number of processes requested by namelist parameters + ntasks_namelist = self._nprocs_namelist() + + # Compare the two. + if ntasks_namelist > ntasks_cc: + logging.error("WRF namelist requests more processes " + \ + "than available in the job.") + else: + logging.info("Number of allocated tasks OK for WRF namelist.") + + + def _validate_emissions(self): + """Does nothing + + Should check if emissions files exist for this period. But + this would require diving into how emissions are saved (e.g. + per day, hour, ...), so I'm omitting it. If files are missing, + the run will crash later. + """ + + pass + + def _validate_restart_files(self): + """Does nothing + + Should check whether the restart files for this run exist, + but filenames depend on io_form_restart. 
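+
+        For the common netCDF restart case the expected names would follow
+        the commented sketch below, e.g. wrfrst_d01_2019-06-01_00:00:00 for
+        domain 1 at the start of the sample window (date hypothetical).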
+ """ + + #ndomains = self.namelist["domains"]["max_dom"] + #time_format = self.dacycle["time.sample.start"].strftime("%Y-%m-%d_%H:%M:%S") + #for n_d in range(1, ndomains + 1): + # restart_file = "wrfrst_d{domain:02d}_".format(domain=n_d) + time_format + # self._confirm_file_exists(restart_file) + + pass + + + def _validate_boundary_conditions(self): + """Check if boundary condition file exists""" + + # Check if boundary conditions file exist + self._confirm_file_exists("wrfbdy_d01") + + def _validate_initial_conditions(self): + """Check if initial conditions files exist""" + + ndomains = self.namelist["domains"]["max_dom"] + for n_d in range(1, ndomains + 1): + input_file = "wrfinput_d{:02d}".format(n_d) + self._confirm_file_exists(input_file) + + def _confirm_file_exists(self, filename): + """Throw an error if <filename> is not present in run_dir""" + + filepath = os.path.join(self.settings["run_dir"], filename) + if not os.path.exists(filepath): + msg = "File %s not found." % filename + logging.error(msg) + raise OSError(msg) + + logging.debug("File %s found.", filename) + + + def _run(self): + """Run the WRF model by submitting wrf.exe to slurm. + + Sources on how to do this: + - https://stackoverflow.com/questions/32594734/slurm-multiprocessing-python-job + - https://pythonspot.com/python-subprocess/ + - submit_tm5_tools.py + + It would be cleaner to replace "run_command" and so on with + calls to the platform module. However, running TM5 is handled + by submit_tm5 and submit_tm5_tools.py instead. On the machine + where the model was developed, only full nodes can be reserved + anyway. So I copy this behavior. + """ + + # Option to run only if you haven't yet for this period + # Useful for debegging everything that comes after running + # the transport model. + + if "no_rerun_wrf" in list(self.settings.keys()): + if self.settings["no_rerun_wrf"].lower() in ["true", "t", "1"]: + if self._has_wrfout(): + logging.info("no_rerun_wrf = True and transport " + \ + "model output already present for " + \ + "run period. Therefore, not rerunning " + \ + "model and using existing output.") + return + elif self.settings["no_rerun_wrf"].lower() in ["false", "f", "0"]: + pass + else: + msg = "Don't understand value " + \ + "'%s' " % self.settings["no_rerun_wrf"] + \ + "for rc-option 'no_rerun_wrf'." + logging.error(msg) + raise ValueError(msg) + + run_dir = self.settings["run_dir"] + + # Change to wrf run directory + cwd = os.getcwd() + os.chdir(run_dir) + + # Build command + # Basic mpirun command given in rc-file + command_ = self.settings["run_command"] + # Get number of tasks + ntasks = int(self.dacycle["da.resources.ntasks"]) + if self.settings["run_ntask_opt"]: + command_ += " " + self.settings["run_ntask_opt"] + str(ntasks) + # Request a runtime. Not that important, actually, it's just a job step. + if self.settings["run_t_opt"]: + command_ += " " + self.settings["run_t_opt"] + self.settings["run_t"] + # Add executable and redirection of output. + command = command_ + " ./wrf.exe >& log/wrf.log" + + # Run WRF model + logging.info("Starting WRF model with command: %s" %command) + p = subprocess.Popen(command.split(), + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + # Wait for process to finish (may not be needed) + p.wait() + logging.info("WRF model run complete.") + + # Check output + retcode = utilities.check_out_err(p) + if retcode != 0: + raise OSError("WRF returned errors. 
Check logs in %s" % run_dir) + + # Switch back to previous working directory + os.chdir(cwd) + + def _has_wrfout(self): + """Check if there are wrfout files for start and end of this lag + + Purpose: Option to not rerun the transport model but go straight + to sampling wrfout files, used during development of the sampler. + """ + + #pylint: disable=unreachable + + run_dir = self.settings["run_dir"] + + t1 = self.dacycle["time.sample.start"] + t2 = self.dacycle["time.sample.end"] + + fpattern = "wrfout_d01_%Y-%m-%d_%H:%M:%S" + fnames = [t.strftime(fpattern) for t in [t1, t2]] + fpaths = [os.path.join(run_dir, fn) for fn in fnames] + files_exist = list(map(os.path.exists, fpaths)) + + return all(files_exist) + + + def _sample_simulated_obs(self, samples): + """Sample simulated observations + + Calls functions that sample 4D modeled mixing ratio fields + at the times and locations of observations. + """ + + for sample in samples: + sample_type = sample.get_samples_type() + if sample_type == "column": + logging.info("Starting _launch_wrfout_column_sampling") + self._launch_wrfout_column_sampling(sample) + logging.info("Finished _launch_wrfout_column_sampling") + elif sample_type == "point": + self._launch_wrfout_point_sampling(sample) + else: + logging.error("Unknown sample type: %s", + sample.get_samples_type()) + + + def _launch_wrfout_point_sampling(self, samples): + """Raises an error that says it hasn't been implemented. + + Should sample WRF output at coordinates of point observations. + Not implemented yet. + """ + + msg = "Point sampling not implemented." + logging.error(msg) + raise NotImplementedError(msg) + + + def _launch_wrfout_column_sampling(self, sample): + """Sample WRF output at coordinates of column observations.""" + + run_dir = self.settings["run_dir"] + + sampling_coords_file = os.path.join( + self.dacycle["dir.input"], + sample.get_samples_type() + \ + "_coordinates_%s.nc" % self.dacycle["time.sample.stamp"]) + + # Reconstruct self.simulated_file[i] + out_file = self._sim_fpattern % sample.get_samples_type() + + # Remove intermediate files from a previous sampling job (might + # still be there if that one fails) + # The file pattern is hardcoded in wrfout_sampler + slicefile_pattern = out_file + ".*.slice" + for f in glob.glob(os.path.join(run_dir, slicefile_pattern)): + os.remove(f) + + # Spawn multiple wrfchem_sampler instances, + # using at most all processes available + nprocs1 = int(self.dacycle["da.resources.ntasks"]) + + # Might not want to use that many processes if there are few + # observations, because of overhead. Set a minimum number of + # observations per process, and reduce the number of + # processes to hit that. + Nobs = len(sample.datalist) + if Nobs == 0: + logging.info("No observations, skipping sampling") + return + + # Might want to increase this, no idea if this is reasonable + nobs_min = 100 + nprocs2 = max(1, int(float(Nobs)/float(nobs_min))) + + # Number of processes to use: + nprocs = min(nprocs1, nprocs2) + + # Make run command + # For a task with 1 processor, specifically request -N1 because + # otherwise slurm apparently sometimes tries to allocate one task to + # more than one node. Or something like that. 
See here: + # https://stackoverflow.com/questions/24056961/running-slurm-script-with-multiple-nodes-launch-job-steps-with-1-task + command_ = self.settings["run_command_1task"] + + # Specifically request memory per process (otherwise the first + # one would clog the node and you end up with serial execution) + if self.settings["run_mem_opt"]: + max_nprocs_per_node = min(int(self.settings["cpus_per_node"]), nprocs) + mb_per_process = np.floor(int(self.settings["mem_per_node"])*1024. / float(1+max_nprocs_per_node)) # The +1 is to leave a bit spare memory + mem_str = str(int(mb_per_process)) + self.settings["mem_unit_str_MB"] + command_ += " " + self.settings["run_mem_opt"] + mem_str + + # Request a runtime. Not that important, actually, it's just a + # job step. + if self.settings["run_t_opt"]: + command_ = command_ \ + + " " + self.settings["run_t_opt"] \ + + " " + self.settings["run_t"] + + # Check if output slice files are already present + # This shouldn't happen, because they are deleted + # a few lines above. But if for some reason (crash) + # they are still here, this might lead to funny behavios. + if nprocs > 1: + output_files = glob.glob(slicefile_pattern) + if len(output_files) > 0: + msg = "Files that match the pattern of the " + \ + "sampler output are already present. Stopping." + logging.error(msg) + raise OSError(msg) + + # Submit processes + procs = list() + for nproc in range(nprocs): + cmd = " ".join([ + command_, + "python ./da/tools/wrfchem/wrfout_sampler.py", + "--nproc %d" % nproc, + "--nprocs %d" % nprocs, + "--sampling_coords_file %s" % sampling_coords_file, + "--run_dir %s" % run_dir, + "--original_save_suffix %s" % self.settings["original_save_suffix"], + "--nmembers %d" % int(self.dacycle["da.optimizer.nmembers"]), + "--tracer_optim %s" % self.settings["tracer_optim"], + "--outfile_prefix %s" % out_file, + "--footprint_samples_dim %d" % int(self.settings['obs.column.footprint_samples_dim']) + ]) + + procs.append(subprocess.Popen(cmd.split(), + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT)) + + logging.info("Started %d sampling process(es).", nprocs) + logging.debug("Command of last process: %s", cmd) + + # Wait for all processes to finish + for n in range(nprocs): + procs[n].wait() + + # Check for errors + retcodes = [] + for n in range(nprocs): + logging.debug("Checking errors in process %d", n) + retcodes.append(utilities.check_out_err(procs[n])) + + if any([r != 0 for r in retcodes]): + raise RuntimeError("At least one sampling process " + \ + "finished with errors.") + + logging.info("All sampling processes finished.") + + # Join output files + logging.info("Joining output files.") + if nprocs > 1: + utilities.cat_ncfiles(run_dir, slicefile_pattern, + "sounding_id", out_file, + in_pattern=True) + + logging.info("WRF column output sampled.") + logging.info("If samples object carried observations, output " + \ + "file written to %s", self.simulated_file) + + + def _save_data(self): + """Saves data from this lag + + This doesn't *have* to do anything. WRF is always started from + the same directory and takes care of the restarts itself, so no + files have to be collected (which is what happens here for TM5). + However, if "save_wrf_step" is set to True in the rc-file, then + all outputs and the namelist are copied to a subfolder for + debugging. 
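+
+        A minimal sketch of the intended use (the rc key is the one read
+        below; the date is made up)::
+
+            ! in the observation operator rc file
+            save_wrf_step : True
+
+        With this set and a lag starting on 2018-07-01, the wrfout* files
+        of this lag, the restart files valid at its end, the rsl.* files
+        and namelist.input are copied to <run_dir>/wrf_lag20180701/.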
+ """ + + # For compatibility: + self.restart_filelist = [] + self.output_filelist = [] + + # + + # Relevant wrf files that change each time wrf is run + if "save_wrf_step" in list(self.settings.keys()): + if self.settings["save_wrf_step"].lower() in ["true", "t", "1"]: + # Keep some files + time_lag_start = self.dacycle["time.sample.start"] + timestamp = time_lag_start.strftime("_lag%Y%m%d") + + path = os.path.join(self.settings["run_dir"], "wrf"+timestamp) + logging.info("Saving wrf files to " + path) + + if not os.path.exists(path): + os.makedirs(path) + + # NOT USED + # wrfout_track + # pattern = os.path.join(self.settings["run_dir"], + # "wrfout_track_*") + # files = glob.glob(pattern) + # for f in files: + # self.wrfhelper.save_file_with_timestamp(f, path, timestamp) + + # Prepare timesteps + ti = self.dacycle["time.sample.start"] + te = self.dacycle["time.sample.end"] + pattern_time = "%Y-%m-%d_%H:%M:%S" + len_tstamp = len(pattern_time) + 2 + + # List wrfout files + file_pattern = "wrfout*" + all_files = self.wrfhelper.get_wrf_filenames(file_pattern) + all_times = [dtm.datetime.strptime(f[-len_tstamp:], pattern_time) + for f in all_files] + + # Save wrfout files that belong to this lag + for nf in range(len(all_files)): + if ti <= all_times[nf] and all_times[nf] <= te: + self.wrfhelper.save_file_with_timestamp(all_files[nf], path, timestamp) + + # Restart files + file_pattern = "wrfrst*" + all_files = self.wrfhelper.get_wrf_filenames(file_pattern) + all_times = [dtm.datetime.strptime(f[-len_tstamp:], pattern_time) + for f in all_files] + for nf in range(len(all_files)): + if all_times[nf] == te: + self.wrfhelper.save_file_with_timestamp(all_files[nf], path, timestamp) + + # rsl files + file_pattern = "rsl.*" + all_files = self.wrfhelper.get_wrf_filenames(file_pattern) + for nf in range(len(all_files)): + self.wrfhelper.save_file_with_timestamp(all_files[nf], path, timestamp) + + # Namelist + in_path = os.path.join(self.settings["run_dir"], "namelist.input") + self.wrfhelper.save_file_with_timestamp(in_path, path, timestamp) + + + # def _save_track_output(self): + # """ + # Save output from track_driver by adding a timestamp. + # STATUS: NOT USED + # """ + # run_dir = self.settings["run_dir"] + # ndomains = self.namelist["domains"]["max_dom"] + # time_lag_start = self.dacycle["time.sample.start"] + # timestamp = time_lag_start.strftime("%Y-%m-%d_%H:%M:%S") + # # for testing purposes, also add system time + # nowstamp = dtm.datetime.now().strftime("%Y-%m-%d_%H:%M:%S") + # timestamp += "_"+nowstamp + # + # for domain in range(1, ndomains+1): + # fn = "wrfout_track_d%02d" % domain + # fp = os.path.join(run_dir, fn) + # if os.path.exists(fp): + # shutil.copy2(fp, fp + "_" + timestamp) + + def _sample_domain_map(self, full_map, domain): + """Sample domain map + + Takes as input a <full_map> with values for domain d01 in + resolution of the finest domain (= format of regions file, + output of vector2grid) and returns values for extent and + resolution of <domain>. 
+ """ + + # Start indices (0-based) + start1 = self._start_in_1(full_map, domain) + start = [s1-1 for s1 in start1] + if any([s<0 for s in start]): + raise ValueError("Something is wrong with start indices") + + # Step sizes + strides = self._strides(full_map, domain) + + # Stop indices: add domain size to start + stop = start[:] + dom = self.namelist["domains"] + # -1 here to have #cells not #edges + stop[0] = start[0] + (dom["e_we"][domain-1]-1)*strides[0] + stop[1] = start[1] + (dom["e_sn"][domain-1]-1)*strides[1] + + # Now sample + # Sampling indices as sequence + xseq = list(range(start[0], stop[0], strides[0])) + yseq = list(range(start[1], stop[1], strides[1])) + # Sampling indices as grid + xx, yy = np.meshgrid(xseq, yseq) + + # Sample: ATTENTION: axis 0 refers to the latitude! + sampled_map = full_map[yy, xx] + + return sampled_map + + def _start_in_1(self, full_map, domain): + """Start index of domain in full_map, 1-based""" + domains = self._parent_domains(domain) + domains.append(domain) + istart = 1 # i refers to x (as e_we) + jstart = 1 # j refers to y (as e_sn) + for ndom in range(len(domains)-1): + this_domain = domains[ndom+1] + this_parent = domains[ndom] + istart += (self.namelist["domains"]["i_parent_start"][this_domain-1]-1) \ + *self._strides(full_map, this_parent)[0] + jstart += (self.namelist["domains"]["j_parent_start"][this_domain-1]-1) \ + *self._strides(full_map, this_parent)[1] + return [istart, jstart] + + def _strides(self, full_map, domain): + """Step size in <full_map> for <domain> + + I.e. the product of the parent grid ratios between + finest domain and <domain> + """ + dom = self.namelist["domains"] + # On which resolution is full_map? stride of domain 1: + # (Attention: index 1 refers to e_we, index 0 refers to e_sn) + stride1_x = full_map.shape[1] / (dom["e_we"][0]-1) + stride1_y = full_map.shape[0] / (dom["e_sn"][0]-1) + # stride = product of parent grid ratios + if domain == 1: + ratio_dom = 1 + else: + domains = self._parent_domains(domain) + domains.append(domain) + ratio_dom = np.array(dom["parent_grid_ratio"])[np.array(domains)-1].prod() + # round() converts to int more safely. Still assumes the + # namelist and regions file are compatible, if not, this + # might return wrong results. + stride_x = int(round(stride1_x/ratio_dom)) + stride_y = int(round(stride1_y/ratio_dom)) + + return (stride_x, stride_y) + + def _parent_domains(self, domain): + """Figure out the domain parents of this domain.""" + + if domain == 1: + # No parents + return [] + else: + # Add parent domain to list + # "-1" because domain numbers start at 1 + parent = self.namelist["domains"]["parent_id"][domain-1] + # Add parents of parent to list + parents_of_parent = self._parent_domains(parent) + parents_of_parent.append(parent) + # Done + return parents_of_parent + + + # Obsolete: + #def _get_ntasks_max(self): + # """Get the number of tasks that can be spawned from this process.""" + # os_environ_ntask_var = self.settings["os_environ_ntask_var"] + # if os_environ_ntask_var in os.environ.keys(): + # # I believe 1 process is taken up by this one, so -1 + # ntasks = int(os.environ[os_environ_ntask_var]) - 1 + # else: + # ntasks = int(self.settings["ntasks_max_default"]) + # msg = "%s not detected in " % os_environ_ntask_var + \ + # "environment variables. 
Setting ntasks=%d" % ntasks + # logging.warning(msg) + # + # return ntasks + + +if __name__ == "__main__": + pass diff --git a/da/pipelines/pipeline_wrfchem.py b/da/pipelines/pipeline_wrfchem.py new file mode 100755 index 0000000000000000000000000000000000000000..2cb6a2d5ac81028b84a735245f701e465c04a26c --- /dev/null +++ b/da/pipelines/pipeline_wrfchem.py @@ -0,0 +1,469 @@ +"""CarbonTracker Data Assimilation Shell (CTDAS) Copyright (C) 2017 Wouter Peters. +Users are recommended to contact the developers (wouter.peters@wur.nl) to receive +updates of the code. See also: http://www.carbontracker.eu. + +This program is free software: you can redistribute it and/or modify it under the +terms of the GNU General Public License as published by the Free Software Foundation, +version 3. This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with this +program. If not, see <http://www.gnu.org/licenses/>.""" +#!/usr/bin/env python +# pipeline.py + +""" +.. module:: pipeline +.. moduleauthor:: Wouter Peters + +Revision History: + +2021-07-10 freum Version for wrfchem +This is the wrfchem version, which was started in 2019 and does not yet include +all changes that were made to the original pipeline since then. They can probably +be integrated easily, so that wrfchem can use the standard pipeline. +Differences to original: +- Everything related to "inversion.savestate.dir" is not included here +- Instead of "if sample.get_samples_type() == 'flask': ", samples includes an + empty function write_sample_auxiliary + +File created on 06 Sep 2010. + +The pipeline module holds methods that execute consecutive tasks with each of the objects of the DA system. 
+ +""" +import logging +import os +import sys +import datetime +import copy + +header = '\n\n *************************************** ' +footer = ' *************************************** \n ' + + +def ensemble_smoother_pipeline(dacycle, platform, dasystem, samples, statevector, obsoperator, optimizer): + """ The main point of entry for the pipeline """ + sys.path.append(os.getcwd()) + + samples = samples if isinstance(samples,list) else [samples] + + logging.info(header + "Initializing current cycle" + footer) + start_job(dacycle, dasystem, platform, statevector, samples, obsoperator) + + prepare_state(dacycle, statevector) + + sample_state(dacycle, samples, statevector, obsoperator) + + invert(dacycle, statevector, optimizer) + + advance(dacycle, samples, statevector, obsoperator) + + save_and_submit(dacycle, statevector) + logging.info("Cycle finished...exiting pipeline") + + + +def forward_pipeline(dacycle, platform, dasystem, samples, statevector, obsoperator): + """ The main point of entry for the pipeline """ + sys.path.append(os.getcwd()) + + samples = samples if isinstance(samples,list) else [samples] + + logging.info(header + "Initializing current cycle" + footer) + start_job(dacycle, dasystem, platform, statevector, samples, obsoperator) + + if 'forward.savestate.exceptsam' in dacycle: + sam = (dacycle['forward.savestate.exceptsam'].upper() in ["TRUE","T","YES","Y"]) + else: + sam = False + + if 'forward.savestate.dir' in dacycle: + fwddir = dacycle['forward.savestate.dir'] + else: + logging.debug("No forward.savestate.dir key found in rc-file, proceeding with self-constructed prior parameters") + fwddir = False + + if 'forward.savestate.legacy' in dacycle: + legacy = (dacycle['forward.savestate.legacy'].upper() in ["TRUE","T","YES","Y"]) + else: + legacy = False + logging.debug("No forward.savestate.legacy key found in rc-file") + + if not fwddir: + # Simply make a prior statevector using the normal method + prepare_state(dacycle, statevector)#LU tutaj zamiast tego raczej to stworzenie nowej kowariancji i ensembli bo pozostale rzeczy sa na gorze i na doel. + else: + # Read prior information from another simulation into the statevector. + # This loads the results from another assimilation experiment into the current statevector + + if sam: + filename = os.path.join(fwddir, dacycle['time.start'].strftime('%Y%m%d'), 'savestate_%s.nc'%dacycle['time.start'].strftime('%Y%m%d')) + #filename = os.path.join(fwddir, dacycle['time.start'].strftime('%Y%m%d'), 'savestate.nc') + statevector.read_from_file_exceptsam(filename, 'prior') + elif not legacy: + filename = os.path.join(fwddir, dacycle['time.start'].strftime('%Y%m%d'), 'savestate_%s.nc'%dacycle['time.start'].strftime('%Y%m%d')) + statevector.read_from_file(filename, 'prior') + else: + filename = os.path.join(fwddir, dacycle['time.start'].strftime('%Y%m%d'), 'savestate.nc') + statevector.read_from_legacy_file(filename, 'prior') + + + # We write this "prior" statevector to the restart directory, so we can later also populate it with the posterior statevector + # Note that we could achieve the same by just copying the wanted forward savestate.nc file to the restart folder of our current + # experiment, but then it would already contain a posterior field as well which we will try to write in save_and_submit. + # This could cause problems. Moreover, this method allows us to read older formatted savestate.nc files (legacy) and write them into + # the current format through the "write_to_file" method. 
+ + savefilename = os.path.join(dacycle['dir.restart'], 'savestate_%s.nc' % dacycle['time.start'].strftime('%Y%m%d')) + statevector.write_to_file(savefilename, 'prior') + + # Now read optimized fluxes which we will actually use to propagate through the system + + if not fwddir: + # if there is no forward dir specified, we simply run forward with unoptimized prior fluxes in the statevector + logging.info("Running forward with prior savestate from: %s"%savefilename) + + else: + # Read posterior information from another simulation into the statevector. + # This loads the results from another assimilation experiment into the current statevector + + if sam: + statevector.read_from_file_exceptsam(filename, 'opt') + elif not legacy: + statevector.read_from_file(filename, 'opt') + else: + statevector.read_from_legacy_file(filename, 'opt') + + logging.info("Running forward with optimized savestate from: %s"%filename) + + # Finally, we run forward with these parameters + advance(dacycle, samples, statevector, obsoperator) + + # In save_and_submit, the posterior statevector will be added to the savestate.nc file, and it is added to the copy list. + # This way, we have both the prior and posterior data from another run copied into this assimilation, for later analysis. + + save_and_submit(dacycle, statevector) + + logging.info("Cycle finished...exiting pipeline") +#################################################################################################### + +def analysis_pipeline(dacycle, platform, dasystem, samples, statevector): + """ Main entry point for analysis of ctdas results """ + + from da.analysis.cteco2.expand_fluxes import save_weekly_avg_1x1_data, save_weekly_avg_state_data, save_weekly_avg_tc_data, save_weekly_avg_ext_tc_data, save_weekly_avg_agg_data + from da.analysis.cteco2.expand_molefractions import write_mole_fractions + from da.analysis.cteco2.summarize_obs import summarize_obs + from da.analysis.cteco2.time_avg_fluxes import time_avg + + logging.info(header + "Starting analysis" + footer) + + dasystem.validate() + dacycle.dasystem = dasystem + dacycle.daplatform = platform + dacycle.setup() + statevector.setup(dacycle) + + logging.info(header + "Starting mole fractions" + footer) + + write_mole_fractions(dacycle) + summarize_obs(dacycle['dir.analysis']) + + logging.info(header + "Starting weekly averages" + footer) + + save_weekly_avg_1x1_data(dacycle, statevector) + save_weekly_avg_state_data(dacycle, statevector) + save_weekly_avg_tc_data(dacycle, statevector) + save_weekly_avg_ext_tc_data(dacycle) + save_weekly_avg_agg_data(dacycle,region_aggregate='transcom') + save_weekly_avg_agg_data(dacycle,region_aggregate='transcom_extended') + save_weekly_avg_agg_data(dacycle,region_aggregate='olson') + save_weekly_avg_agg_data(dacycle,region_aggregate='olson_extended') + save_weekly_avg_agg_data(dacycle,region_aggregate='country') + + logging.info(header + "Starting monthly and yearly averages" + footer) + + time_avg(dacycle,'flux1x1') + time_avg(dacycle,'transcom') + time_avg(dacycle,'transcom_extended') + time_avg(dacycle,'olson') + time_avg(dacycle,'olson_extended') + time_avg(dacycle,'country') + + logging.info(header + "Finished analysis" + footer) + + + +def archive_pipeline(dacycle, platform, dasystem): + """ Main entry point for archiving of output from one disk/system to another """ + + if not 'task.rsync' in dacycle: + logging.info('rsync task not found, not starting automatic backup...') + return + else: + logging.info('rsync task found, starting automatic 
backup...') + + for task in dacycle['task.rsync'].split(): + sourcedirs = dacycle['task.rsync.%s.sourcedirs'%task] + destdir = dacycle['task.rsync.%s.destinationdir'%task] + + rsyncflags = dacycle['task.rsync.%s.flags'%task] + + # file ID and names + jobid = dacycle['time.end'].strftime('%Y%m%d') + targetdir = os.path.join(dacycle['dir.exec']) + jobfile = os.path.join(targetdir, 'jb.rsync.%s.%s.jb' % (task,jobid) ) + logfile = os.path.join(targetdir, 'jb.rsync.%s.%s.log' % (task,jobid) ) + # Template and commands for job + jobparams = {'jobname':"r.%s" % jobid, 'jobnodes': '1', 'jobtime': '1:00:00', 'joblog': logfile, 'errfile': logfile} + + if platform.ID == 'cartesius': + jobparams['jobqueue'] = 'staging' + + template = platform.get_job_template(jobparams) + for sourcedir in sourcedirs.split(): + execcommand = "\nrsync %s %s %s\n" % (rsyncflags, sourcedir,destdir,) + template += execcommand + + # write and submit + platform.write_job(jobfile, template, jobid) + jobid = platform.submit_job(jobfile, joblog=logfile) + + + +def start_job(dacycle, dasystem, platform, statevector, samples, obsoperator): + """ Set up the job specific directory structure and create an expanded rc-file """ + + dasystem.validate() + dacycle.dasystem = dasystem + dacycle.daplatform = platform + dacycle.setup() + #statevector.dacycle = dacycle # also embed object in statevector so it can access cycle information for I/O etc + #samples.dacycle = dacycle # also embed object in samples object so it can access cycle information for I/O etc + #obsoperator.dacycle = dacycle # also embed object in obsoperator object so it can access cycle information for I/O etc + obsoperator.setup(dacycle) # Setup Observation Operator + statevector.setup(dacycle) + + +def prepare_state(dacycle, statevector): + """ Set up the input data for the forward model: obs and parameters/fluxes""" + + # We now have an empty statevector object that we need to populate with data. If this is a continuation from a previous cycle, we can read + # the previous statevector values from a NetCDF file in the restart directory. If this is the first cycle, we need to populate the statevector + # with new values for each week. 
After we have constructed the statevector, it will be propagated by one cycle length so it is ready to be used + # in the current cycle + + logging.info(header + "starting prepare_state" + footer) + + if not dacycle['time.restart']: + + # Fill each week from n=1 to n=nlag with a new ensemble + for n in range(statevector.nlag): + date = dacycle['time.start'] + datetime.timedelta(days=(n + 0.5) * int(dacycle['time.cycle'])) + cov = statevector.get_covariance(date, dacycle) + statevector.make_new_ensemble(n, cov) + + else: + # Read the statevector data from file + #saved_sv = os.path.join(dacycle['dir.restart.current'], 'savestate.nc') + saved_sv = os.path.join(dacycle['dir.restart'], 'savestate_%s.nc' % dacycle['da.restart.tstamp'].strftime('%Y%m%d')) + statevector.read_from_file(saved_sv) # by default will read "opt"(imized) variables, and then propagate + + # Now propagate the ensemble by one cycle to prepare for the current cycle + statevector.propagate(dacycle) + + # Finally, also write the statevector to a file so that we can always access the a-priori information + current_sv = os.path.join(dacycle['dir.restart'], 'savestate_%s.nc' % dacycle['time.start'].strftime('%Y%m%d')) + statevector.write_to_file(current_sv, 'prior') # write prior info + + + +def sample_state(dacycle, samples, statevector, obsoperator): + """ Sample the filter state for the inversion """ + + # Before a forecast step, save all the data to a save/tmp directory so we can later recover it before the propagation step. + # This is especially important for: + # (i) The transport model restart data which holds the background mole fractions. This is needed to run the model one cycle forward + # (ii) The random numbers (or the seed for the random number generator) so we can recreate the ensembles if needed + + #status = dacycle.MoveSaveData(io_option='store',save_option='partial',filter=[]) + #msg = "All restart data have been copied to the save/tmp directory for future use" ; logging.debug(msg) + logging.info(header + "starting sample_state" + footer) + nlag = int(dacycle['time.nlag']) + logging.info("Sampling model will be run over %d cycles" % nlag) + + obsoperator.get_initial_data() + + for lag in range(nlag): + logging.info(header + ".....Ensemble Kalman Filter at lag %d" % (lag + 1)) + + ############# Perform the actual sampling loop ##################### + + sample_step(dacycle, samples, statevector, obsoperator, lag) + + logging.debug("statevector now carries %d samples" % statevector.nobs) + + + +def sample_step(dacycle, samples, statevector, obsoperator, lag, advance=False): + """ Perform all actions needed to sample one cycle """ + + # First set up the information for time start and time end of this sample + dacycle.set_sample_times(lag) + + startdate = dacycle['time.sample.start'] + enddate = dacycle['time.sample.end'] + dacycle['time.sample.window'] = lag + dacycle['time.sample.stamp'] = "%s_%s" % (startdate.strftime("%Y%m%d%H"), enddate.strftime("%Y%m%d%H")) + + logging.info("New simulation interval set : ") + logging.info(" start date : %s " % startdate.strftime('%F %H:%M')) + logging.info(" end date : %s " % enddate.strftime('%F %H:%M')) + logging.info(" file stamp: %s " % dacycle['time.sample.stamp']) + + + # Implement something that writes the ensemble member parameter info to file, or manipulates them further into the + # type of info needed in your transport model + + statevector.write_members_to_file(lag, dacycle['dir.input'], obsoperator=obsoperator) + + for sample in samples: + + 
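+        # For each sample object (e.g. the column observations), this
+        # loop reads the observations of the current window, attaches
+        # the model-data mismatch, and writes a
+        # <samples_type>_coordinates_<time.sample.stamp>.nc file that
+        # the WRF-Chem observation operator later reads for sampling.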
sample.setup(dacycle) + + # Read observations + perform observation selection + sample.add_observations() + + # Add model-data mismatch to all samples, this *might* use output from the ensemble in the future?? + sample.add_model_data_mismatch(dacycle.dasystem['obs.sites.rc']) + + sampling_coords_file = os.path.join(dacycle['dir.input'], sample.get_samples_type()+'_coordinates_%s.nc' % dacycle['time.sample.stamp']) + logging.debug('sampling_coords_file = %s' % sampling_coords_file) + sample.write_sample_coords(sampling_coords_file) + + # Write filename to dacycle, and to output collection list + dacycle['ObsOperator.inputfile.'+sample.get_samples_type()] = sampling_coords_file + + del sample + + # Run the observation operator + obsoperator.run_forecast_model(samples) + + + # Read forecast model samples that were written to NetCDF files by each member. Add them to the exisiting + # Observation object for each sample loop. This data fill be written to file in the output folder for each sample cycle. + + # We retrieve all model samples from one output file written by the ObsOperator. If the ObsOperator creates + # one file per member, some logic needs to be included to merge all files!!! + + for i in range(len(samples)): + if os.path.exists(dacycle['ObsOperator.inputfile.'+samples[i].get_samples_type()]): + samples[i].add_simulations(obsoperator.simulated_file[i]) + logging.debug("Simulated values read, continuing pipeline") + else: + logging.warning("No simulations added, because sample input file does not exist") + + # Now add the observations that need to be assimilated to the statevector. + # Note that obs will only be added to the statevector if either this is the first step (restart=False), or lag==nlag + # This is to make sure that the first step of the system uses all observations available, while the subsequent + # steps only optimize against the data at the front (lag==nlag) of the filter. 
This way, each observation is used only + # (and at least) once # in the assimilation + + + if not advance: + logging.debug('time.restart = %s' %dacycle['time.restart']) + if dacycle['time.restart'] == False or lag == int(dacycle['time.nlag']) - 1: + statevector.obs_to_assimilate += (copy.deepcopy(samples),) + for sample in samples: + statevector.nobs += sample.getlength() + del sample + logging.info('nobs = %i' %statevector.nobs) + logging.debug("Added samples from the observation operator to the assimilated obs list in the statevector") + + else: + statevector.obs_to_assimilate += (None,) + + + +def invert(dacycle, statevector, optimizer): + """ Perform the inverse calculation """ + logging.info(header + "starting invert" + footer) + + if statevector.nobs <= 1: #== 0: + logging.info('List with observations to assimilate is empty, skipping invert step and continuing without statevector update...') + return + + dims = (int(dacycle['time.nlag']), + int(dacycle['da.optimizer.nmembers']), + int(dacycle.dasystem['nparameters']), + statevector.nobs) + + if 'opt.algorithm' not in dacycle.dasystem: + logging.info("There was no minimum least squares algorithm specified in the DA System rc file (key : opt.algorithm)") + logging.info("...using serial algorithm as default...") + optimizer.set_algorithm('Serial') + elif dacycle.dasystem['opt.algorithm'] == 'serial': + logging.info("Using the serial minimum least squares algorithm to solve ENKF equations") + optimizer.set_algorithm('Serial') + elif dacycle.dasystem['opt.algorithm'] == 'bulk': + logging.info("Using the bulk minimum least squares algorithm to solve ENKF equations") + optimizer.set_algorithm('Bulk') + + optimizer.setup(dims) + optimizer.state_to_matrix(statevector) + + diagnostics_file = os.path.join(dacycle['dir.output'], 'optimizer.%s.nc' % dacycle['time.start'].strftime('%Y%m%d')) + + optimizer.write_diagnostics(diagnostics_file, 'prior') + optimizer.set_localization(dacycle['da.system.localization']) + + if optimizer.algorithm == 'Serial': + optimizer.serial_minimum_least_squares() + else: + optimizer.bulk_minimum_least_squares() + + optimizer.matrix_to_state(statevector) + optimizer.write_diagnostics(diagnostics_file, 'optimized') + + + +def advance(dacycle, samples, statevector, obsoperator): + """ Advance the filter state to the next step """ + + # This is the advance of the modeled CO2 state. 
Optionally, routines can be added to advance the state vector (mean+covariance) + + # Then, restore model state from the start of the filter + logging.info(header + "starting advance" + footer) + logging.info("Sampling model will be run over 1 cycle") + + obsoperator.get_initial_data() + + sample_step(dacycle, samples, statevector, obsoperator, 0, True) + + dacycle.restart_filelist.extend(obsoperator.restart_filelist) + dacycle.output_filelist.extend(obsoperator.output_filelist) + logging.debug("Appended ObsOperator restart and output file lists to dacycle for collection ") + + for sample in samples: + dacycle.output_filelist.append(dacycle['ObsOperator.inputfile.'+sample.get_samples_type()]) + logging.debug("Appended Observation filename to dacycle for collection: %s"%(dacycle['ObsOperator.inputfile.'+sample.get_samples_type()])) + + sampling_coords_file = os.path.join(dacycle['dir.input'], sample.get_samples_type()+'_coordinates_%s.nc' % dacycle['time.sample.stamp']) + if os.path.exists(sampling_coords_file): + outfile = os.path.join(dacycle['dir.output'], sample.get_samples_type() + '_auxiliary_%s.nc' % dacycle['time.sample.stamp']) + sample.write_sample_auxiliary(outfile) + else: logging.warning("Sample auxiliary output not written, because input file does not exist (no samples found in obspack)") + del sample + + + +def save_and_submit(dacycle, statevector): + """ Save the model state and submit the next job """ + logging.info(header + "starting save_and_submit" + footer) + + filename = os.path.join(dacycle['dir.restart'], 'savestate_%s.nc' % dacycle['time.start'].strftime('%Y%m%d')) + statevector.write_to_file(filename, 'opt') + + dacycle.output_filelist.append(filename) + dacycle.finalize() diff --git a/da/rc/wrfchem/dasystem_wrfchem.rc b/da/rc/wrfchem/dasystem_wrfchem.rc new file mode 100644 index 0000000000000000000000000000000000000000..fc85dcf7625b42d1b5d79cb2c66fc5370aa29b21 --- /dev/null +++ b/da/rc/wrfchem/dasystem_wrfchem.rc @@ -0,0 +1,77 @@ +! CarbonTracker Data Assimilation Shell (CTDAS) Copyright (C) 2017 Wouter Peters. +! Users are recommended to contact the developers (wouter.peters@wur.nl) to receive +! updates of the code. See also: http://www.carbontracker.eu. +! +! This program is free software: you can redistribute it and/or modify it under the +! terms of the GNU General Public License as published by the Free Software Foundation, +! version 3. This program is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +! FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. +! +! You should have received a copy of the GNU General Public License along with this +! program. If not, see <http://www.gnu.org/licenses/>. + +!!! Info for the CarbonTracker data assimilation system + +!datadir : /projects/0/ctdas/input/ctdas_2012/ + + +! For ObsPack +! For WRF, this is only a dummy until sampling point observations is +! implemented +!obspack.input.id : obspack_co2_1_GLOBALVIEWplus_v2.1_2016-09-02 +!obspack.input.dir : ${datadir}/obspacks/${obspack.input.id} +!obs.sites.rc : ${obspack.input.dir}/summary/sites_weights_Dec2016.rc +obs.sites.rc : dummy + +! For column data +! sample.in.ctdas is obsolete, should be removed +sample.in.ctdas : False +! 
obs.column.rc here only contains "obs.rejection.threshold : <N>" +obs.column.rc : /path/to/xco2_weights.rc +obs.column.input.dir : /dir/with/column/observation/files +obs.column.ncfile : column_observation_file_pattern_<YYYYMMDD>.nc + +! Selection criteria based on variables stored in data file +obs.column.selection.variables : quality_flag +obs.column.selection.criteria : == 0 + +! Any value but "parametrization" defaults to model uncertainty of 1 +! (in the units of the observations, so e.g. 1 ppm for CO2) +mdm.calculation : default + +! Covariances +! Scaling factor factor sigma, no covariances +! A value of 1 means 100% prior flux uncertainty. +! For one flux process: +! sigma_scale : 1 +! For multiple flux processes (see n_emis_proc below): +sigma_scale_1 : 0.3 +sigma_scale_2 : 0.5 +sigma_scale_3 : 0.5 + +! Uncomment sigma_offset for optimizing a constant offset of WRF initial +! and boundary conditions. Unit as in observations, so e.g. ppm for CO2. +! This is only allowed in runs with only one cycle, because in +! subsequent cycles, the initial conditions couldn't be optimized in a +! consistent way. +! sigma_offset : 0.5 + +! Seed for numpy random number generator? +!random.seed : 4385 +!random.seed.init: ${datadir}/randomseedinit.pickle +random.seed.init: /path/to/randomseedinit.pickle + + +!!!!!!!!!!!!!!!!!! STATEVECTOR !!!!!!!!!!!!!!!!!! +! nparameters = number of flux parameters plus 1 in case an offset is +! optimized +! Example: +! For 25 state vector elements and 3 processes (e.g. GPP, respiration, +! anthropogenic emissions), set nparameters to 75. If, in addition, an +! initial/boundary condition offset is optimized by setting +! sigma_offset, set nparameters to 76. + +nparameters : 75 +n_emis_proc : 3 +regionsfile : /path/to/regionsfile.nc diff --git a/da/rc/wrfchem/obsoper_wrfchem.rc b/da/rc/wrfchem/obsoper_wrfchem.rc new file mode 100644 index 0000000000000000000000000000000000000000..d0cb1cc9be071069a7989acd8b5a1d25030f3d9c --- /dev/null +++ b/da/rc/wrfchem/obsoper_wrfchem.rc @@ -0,0 +1,141 @@ +! CarbonTracker Data Assimilation Shell (CTDAS) Copyright (C) 2017 Wouter Peters. +! Users are recommended to contact the developers (wouter.peters@wur.nl) to receive +! updates of the code. See also: http://www.carbontracker.eu. +! +! This program is free software: you can redistribute it and/or modify it under the +! terms of the GNU General Public License as published by the Free Software Foundation, +! version 3. This program is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +! FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. +! +! You should have received a copy of the GNU General Public License along with this +! program. If not, see <http://www.gnu.org/licenses/>. + + +! source_dir +! Directory like WRF/run/ containing everything that is necessary to run WRF, +! having already run real.exe: +! - wrf.exe +! - namelist.input +! - wrfbdy_d01 (already containing values for each ensemble member) +! - wrfinput_dXX (already containing values for each ensemble member) +! - wrfchemi_* (containing at the prior fluxes in field flux_optim and +! flux_fix (if defined), and fields for ensemble members) + +source_dir : /dir/like/WRF/run/with/all/inputs/ + +! run_dir +! The directory where all WRF runs will be performed. +! Files from source_dir will be copied/linked here as needed. +run_dir : /dir/for/running/WRF/within/CTDAS/ + +! 
tracer and flux to optimize (as in ensemble tracer definitions in WRF, but +! without the _NNN suffix for the member number) +tracer_optim : CO2 +flux_optim : E_CO2 + +! Fixed flux to add to fluxes to optimize (comment if there are none) +! flux_fix : E_FIX + +! obs.column.footprint_samples_dim (do not change) +! For high resolution simulations, the satellite footprint can be bigger +! than one model pixel. If this option is 1, the model atmosphere is +! sampled at the center coordinates of the observation. If this option +! is > 1, the model atmosphere is sampled at multiple points within the +! footprint corners and the result is averaged. The sampling points are +! aranged as an n x n rectangle. +! Example: if obs.column.footprint_samples_dim=3, the model is sampled +! at 3*3=9 points in the footprint area. +! Must be set to 1. Code for larger values is in preparation. +obs.column.footprint_samples_dim : 1 + + +! Option to not run WRF in case output files are already present +! for the current time period. This is a development option that +! is useful for testing something in CTDAS after the transport +! model is run. +! True/False (default if not set: False) +! no_rerun_wrf : False + +! Option to save a copy of wrfout*, wrfrst* and rsl* files each time +! WRF is run. This is another option for development/debugging and not +! for production. The purpose is to rerun CTDAS without actually running the +! transport model, because WRF usually takes a long time to run. It can also +! be used to check wrfout contents of a cycle, because they get overwritten +! by subsequent steps. I use it to debug the optimization step without +! rerunning wrf for the da step for each try. So far this recovery step +! needs to be done manually, though. So for a new run I copy the wrfout +! files back to the WRF run directory, and set no_rerun_wrf (above) to +! True. Then I start CTDAS and it will see that the wrfout for the period +! already exist and skip running WRF. Note that this will also happen in +! the advance step, because wrfout files for its period already exist. If +! you want CTDAS to do a proper advance step, you need to wait for the +! optimization to start, then while it is still running move the wrfout files +! out of the run directory again. Then when the advance step starts, it will +! rerun WRF because it doesn't find wrfout files. +save_wrf_step : False + +! WRF-Chem and sampling its output are implemented as subprocesses (job +! steps). All remaining options in this file deal with starting them. +! These options and definitions could be made part of the platform +! object in a future update. However, in the original CTDAS, running the +! transport model TM5 is also controlled by TM5-specific options. +! The options here work with the SLURM scheduler on the Dutch national +! supercomputer cartesius. + +! The processes are started with the python module subprocess, which +! takes a shell command as option. The shell command for WRF is: +! run_command + " " (+ run_ntask_opt + ntasks) (+ " " + run_t_opt + run_t) + " ./wrf.exe >& log/wrf.log" +! The shell command for python scripts is: +! run_command_1task + " " (+ run_mem_opt + <mem_str> + mem_unit_str_MB) (+ " " + run_t_opt + run_t) + " python <python skript and options>" +! Variables enclosed by "<>" are computed online. The others are set +! below. Options in brackets are only appended to the command string if +! they are not empty in the settings below. + +! Command to start external process +run_command : srun + +! 
Some python scripts are launched that require only 1 task. +! For this case, the scheduler SLURM requires to specifically request +! -N1 because otherwise slurm apparently sometimes tries to allocate +! one task to more than one node. See here: +! https://stackoverflow.com/questions/24056961/running-slurm-script-with-multiple-nodes-launch-job-steps-with-1-task +run_command_1task : srun -n1 -N1 + +! Define how to pass options to processes started with run_command or +! run_command_1task +! Name of the option of run_command that defines requested memory +! (total, not per cpu). If this is empty, memory limit will not be +! specified +run_mem_opt : --mem= + +! The operator computes memory in MB, might have to add a string +mem_unit_str_MB : M + +! Name of the option of run_command that defines requested wallclock time +! If this is empty, wallclock limit will not be specified. +run_t_opt : -t + +! Name of the option of run_command that defines number of requested tasks +! If this is empty, number of tasks will not be specified +run_ntask_opt : -n + +! Time that a subprocess should request (this is probably not important +! because it's run within the same resource allocation as the parent - +! not tested working without though) +run_t : 0:20:00 + +! Number of tasks that can run on one node +! This is used for computing how much memory to allocate per process of +! calls wrfout_sampler. +cpus_per_node : 24 + +! Available memory per node +! mem per node: On cartesius technically 64, but at least in interactive +! shells on cartesius a bit of a buffer is needed. The value here was +! determined with calls like "srun --mem=61G -n1 sleep 1" inside an +! interactive shell. +mem_per_node : 60 + + + diff --git a/da/statevectors/statevector_wrfchem.py b/da/statevectors/statevector_wrfchem.py new file mode 100644 index 0000000000000000000000000000000000000000..6236b22edf39732e2f25e62bf51c24967471ca9c --- /dev/null +++ b/da/statevectors/statevector_wrfchem.py @@ -0,0 +1,332 @@ +"""CarbonTracker Data Assimilation Shell (CTDAS) Copyright (C) 2017 Wouter Peters. +Users are recommended to contact the developers (wouter.peters@wur.nl) to receive +updates of the code. See also: http://www.carbontracker.eu. + +This program is free software: you can redistribute it and/or modify it under the +terms of the GNU General Public License as published by the Free Software Foundation, +version 3. This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with this +program. If not, see <http://www.gnu.org/licenses/>.""" +#!/usr/bin/env python +# ct_statevector_tools.py + +""" +.. module:: statevector +.. moduleauthor:: Friedemann Reum + +Revision History: +File created 2019. + +This module is an extension to any CTDAS statevector that overwrites +write_members_to_file to produce output in the format required by WRF-Chem. +Thus, it can inherit from any other +state vector. In this reference implementation, it inherits from +the baseclass statevector. + * :class:`~da.baseclasses.statevector.StateVector` + * :class:`~da.baseclasses.statevector.EnsembleMember` + +As usual, specific implementations of StateVector objects are done through inheritance form these baseclasses. An example of designing +your own baseclass StateVector we refer to :ref:`tut_chapter5`. + +.. 
autoclass:: da.baseclasses.statevector.StateVector + +.. autoclass:: da.baseclasses.statevector.EnsembleMember + +""" + +import os +import logging +import numpy as np +from datetime import timedelta +import da.tools.io4 as io +from da.statevectors.statevector_baseclass import StateVector, EnsembleMember + +identifier = 'Statevector for WRFChem' +version = '0.0' + +################### Begin Class WRFChemStateVector ################### +class WRFChemStateVector(StateVector): + """ + This module is an extension to any CTDAS statevector that overwrites + write_members_to_file to produce output in the format required by WRF-Chem. + Thus, it can inherit from any other state vector. In the current + implementation, it inherits from CO2GriddedStateVector. + + .. automethod:: da.baseclasses.statevector.StateVector.write_members_to_file + + """ + + def __init__(self): + self.ID = identifier + self.version = version + logging.info('Statevector object initialized: %s (%s)' % + (self.ID, self.version)) + + def setup(self, dacycle): + """ + Copied from baseclasses.statevector and + - Replaced transcom region with a dummy. + - Added do_offset + - Added overlaying processes + """ + + self.nlag = int(dacycle['time.nlag']) + self.nmembers = int(dacycle['da.optimizer.nmembers']) + self.nparams = int(dacycle.dasystem['nparameters']) + self.do_offset = "sigma_offset" in list(dacycle.dasystem.keys()) + # >>> Overlaying processes: + self.n_emis_proc = int(dacycle.dasystem['n_emis_proc']) + # Get number of flux parameters per process + nparams_proc = float(self.nparams - int(self.do_offset)) / float(self.n_emis_proc) + if int(nparams_proc)==nparams_proc: + self.nparams_proc = int(nparams_proc) + else: + raise ValueError("Number of emission processes doesn't divide number of flux parameters") + # <<< Overlaying processes + + self.nobs = 0 + + self.obs_to_assimilate = () # empty containter to hold observations to assimilate later on + + # These list objects hold the data for each time step of lag in the system. Note that the ensembles for each time step consist + # of lists of EnsembleMember objects, we define member 0 as the mean of the distribution and n=1,...,nmembers as the spread. + + self.ensemble_members = list(range(self.nlag)) + + for n in range(self.nlag): + self.ensemble_members[n] = [] + + + # This specifies the file to read with the gridded mask at 1x1 degrees. Each gridbox holds a number that specifies the parametermember + # that maps onto it. From this map, a dictionary is created that allows a reverse look-up so that we can map parameters to a grid. 
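+        # Example, consistent with the sample dasystem rc file shipped in
+        # this package: a regions file whose 'regions' field runs from 1
+        # to 25, combined with n_emis_proc = 3 and no offset, requires
+        # nparameters = 3*25 = 75 (76 if sigma_offset is set).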
+ + mapfile = os.path.join(dacycle.dasystem['regionsfile']) + ncf = io.ct_read(mapfile, 'read') + self.gridmap = ncf.get_variable('regions').astype(int) + # >>> edit freum: Not using transcom regions mapping in regional setup with wrfchem + #self.tcmap = ncf.get_variable('transcom_regions') + # <<< edit freum + ncf.close() + + #logging.debug("A TransCom map on 1x1 degree was read from file %s" % dacycle.dasystem['regionsfile']) + #logging.debug("A parameter map on 1x1 degree was read from file %s" % dacycle.dasystem['regionsfile']) + logging.debug("A parameter map was read from file %s" % dacycle.dasystem['regionsfile']) + + # Create a dictionary for state <-> gridded map conversions + + nparams = self.gridmap.max() + + # >>> do_offset + # Check that nparams is compatible with the one + # given in dasystem rc-file + + # if nparams + int(self.do_offset) != self.nparams: + if nparams*self.n_emis_proc + int(self.do_offset) != self.nparams: # overlaying parameters + raise ValueError("Number of parameters in regionsfile incompatible with nparameters in dasystem-rcfile.") + # <<< do_offset + + self.griddict = {} + for r in range(1, int(nparams) + 1): + sel = np.nonzero(self.gridmap.flatten() == r) + if len(sel[0]) > 0: + self.griddict[r] = sel + + logging.debug("A dictionary to map grids to states and vice versa was created") + + # Create a matrix for state <-> TransCom conversions + # >>> edit freum + # Regional simulations (wrfchem): only dummy tcmatrix. Would only be used in analysis pipeline, which I don't run for wrfchem. + + self.tcmatrix = np.zeros((self.nparams, 23), 'float') + + # for r in range(1, self.nparams + 1): + # sel = np.nonzero(self.gridmap.flatten() == r) + # if len(sel[0]) < 1: + # continue + # else: + # n_tc = set(self.tcmap.flatten().take(sel[0])) + # if len(n_tc) > 1: + # logging.error("Parameter %d seems to map to multiple TransCom regions (%s), I do not know how to handle this" % (r, n_tc)) + # raise ValueError + # self.tcmatrix[r - 1, int(n_tc.pop()) - 1] = 1.0 + + # logging.debug("A matrix to map states to TransCom regions and vice versa was created") + logging.debug("A dummy matrix to map states to TransCom regions and vice versa was created") + # <<< edit freum + + # Create a mask for species/unknowns + + self.make_species_mask() + + def make_new_ensemble(self, lag, covariancematrix=None): + """ + Copied from baseclass statevector, but switched to + fresh initialization of each new ensemble (instead of + with previous lags) and handling of self.do_offset. + + """ + + if covariancematrix is None: + covariancematrix = np.identity(self.nparams) + + # Make a cholesky decomposition of the covariance matrix + + + try: + _, s, _ = np.linalg.svd(covariancematrix) + except: + s = np.linalg.svd(covariancematrix, full_matrices=1, compute_uv=0) #Cartesius fix + dof = np.sum(s) ** 2 / sum(s ** 2) + C = np.linalg.cholesky(covariancematrix) + + logging.debug('Cholesky decomposition has succeeded ') + logging.info('Appr. degrees of freedom in covariance matrix is %s' % (int(dof))) + + + # Create mean values + + newmean = np.ones(self.nparams, float) # standard value for a new time step is 1.0 + + # >>> do_offset + if self.do_offset: + newmean[-1] = 0. 
+ # <<< do_offset + + # If this is not the start of the filter, average previous two optimized steps into the mix + + # freum vvvv + + # Switch off the initialization with previously optimized state vector + # for my project + #if lag == self.nlag - 1 and self.nlag >= 3: + # newmean += self.ensemble_members[lag - 1][0].param_values + \ + # self.ensemble_members[lag - 2][0].param_values + # newmean = newmean / 3.0 + + # freum ^^^^ + + # Create the first ensemble member with a deviation of 0.0 and add to list + + newmember = EnsembleMember(0) + newmember.param_values = newmean.flatten() # no deviations + self.ensemble_members[lag].append(newmember) + + # Create members 1:nmembers and add to ensemble_members list + + for member in range(1, self.nmembers): + rands = np.random.randn(self.nparams) + + newmember = EnsembleMember(member) + newmember.param_values = np.dot(C, rands) + newmean + self.ensemble_members[lag].append(newmember) + + logging.debug('%d new ensemble members were added to the state vector # %d' % (self.nmembers, (lag + 1))) + + + def write_members_to_file(self, lag, outdir, endswith='.nc', obsoperator=None): + """ + :param: lag: Which lag step of the filter to write, must lie in range [1,...,nlag] + :param: outdir: Directory where to write files + :param: obsoperator: observationoperator object + :param: endswith: Optional label to add to the filename, default is simply .nc + :rtype: None + + Write ensemble member information to file for transport model. + If observation operator is None or its ID is anything except + "wrfchem", the method of the baseclass is called. If the + observation operator ID is "wrfchem", the method of the + observation operator is called. The reason to implement it + there was originally to avoid teaching the statevector + object about WRF domains. + """ + + # Decide what to call based on obsoperator + if obsoperator is None: + # logging.info("Calling statevector._write_members_to_file_standard") + # self._write_members_to_file_standard(lag, outdir, endswith) + logging.info("Calling super().write_members_to_file") + super().write_members_to_file(lag, outdir, endswith, obsoperator) + else: + if obsoperator.ID=='wrfchem': + logging.info("Calling obsoperator.write_members_to_file") + obsoperator.write_members_to_file(lag, outdir, self) + else: # call the standard method + # logging.info("Calling statevector._write_members_to_file_standard") + # self._write_members_to_file_standard(lag, outdir, endswith) + logging.info("Calling super().write_members_to_file") + super().write_members_to_file(lag, outdir, endswith, obsoperator) + + + # def _write_members_to_file_standard(self, lag, outdir, endswith='.nc'): + # """ + # Copy of baseclass write_members_to_file. Documented in baseclass + # statevector. 
+ # Obsolete, replaced with calling super().write_members_to_file() + # """ + # + # members = self.ensemble_members[lag] + # + # for mem in members: + # filename = os.path.join(outdir, 'parameters.%03d%s' % (mem.membernumber, endswith)) + # ncf = io.CT_CDF(filename, method='create') + # dimparams = ncf.add_params_dim(self.nparams) + # dimgrid = ncf.add_latlon_dim() + # + # data = mem.param_values + # + # savedict = io.std_savedict.copy() + # savedict['name'] = "parametervalues" + # savedict['long_name'] = "parameter_values_for_member_%d" % mem.membernumber + # savedict['units'] = "unitless" + # savedict['dims'] = dimparams + # savedict['values'] = data + # savedict['comment'] = 'These are parameter values to use for member %d' % mem.membernumber + # ncf.add_data(savedict) + # + # griddata = self.vector2grid(vectordata=data) + # + # savedict = io.std_savedict.copy() + # savedict['name'] = "parametermap" + # savedict['long_name'] = "parametermap_for_member_%d" % mem.membernumber + # savedict['units'] = "unitless" + # savedict['dims'] = dimgrid + # savedict['values'] = griddata.tolist() + # savedict['comment'] = 'These are gridded parameter values to use for member %d' % mem.membernumber + # ncf.add_data(savedict) + # + # ncf.close() + # + # logging.debug('Successfully wrote data from ensemble member %d to file (%s) ' % (mem.membernumber, filename)) + + def get_covariance(self, date, dacycle): + """ + Return a prior statevector covariance matrix + Here, the covariance matrix is diagonal and the values + are the squares of the "sigma" parameters in the + dasystem rc file. + """ + + if self.n_emis_proc==1: + sigma = float(dacycle.dasystem['sigma_scale']) + covariancematrix = np.identity(self.nparams) * sigma**2 + else: + # overlaying parameters: + covariancematrix = np.identity(self.nparams) + for nproc in range(1, self.n_emis_proc+1): + sigma_nproc = float(dacycle.dasystem['sigma_scale_' + str(nproc)]) + ind = list(range((nproc-1)*self.nparams_proc, nproc*self.nparams_proc)) + covariancematrix[ind, ind] = sigma_nproc**2 + if self.do_offset: + covariancematrix[-1, -1] = float(dacycle.dasystem["sigma_offset"])**2 + + + return covariancematrix + +################### End Class StateVector ################### + +if __name__ == "__main__": + pass + diff --git a/da/tools/wrfchem/__init__.py b/da/tools/wrfchem/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/da/tools/wrfchem/utilities.py b/da/tools/wrfchem/utilities.py new file mode 100644 index 0000000000000000000000000000000000000000..157149ff00864e720276f86875b8b0ee96e154d9 --- /dev/null +++ b/da/tools/wrfchem/utilities.py @@ -0,0 +1,319 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" + +Created on Wed Sep 18 16:03:02 2019 + +@author: friedemann +""" + +import os +import glob +import logging +import subprocess +import tempfile +import copy +import netCDF4 as nc +import numpy as np + +class utilities(object): + """ + Collection of utilities for wrfchem observation operator + that do not depend on other CTDAS modules + """ + + def __init__(self): + pass + + @staticmethod + def get_slicing_ids(N, nproc, nprocs): + """ + Purpose + ------- + For parallel processing, figure out which samples to process + by this process. + + Parameters + ---------- + N : int + Length to slice + nproc : int + id of this process (0... nprocs-1) + nprocs : int + Number of processes that work on the task. 
+ + Output + ------ + Slicing indices id0, id1 + Usage + ----- + ..code-block:: python + + id0, id1 = get_slicing_ids(N, nproc, nprocs) + field[id0:id1, ...] + """ + + f0 = float(nproc)/float(nprocs) + id0 = int(np.floor(f0*N)) + + f1 = float(nproc+1)/float(nprocs) + id1 = int(np.floor(f1*N)) + + if id0==id1: + raise ValueError("id0==id1. Probably too many processes.") + return id0, id1 + + + + @classmethod + def cat_ncfiles(cls, path, in_arg, cat_dim, out_file, in_pattern=False, rm_original=True): + """ + Combine output of all processes into 1 file + If in_pattern, a pattern is provided instead of a file list. + This has the advantage that it can be interpreted by the shell, + because there are problems with long argument lists. + + This calls ncrcat from the nco library. If nco is not available, + rewrite this function. Note: I first tried to do this with + "cdo cat", but it messed up sounding_id + (see https://code.mpimet.mpg.de/boards/1/topics/908) + """ + + # To preserve dimension names, we start from one of the existing + # slice files instead of a new file. + + # Do this in path to avoid long command line arguments and history + # entries in outfile. + cwd = os.getcwd() + os.chdir(path) + + if in_pattern: + if not isinstance(in_arg, str): + raise TypeError("in_arg must be a string if in_pattern is True.") + file_pattern = in_arg + in_files = glob.glob(file_pattern) + else: + if isinstance(in_arg, list): + raise TypeError("in_arg must be a list if in_pattern is False.") + in_files = in_arg + + if len(in_files) == 0: + logging.error("Nothing to do.") + # Change back to previous directory + os.chdir(cwd) + return + + # Sorting is important! + in_files.sort() + + # ncrcat needs total number of soundings, count + Nobs = 0 + for f in in_files: + ncf = nc.Dataset(f, "r") + Nobs += len(ncf.dimensions[cat_dim]) + ncf.close() + + # Cat files + cmd_ = "ncrcat -h -O -d " + cat_dim + ",0,%d"%(Nobs-1) + if in_pattern: + cmd = cmd_ + " " + file_pattern + " " + out_file + # If PIPE is used here, it gets clogged, and the process + # stops without error message (see also + # https://thraxil.org/users/anders/posts/2008/03/13/Subprocess-Hanging-PIPE-is-your-enemy/) + # Hence, piping the output to a temporary file. + proc = subprocess.Popen(cmd, shell=True, + stdout=tempfile.TemporaryFile(), + stderr=tempfile.TemporaryFile()) + else: + cmdsplt = cmd_.split() + in_files + [out_file] + proc = subprocess.Popen(cmdsplt, stdout=tempfile.TemporaryFile(), stderr=tempfile.TemporaryFile()) + cmd = " ".join(cmdsplt) + + proc.wait() + + # This is probably useless since the output is piped to a + # tempfile. + retcode = cls.check_out_err(proc) + + if retcode != 0: + msg = "Something went wrong in the sampling. Command: " + cmd + logging.error(msg) + raise OSError(msg) + + # Delete slice files + if rm_original: + logging.info("Deleting slice files.") + for f in in_files: + os.remove(f) + + logging.info("Sampled WRF output written to file.") + + # Change back to previous directory + os.chdir(cwd) + + + @staticmethod + def check_out_err(process): + """Displays stdout and stderr, returns returncode of the + process. + """ + + # Get process messages + out, err = process.communicate() + + # Print output + def to_str(str_or_bytestr): + """If argument is of type str, return argument. 
If + argument is of type bytes, return decoded str""" + if isinstance(str_or_bytestr, str): + return str_or_bytestr + elif isinstance(str_or_bytestr, bytes): + return str(str_or_bytestr, 'utf-8') + else: + msg = "str_or_bytestr is " + str(type(str_or_bytestr)) + \ + ", should be str or bytestr." + raise TypeError(msg) + + logging.debug("Subprocess output:") + if out is None: + logging.debug("No output.") + elif isinstance(out, list): + for line in out: + logging.debug(to_str(line.rstrip())) + else: + logging.debug(to_str(out.rstrip())) + + # Handle errors + if process.returncode != 0: + logging.error("subprocess error") + logging.error("Returncode: %s", str(process.returncode)) + logging.error("Message, if any:") + if not err is None: + for line in err: + logging.error(line.rstrip()) + + return process.returncode + + @classmethod + def get_index_groups(cls, *args): + """ + Input: + numpy arrays with 1 dimension or lists, all same length + Output: + Dictionary of lists of indices that have the same + combination of input values. + """ + + try: + # If pandas is available, it makes a pandas DataFrame and + # uses its groupby-function. + import pandas as pd + + args_array = np.array(args).transpose() + df = pd.DataFrame(args_array) + groups = df.groupby(list(range(len(args)))).indices + + except ImportError: + # If pandas is not available, use an own implementation of groupby. + # Recursive implementation. It's fast. + args_array = np.array(args).transpose() + groups = cls._group(args_array) + + return groups + + @classmethod + def _group(cls, a): + """ + Reimplementation of pandas.DataFrame.groupby.indices because + py 2.7 on cartesius isn't compatible with pandas. + Unlike the pandas function, this always uses all columns of the + input array. + + Parameters + ---------- + a : numpy.ndarray (2D) + Array of indices. Each row is a combination of indices. + + Returns + ------- + groups : dict + The keys are the unique combinations of indices (rows of a), + the values are the indices of the rows of a equal the key. + """ + + # This is a recursive function: It makes groups according to the + # first columnm, then calls itself with the remaining columns. + # Some index juggling. + + # Group according to first column + UI = list(set(a[:, 0])) + groups0 = dict() + for ui in UI: + # Key must be a tuple + groups0[(ui, )] = [i for i, x in enumerate(a[:, 0]) if x == ui] + + if a.shape[1] == 1: + # If the array only has one column, we're done + return groups0 + else: + # If the array has more than one column, we group those. + groups = dict() + for ui in UI: + # Group according to the remaining columns + subgroups_ui = cls._group(a[groups0[(ui, )], 1:]) + # Now the index juggling: Add the keys together and + # locate values in the original array. + for key in list(subgroups_ui.keys()): + # Get indices of bigger array + subgroups_ui[key] = [groups0[(ui, )][n] for n in subgroups_ui[key]] + # Add the keys together + groups[(ui, ) + key] = subgroups_ui[key] + + return groups + + + @staticmethod + def apply_by_group(func, array, groups, grouped_args=None, *args, **kwargs): + """ + Apply function 'func' to a numpy array by groups of indices. + 'groups' can be a list of lists or a dictionary with lists as + values. + + If 'array' has more than 1 dimension, the indices in 'groups' + are for the first axis. + + If 'grouped_args' is not None, its members are added to + 'kwargs' after slicing. + + *args and **kwargs are passed through to 'func'. 
+ + Example: + apply_by_group(np.mean, np.array([0., 1., 2.]), [[0, 1], [2]]) + Output: + array([0.5, 2. ]) + """ + + shape_in = array.shape + shape_out = list(shape_in) + shape_out[0] = len(groups) + array_out = np.ndarray(shape_out, dtype=array.dtype) + + if type(groups) == list: + # Make a dictionary + groups = {n: groups[n] for n in range(len(groups))} + + if not grouped_args is None: + kwargs0 = copy.deepcopy(kwargs) + for n in range(len(groups)): + k = list(groups.keys())[n] + + # Add additional arguments that need to be grouped to kwargs + if not grouped_args is None: + kwargs = copy.deepcopy(kwargs0) + for ka, v in grouped_args.items(): + kwargs[ka] = v[groups[k], ...] + + array_out[n, ...] = np.apply_along_axis(func, 0, array[groups[k], ...], *args, **kwargs) + + return array_out + diff --git a/da/tools/wrfchem/wrfchem_helper.py b/da/tools/wrfchem/wrfchem_helper.py new file mode 100644 index 0000000000000000000000000000000000000000..b4b399294aaeb2a0e2f8418890d674fa12cb45d1 --- /dev/null +++ b/da/tools/wrfchem/wrfchem_helper.py @@ -0,0 +1,990 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Created on Mon Jul 22 15:03:02 2019 + +This class contains helper functions mainly for sampling WRF-Chem, but a +few are also needed by the WRF-Chem column observation operator. + +@author: friedemann +""" + +# Instructions for pylint: +# pylint: disable=too-many-instance-attributes +# pylint: disable=W0201 +# pylint: disable=C0301 +# pylint: disable=E1136 +# pylint: disable=E1101 + + +import os +import shutil +import re +import glob +import bisect +import copy +import numpy as np +import netCDF4 as nc +import datetime as dt +import wrf +import f90nml +import pickle + +# CTDAS modules +import da.tools.io4 as io +from da.tools.wrfchem.utilities import utilities + + +class WRFChemHelper(object): + """Contains helper functions for sampling WRF-Chem""" + def __init__(self, settings): + self.settings = settings + + def validate_settings(self, needed_items=[]): + """ + This is based on WRFChemOO._validate_rc + """ + + if len(needed_items)==0: + return + + for key in needed_items: + if key not in self.settings: + msg = "Missing a required value in settings: %s" % key + raise IOError(msg) + + @staticmethod + def read_namelist(dirname): + """Read run settings from namelist.input file in dirname""" + nml_file = os.path.join(dirname, "namelist.input") + namelist = f90nml.read(nml_file) + # Some values are lists when running with nested domains, but + # not when running with only one domain. Ensure here those + # values are always lists. + list_vars = ["e_we", + "e_sn", + "parent_id", + "parent_grid_ratio", + "i_parent_start", + "j_parent_start" + ] + for v in list_vars: + if not isinstance(namelist["domains"][v], list): + namelist["domains"][v] = [namelist["domains"][v]] + + return namelist + + @staticmethod + def get_pressure_boundaries_paxis(p_axis, p_surf): + """ + Arguments + --------- + p_axis (:class:`array-like`) + Pressure at mid points of layers + p_surf (:class:`numeric`) + Surface pressure + Output + ------ + Pressure at layer boundaries + """ + + pb = np.array([float("nan")]*(len(p_axis)+1)) + pb[0] = p_surf + + for nl in range(len(pb)-1): + pb[nl+1] = pb[nl] + 2*(p_axis[nl] - pb[nl]) + + return pb + + @staticmethod + def get_pressure_boundaries_znw(znw, p_surf, p_top): + """ + Arguments + --------- + ZNW (:class:`ndarray`) + Eta coordinates of z-staggered WRF grid. 
For each + observation (2D) + p_surf (:class:`ndarray`) + Surface pressure (1D) + p_top (:class:`ndarray`) + Top-of-atmosphere pressure (1D) + Output + ------ + Pressure at layer boundaries + + CAVEATS + ------- + Maybe I should rather use P_HYD? Well, Butler et al. 2018 + (https://www.geosci-model-dev-discuss.net/gmd-2018-342/) used + znu and surface pressure to compute "WRF midpoint layer + pressure". + + For WRF it would be more consistent to interpolate to levels. + See also comments in code. + """ + + return znw*(p_surf-p_top) + p_top + + + @staticmethod + def get_int_coefs(pb_ret, pb_mod, level_def): + """ + Computes a coefficients matrix to transfer a model profile onto + a retrieval pressure axis. + + If level_def=="layer_average", this assumes that profiles are + constant in each layer of the retrieval, bound by the pressure + boundaries pb_ret. In this case, the WRF model layer is treated + in the same way, and coefficients integrate over the assumed + constant model layers. This works with non-staggered WRF + variables (on "theta" points). However, this is actually not how + WRF is defined, and the implementation should be changed to + z-staggered variables. Details for this change are in a comment + at the beginning of the code. + + If level_def=="pressure_boundary" (IMPLEMENTATION IN PROGRESS), + assumes that profiles, kernel and pwf are defined at pressure + boundaries that don't have a thickness (this is how OCO-2 data + are defined, for example). In this case, the coefficients + linearly interpolate adjacent model level points. This is + incompatible with the treatment of WRF in the above-described + layer-average assumption, but is closer to how WRF is actually + defined. The exception is that pb_mod is still constructed and + non-staggered variables are not defined at psurf. This can only + be fixed by switching to z-staggered variables. + + In cases where retrieval surface pressure is higher than model + surface pressure, and in cases where retrieval top pressure is + lower than model top pressure, the model profile will be + extrapolated with constant tracer mixing ratios. In cases where + retrieval surface pressure is lower than model surface pressure, + and in cases where retrieval top pressure is higher than model + top pressure, only the parts of the model column that fall + within the retrieval presure boundaries are sampled. + + Arguments + --------- + pb_ret (:class:`array_like`) + Pressure boundaries of the retrieval column + pb_mod (:class:`array_like`) + Pressure boundaries of the model column + level_def (:class:`string`) + "layer_average" or "pressure_boundary" (IMPLEMENTATION IN + PROGRESS). Refers to the retrieval profile. + + Note 2021-09-13: Inspected code for pressure_boundary. + Should be correct. Interpolates linearly between two model + levels. + + + Returns + ------- + coefs (:class:`array_like`) + Integration coefficient matrix. Each row sums to 1. + + Usage + ----- + .. code-block:: python + + import numpy as np + pb_ret = np.linspace(900., 50., 5) + pb_mod = np.linspace(1013., 50., 7) + model_profile = 1. - np.linspace(0., 1., len(pb_mod)-1)**3 + coefs = get_int_coefs(pb_ret, pb_mod, "layer_average") + retrieval_profile = np.matmul(coefs, model_profile) + """ + + if level_def == "layer_average": + # This code assumes that WRF variables are constant in + # layers, but they are defined on levels. 
This can be seen + # for example by asking wrf.interplevel for the value of a + # variable that is defined on the mass grid ("theta points") + # at a pressure slightly higher than the pressure on its + # grid (wrf.getvar(ncf, "p")), it returns nan. So There is + # no extrapolation. There are no layers. There are only + # levels. + # In addition, this page here: + # https://www.openwfm.org/wiki/How_to_interpret_WRF_variables + # says that to find values at theta-points of a variable + # living on u-points, you interpolate linearly. That's the + # other way around from what I would do if I want to go from + # theta to staggered. + # WRF4.0 user guide: + # - ungrib can interpolate linearly in p or log p + # - real.exe comes with an extrap_type namelist option, that + # extrapolates constantly BELOW GROUND. + # This would mean the correct way would be to integrate over + # a piecewise-linear function. It also means that I really + # want the value at surface level, so I'd need the CO2 + # fields on the Z-staggered grid ("w-points")! Interpolate + # the vertical in p with wrf.interp1d, example: + # wrf.interp1d(np.array(rh.isel(south_north=1, west_east=0)), + # np.array(p.isel(south_north=1, west_east=0)), + # np.array(988, 970)) + # (wrf.interp1d gives the same results as wrf.interplevel, + # but the latter just doesn't want to work with single + # columns (32,1,1), it wants a dim>1 in the horizontal + # directions) + # So basically, I can keep using pb_ret and pb_mod, but it + # would be more accurate to do the piecewise-linear + # interpolation and the output matrix will have 1 more + # value in each dimension. + + # Calculate integration weights by weighting with layer + # thickness. This assumes that both axes are ordered + # psurf to ptop. + coefs = np.ndarray(shape=(len(pb_ret)-1, len(pb_mod)-1)) + coefs[:] = 0. + + # Extend the model pressure grid if retrieval encompasses + # more. + pb_mod_tmp = copy.deepcopy(pb_mod) + + # In case the retrieval pressure is higher than the model + # surface pressure, extend the lowest model layer. + if pb_mod_tmp[0] < pb_ret[0]: + pb_mod_tmp[0] = pb_ret[0] + + # In case the model doesn't extend as far as the retrieval, + # extend the upper model layer upwards. + if pb_mod_tmp[-1] > pb_ret[-1]: + pb_mod_tmp[-1] = pb_ret[-1] + + # For each retrieval layer, this loop computes which + # proportion falls into each model layer. + for nret in range(len(pb_ret)-1): + + # 1st model pressure boundary index = the one before the + # first boundary with lower pressure than high-pressure + # retrieval layer boundary. + model_lower = pb_mod_tmp < pb_ret[nret] + id_model_lower = model_lower.nonzero()[0] + id_min = id_model_lower[0]-1 + + # Last model pressure boundary index = the last one with + # higher pressure than low-pressure retrieval layer + # boundary. + model_higher = pb_mod_tmp > pb_ret[nret+1] + + id_model_higher = model_higher.nonzero()[0] + + if len(id_model_higher) == 0: + #id_max = id_min + raise ValueError("This shouldn't happen. Debug.") + else: + id_max = id_model_higher[-1] + + # By the way, in case there is no model level with + # higher pressure than the next retrieval level, + # id_max must be the same as id_min. 
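+
+                # Worked example (illustrative numbers, not from any real
+                # retrieval): a retrieval layer bounded by 900 and 700 hPa
+                # against model boundaries [1000, 850, 750, 650] hPa overlaps
+                # the three model layers by 50, 100 and 50 hPa, so after
+                # normalization the coefficients for this row are
+                # 0.25, 0.5 and 0.25.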
+
+                # For each model layer, find out how much of it makes up this
+                # retrieval layer
+                for nmod in range(id_min, id_max+1):
+                    if (nmod == id_min) & (nmod != id_max):
+                        # Part of 1st model layer that falls within
+                        # retrieval layer
+                        coefs[nret, nmod] = pb_ret[nret] - pb_mod_tmp[nmod+1]
+                    elif (nmod != id_min) & (nmod == id_max):
+                        # Part of last model layer that falls within
+                        # retrieval layer
+                        coefs[nret, nmod] = pb_mod_tmp[nmod] - pb_ret[nret+1]
+                    elif (nmod == id_min) & (nmod == id_max):
+                        # id_min = id_max, i.e. model layer encompasses
+                        # retrieval layer
+                        coefs[nret, nmod] = pb_ret[nret] - pb_ret[nret+1]
+                    else:
+                        # Retrieval layer encompasses model layer
+                        coefs[nret, nmod] = pb_mod_tmp[nmod] - pb_mod_tmp[nmod+1]
+
+                coefs[nret, :] = coefs[nret, :]/sum(coefs[nret, :])
+
+            # I tested the code with many cases, but I'm only 99.9% sure
+            # it works for all input. Hence, test here that the
+            # coefficients sum to 1 and dump the data if not.
+            sum_ = np.abs(coefs.sum(1) - 1)
+            if np.any(sum_ > 2.*np.finfo(sum_.dtype).eps):
+                dump = dict(pb_ret=pb_ret,
+                            pb_mod=pb_mod,
+                            level_def=level_def)
+                fp = "int_coefs_dump.pkl"
+                # pickle needs a binary file handle in python 3
+                with open(fp, "wb") as f:
+                    pickle.dump(dump, f, 0)
+
+                msg_fmt = "Integration coefficients don't sum to 1. Arguments dumped to: %s"
+                raise ValueError(msg_fmt % fp)
+
+        elif level_def == "pressure_boundary":
+            #msg = "level_def is pressure_boundary. Implementation not complete."
+            ##logging.error(msg)
+            #raise ValueError(msg)
+            # Note 2021-09-13: Inspected the code. Should be correct.
+
+            # Go back to pressure midpoints for model...
+            # Change this line to p_mod = pb_mod for z-staggered
+            # variables
+            p_mod = pb_mod[1:] - 0.5*np.diff(pb_mod)
+
+            coefs = np.ndarray(shape=(len(pb_ret), len(pb_mod)-1))
+            coefs[:] = 0.
+
+            # For each retrieval pressure level, compute linear
+            # interpolation coefficients
+            for nret in range(len(pb_ret)):
+                nmod_list = (p_mod < pb_ret[nret]).nonzero()[0]
+                if len(nmod_list) > 0:
+                    nmod = nmod_list[0] - 1
+                    if nmod == -1:
+                        # Constant extrapolation at surface
+                        nmod = 0
+                        coef = 1.
+                    else:
+                        # Normal case:
+                        coef = (pb_ret[nret]-p_mod[nmod+1])/(p_mod[nmod]-p_mod[nmod+1])
+                else:
+                    # Constant extrapolation at atmosphere top
+                    nmod = len(p_mod)-2
+                    coef = 0.
+
+                coefs[nret, nmod] = coef
+                coefs[nret, nmod+1] = 1.-coef
+
+        else:
+            msg = "Unknown level_def: " + level_def
+            raise ValueError(msg)
+
+        return coefs
+
+    @staticmethod
+    def get_pressure_weighting_function(pressure_boundaries, rule):
+        """
+        Compute the pressure weighting function according to 'rule'.
+        Valid rules are:
+        - simple (=layer thickness)
+        - connor2008 (not implemented)
+        """
+        if rule == 'simple':
+            pwf = np.abs(np.diff(pressure_boundaries)/np.ptp(pressure_boundaries))
+        else:
+            raise NotImplementedError("Rule %s not implemented" % rule)
+
+        return pwf
+
+    def locate_domain(self, lat, lon):
+        """
+        Input
+        -----
+        lat: Single values or lists or np.arrays
+        lon: Single values or lists or np.arrays
+
+        Output
+        ------
+        - xy-coordinates in the finest domain that contains the coordinates
+        - finest domain
+
+        NOTE
+        ----
+        self.open_wrf_location_files() must be run first
+        """
+
+        if not hasattr(self, "_loc_files"):
+            raise RuntimeError("Must call open_wrf_location_files first.")
+
+        # Work with arrays internally.
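+        # Illustrative call pattern (hypothetical coordinates), assuming the
+        # location files have been opened:
+        #     wrfhelper.open_wrf_location_files()
+        #     xy, dom = wrfhelper.locate_domain([51.5, 52.0], [9.0, 10.0])
+        # xy[i] then holds fractional (x, y) grid indices and dom[i] the
+        # finest domain containing observation i (0 = outside all domains).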
+ lat = np.array(lat, ndmin=1) + lon = np.array(lon, ndmin=1) + + # Get coordinates of the observation in all domains + ndomains = self.namelist["domains"]["max_dom"] + + # Get domain sizes in xy on mass (=unstaggered) grid (hence the -1) + dom_size_x = np.array(self.namelist['domains']['e_we'], dtype=float, ndmin=1) - 1 + dom_size_y = np.array(self.namelist['domains']['e_sn'], dtype=float, ndmin=1) - 1 + + # Initialize output + xy = np.zeros((len(lat), 2)) + xy[:] = np.nan + finest_domain = np.zeros((len(lat), ), int) + + # Since wrf.ll_to_xy takes very long, I save a bit of time here + # by starting in the finest domain and processing only + # observations that weren't previously found. + for n in range(ndomains-1, -1, -1): + # Get xy for this domain + # Only process what you haven't processed + sel = np.where(finest_domain == 0)[0] + + # In case all domains where set + if len(sel)==0: + break + + x, y = wrf.ll_to_xy(wrfin=self._loc_files[n], + latitude=lat[sel], + longitude=lon[sel], + meta=False, + as_int=False) + + + # For each domain, check if the observation is inside the + # domain extent + # I put the edges on -0.5 and e_we/e_sn - 0.5. + # Reason is that wrf.ll_to_xy(..., as_int = True) returns + # -0.2 as 0, so it's inside the domain. + # Also, for e_we=99, e_sn = 92, ref_lon=9, ref_lat=51.5: + # wrf.ll_to_xy(wrfin=self._loc_files[0], latitude=51.5, longitude=9., meta=False, as_int=False) + # array([48.49996124, 45.00003737]) + # This means that int indices denote exactly the mass-staggered point. + + # To be able to iterate over x and y in case they're scalars: + x = np.array(x, ndmin=1) + y = np.array(y, ndmin=1) + + # Test: inside domain? + # The -1 here are because of 0-based indices, and the -/+ 0.5 are + # because that's halfway to the next mass-staggered point and + # I treat the WRF grid as boxes. + x_in = [(-0.5 <= x_) and (x_ <= dom_size_x[n] - 1 + 0.5) for x_ in x] + y_in = [(-0.5 <= y_) and (y_ <= dom_size_y[n] - 1 + 0.5) for y_ in y] + + # Save domain, x and y at these locations + in_this_dom = np.where(np.logical_and(x_in, y_in))[0] + finest_domain[sel[in_this_dom]] = n + 1 + # If domain = 1, set _all_ xy to see where they end up + if n==0: + xy[sel, 0] = x + xy[sel, 1] = y + else: + xy[sel[in_this_dom], 0] = x[in_this_dom] + xy[sel[in_this_dom], 1] = y[in_this_dom] + + + # Return the indices and domain + return xy, finest_domain + + def get_groups_space_time(self, dat, time_bins, only_in=False): + """ + Returns a dictionary of lists of indices of observations in dat, + where keys are tuples of (time bin, x bin, y bin and wrf + domain), and values are indices of the observations that fall + within this bin. + + Arguments + --------- + + dat : list + List containing measurement locations: + date (same type as time_bins), latitude, longitude + time_bins : list + List of boundaries of time bins (I use datetime.datetime + objects) + only_in : boolean + If only_in is True, groups that fall outside the time_bins + and domain are not returned. + + """ + + # Time groups + id_t = np.array([bisect.bisect_right(time_bins, dat_date) + for dat_date in dat['date']], + int) + + # Spatial groups (indices and domain) + id_xy_f, dom = self.locate_domain(dat['latitude'], dat['longitude']) + + id_xy = np.round(id_xy_f).astype(int) + + + + # Version of only_in with sel: might be faster if sel is short - + # I don't know! But the results for my test case where identical + # (meaning 'domain' looked correct and identical) for both + # options for only_in. 
+ + # if only_in: + # # Yes, 0<id_t is correct: in bisect_right, it means the + # # value is below the lowest sequence value. + # time_in = np.logical_and(0<id_t, id_t<len(time_bins)) + # space_in = dom != 0 + # sel = np.where(np.logical_and(time_in, space_in))[0] + # + # else: + # sel = range(len(id_t)) + # + # indices = get_index_groups(id_t[sel], id_xy[sel, 0], id_xy[sel, 1], dom[sel]) + # + # # Now have to account for sel again! + # if only_in: + # for k, v in indices.iteritems(): + # indices[k] = sel[v] + + # Version of only_in without sel: might be faster if sel is + # long - I don't know! But the results for my test case where + # identical (meaning 'domain' looked correct and identical) + + # Indices for all groups: + indices = utilities.get_index_groups(id_t, id_xy[:, 0], id_xy[:, 1], dom) + + # Throw the out-of-domain ones out here already + if only_in: + # Remove the index groups where domain is 0 (= outside of + # domain). Need to iterate over a list, because with an + # iterator, python complains that the dictionary changed + # size during iteration. + for k in list(indices.keys()): + if k[3] == 0: + del indices[k] + # Equivalent to the above 3 lines, don't know what's faster: + # groups = {k: v for k, v in groups.iteritems() if k[3] != 0} + + return indices + + + @staticmethod + def times_in_wrf_file(ncf): + """ + Returns the times in netCDF4.Dataset ncf as datetime object + """ + times_nc = ncf.variables["Times"] + times_chr = [] + for nt in range(times_nc.shape[0]): + # freum 2021-07-11: with the migration to python3, need to + # replace the string - conversion. Hope "utf-8" works + # always. + # times_chr.append(times_nc[nt, :].tostring()) + times_chr.append(str(times_nc[nt, :], "utf-8")) + + times_dtm = [dt.datetime.strptime(t_chr, "%Y-%m-%d_%H:%M:%S") + for t_chr in times_chr] + + return times_dtm + + def wrf_times(self, file_list): + """Read all times in a list of wrf files + + Output + ------ + - 1D-array containing all times + - 1D-array containing start indices of each file + """ + + times = list() + start_indices = np.ndarray((len(file_list), ), int) + for nf in range(len(file_list)): + ncf = nc.Dataset(file_list[nf]) + times_this = self.times_in_wrf_file(ncf) + start_indices[nf] = len(times) + times += times_this + ncf.close() + + return times, start_indices + + def open_wrf_location_files(self): + """ + Keep a description of a small wrf file for each domain in memory + to locate observations. + Appends _loc_file to self. + + Note: Should be edited out of the code. 
+ """ + + ndomains = self.namelist["domains"]["max_dom"] + path = self.settings["run_dir"] + pattern = "wrfinput_d%02d" + self._loc_files = list() + for nd in range(1, ndomains+1): + fp = os.path.join(path, pattern % nd) + self._loc_files.append(nc.Dataset(fp, "r")) + + def close_wrf_location_files(self): + """See _open_wrf_location_files""" + for loc_file in self._loc_files: + loc_file.close() + + def wrf_filename_times(self, prefix): + """Get timestamps in wrf file names in run directory.""" + + # List all filenames + files = self.get_wrf_filenames(prefix + "*") + # Only use d01 files, pattern should be the same for all domains + pattern = os.path.join(self.settings["run_dir"], prefix) + files = [f for f in files if re.search(pattern, f)] + # Extract timestamp from filename + # Format is %Y-%m-%d_%H:%M:%S at the end of the filename + pattern_time = "%Y-%m-%d_%H:%M:%S" + len_tstamp = len(pattern_time) + 2 + times = [dt.datetime.strptime(f[-len_tstamp:], pattern_time) + for f in files] + + return times + + def get_wrf_filenames(self, glob_pattern): + """ + Gets the filenames in self.settings["run_dir"] that follow + glob_pattern, excluding those that end with + self.settings["original_save_suffix"] + """ + path = self.settings["run_dir"] + # All files... + wfiles = glob.glob(os.path.join(path, glob_pattern)) + # All originals + orig_suf = self.settings["original_save_suffix"] + opattern = glob_pattern + orig_suf + ofiles = glob.glob(os.path.join(path, opattern)) + + # All files except all originals: + files = [x for x in wfiles if x not in ofiles] + + # I need this sorted too often to not do it here. + files = np.sort(files).tolist() + return files + + + def sample_total_columns(self, dat, loc, fields_list): + """ + Sample total_columns of fields_list in WRF output in + self.settings["run_dir"] at the location id_xy in domain, id_t + in all wrfout-times. Files and indices therein are recognized + by id_t and file_time_start_indices. + All quantities needed for computing total columns from profiles + are in dat (kernel, prior, ...). + + Arguments + --------- + dat (:class:`list`) + Result of wrfhelper.read_sampling_coords. Used here: prior, + prior_profile, kernel, psurf, pressure_axis, [, pwf] + If psurf or any of pressure_axis are nan, wrf's own + surface pressure is used and pressure_axis constructed + from this and the number of levels in the averaging kernel. + This allows sampling with synthetic data that don't have + pressure information. This only works with level_def + "layer_average". + If pwf is not present or nan, a simple one is created, for + level_def "layer_average". + loc (:class:`dict`) + A dictionary with all location-related input for sampling, + computed in wrfout_sampler. Keys: + id_xy, domain: Domain coordinates + id_t: Timestep (continous throughout all files) + frac_t: Interpolation coeficient between id_t and id_t+1: + t_obs = frac_t*t[id_t] + (1-frac_t)*t[id_t+1]) + file_start_time_indices: Time index at which a new wrfout + file starts + files: names of wrfout files. + fields_list (:class:`list`) + The fields to sample total columns from. + + Output + ------ + sampled_columns (:class:`array`) + A 2D-array of sampled columns. 
+ Shape: (len(dat["prior"]), len(fields_list)) + """ + + # Initialize output + tc = np.ndarray(shape=(len(dat["prior"]), len(fields_list)), dtype=float) + tc[:] = float("nan") + + # Process by domain + UD = list(set(loc["domain"])) + for dom in UD: + idd = np.nonzero(loc["domain"] == dom)[0] + # Process by id_t + UT = list(set(loc["id_t"][idd])) + for time_id in UT: + # Coordinates to process + idt = idd[np.nonzero(loc["id_t"][idd] == time_id)[0]] + # Get tracer ensemble profiles + profiles = self._read_and_intrp_v(loc, fields_list, time_id, idt) + # List, len=len(fields_list), shape of each: (len(idt),nz) + # Get pressure axis: + #paxis = self.read_and_intrp(wh_names, id_ts, frac_t, id_xy, "P_HYD")/1e2 # Pa -> hPa + psurf = self._read_and_intrp_v(loc, ["PSFC"], time_id, idt)[0]/1.e2 # Pa -> hPa + # Shape: (len(idt),) + ptop = float(self.namelist["domains"]["p_top_requested"])/1.e2 + # Shape: (len(idt),) + znw = self._read_and_intrp_v(loc, ["ZNW"], time_id, idt)[0] + #Shape:(len(idt),nz) + + # DONE reading from file. + # Here it starts to make sense to loop over individual observations + for nidt in range(len(idt)): + nobs = idt[nidt] + # Construct model pressure layer boundaries + pb_mod = self.get_pressure_boundaries_znw(znw[nidt, :], psurf[nidt], ptop) + + if (np.diff(pb_mod) >= 0).any(): + msg = ("Model pressure boundaries for observation %d " + \ + "are not monotonically decreasing! Investigate.") % nobs + raise ValueError(msg) + + # Construct retrieval pressure layer boundaries + if dat["level_def"][nobs] == "layer_average": + if np.any(np.isnan(dat["pressure_levels"][nobs])) \ + or np.isnan(dat["psurf"][nobs]): + # Code for synthetic data without a pressure axis, + # but with an averaging kernel: + # Use wrf's surface and top pressure + nlayers = len(dat["averaging_kernel"][nobs]) + pb_ret = np.linspace(psurf[nidt], ptop, nlayers+1) + else: + pb_ret = self.get_pressure_boundaries_paxis( + dat["pressure_levels"][nobs], + dat["psurf"][nobs]) + elif dat["level_def"][nobs] == "pressure_boundary": + if np.any(np.isnan(dat["pressure_levels"][nobs])): + # Code for synthetic data without a pressure axis, + # but with an averaging kernel: + # Use wrf's surface and top pressure + nlevels = len(dat["averaging_kernel"][nobs]) + pb_ret = np.linspace(psurf[nidt], ptop, nlevels) + else: + pb_ret = dat["pressure_levels"][nobs] + + if (np.diff(pb_ret) >= 0).any(): + msg = ("Retrieval pressure boundaries for " + \ + "observation %d are not monotonically " + \ + "decreasing! Investigate.") % nobs + raise ValueError(msg) + + # Get vertical integration coefficients (i.e. 
to + # "interpolate" from model to retrieval grid) + coef_matrix = self.get_int_coefs(pb_ret, pb_mod, dat["level_def"][nobs]) + + # Model retrieval with averaging kernel and prior profile + if "pressure_weighting_function" in list(dat.keys()): + pwf = dat["pressure_weighting_function"][nobs] + if (not "pressure_weighting_function" in list(dat.keys())) or np.any(np.isnan(pwf)): + # Construct pressure weighting function from + # pressure boundaries + pwf = self.get_pressure_weighting_function(pb_ret, rule="simple") + + # Compute pressure-weighted averaging kernel + avpw = pwf*dat["averaging_kernel"][nobs] + + # Get prior + prior_col = dat["prior"][nobs] + prior_profile = dat["prior_profile"][nobs] + if np.isnan(prior_col): # compute prior + prior_col = np.dot(pwf, prior_profile) + + # Compute total columns + for nf in range(len(fields_list)): + # Integrate model profile + profile_intrp = np.matmul(coef_matrix, profiles[nf][nidt, :]) + + # Model retrieval + tc[nobs, nf] = prior_col + np.dot(avpw, profile_intrp - prior_profile) + + # Test phase: save pb_ret, pb_mod, coef_matrix, + # one profile for manual checking + + # dat_save = dict(pb_ret=pb_ret, + # pb_mod=pb_mod, + # coef_matrix=coef_matrix, + # ens_profile=ens_profiles[0], + # profile_intrp=profile_intrp, + # id=dat.id) + # + #out = open("model_profile_%d.pkl"%dat.id, "w") + #cPickle.dump(dat_save, out, 0) + # Average over footprint + if self.settings["footprint_samples_dim"] > 1: + indices = utilities.get_index_groups(dat["sounding_id"]) + + # Make sure that this is correct: i know the number of indices + lens = [len(group) for group in list(indices.values())] + correct_len = self.settings["footprint_samples_dim"]**2 + if np.any([len_ != correct_len for len_ in set(lens)]): + raise ValueError("Not all footprints have %d samples" %correct_len) + # Ok, paranoid mode, also confirm that the indices are what I + # think they are: consecutive numbers + ranges = [np.ptp(group) for group in list(indices.values())] + if np.any([ptp != correct_len for ptp in set(ranges)]): + raise ValueError("Not all footprints have consecutive samples") + + tc_original = copy.deepcopy(tc) + tc = utilities.apply_by_group(np.average, tc_original, indices) + + return tc + + @staticmethod + def _read_and_intrp_v(loc, fields_list, time_id, idp): + """ + Helper function for sample_total_columns. + read_and_intrp, but vectorized. + Reads in fields and interpolates + them linearly in time. + + Arguments + ---------- + loc (:class:`dict`) + Passed through from sample_total_columns, see there. + fields_list (:class:`list` of :class:`str`) + List of netcdf-variables to process. + time_id (:class:`int`) + Time index referring to all files in loc to read + idp (:class:`array` of :class:`int`) + Indices for id_xy, domain and frac_t in loc (i.e. + observations) to process. + + Output + ------ + List of temporally interpolated fields, one entry per member of + fields_list. 
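+
+        Usage
+        -----
+        Illustrative sketch only; 'loc' is assumed to be the dictionary
+        built in wrfout_sampler.py, and "CO2_000" an example tracer
+        member name.
+
+        .. code-block:: python
+
+            # Time-interpolated surface pressure and one tracer field for
+            # the observations in idp (all located in the same domain):
+            psfc, co2 = WRFChemHelper._read_and_intrp_v(
+                loc, ["PSFC", "CO2_000"], time_id, idp)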
+ """ + + var_intrp_l = list() + + # Check we were really called with observations for just one domain + domains = set(loc["domain"][idp]) + if len(domains) > 1: + raise ValueError("I can only operate on idp with identical domains.") + dom = domains.pop() + + # Select input files + id_file0 = bisect.bisect_right(loc["file_start_time_indices"][dom], time_id) - 1 + id_file1 = bisect.bisect_right(loc["file_start_time_indices"][dom], time_id+1) - 1 + if id_file0 < 0 or id_file1 < 0: + raise ValueError("This shouldn't happen.") + + # Get time id in file + id_t_file0 = time_id - loc["file_start_time_indices"][dom][id_file0] + id_t_file1 = time_id+1 - loc["file_start_time_indices"][dom][id_file1] + + # Open files + nc0 = nc.Dataset(loc["files"][dom][id_file0], "r") + nc1 = nc.Dataset(loc["files"][dom][id_file1], "r") + # Per field to sample + for field in fields_list: + # Read input file + field0 = wrf.getvar(wrfin=nc0, + varname=field, + timeidx=id_t_file0, + squeeze=False, + meta=False) + + field1 = wrf.getvar(wrfin=nc1, + varname=field, + timeidx=id_t_file1, + squeeze=False, + meta=False) + + if len(field0.shape) == 4: + # Sample field at timesteps before and after observation + # They are ordered nt x nz x ny x nx + # var0 will have shape (len(idp),len(profile)) + var0 = field0[0, :, loc["id_xy"][idp, 1], loc["id_xy"][idp, 0]] + var1 = field1[0, :, loc["id_xy"][idp, 1], loc["id_xy"][idp, 0]] + # Repeat frac_t for profile size + frac_t_ = np.array(loc["frac_t"][idp]).reshape((len(idp), 1)).repeat(var0.shape[1], 1) + elif len(field0.shape) == 3: + # var0 will have shape (len(idp),) + var0 = field0[0, loc["id_xy"][idp, 1], loc["id_xy"][idp, 0]] + var1 = field1[0, loc["id_xy"][idp, 1], loc["id_xy"][idp, 0]] + frac_t_ = np.array(loc["frac_t"][idp]) + elif len(field0.shape) == 2: + # var0 will have shape (len(idp),len(profile)) + # This is for ZNW, which is saved as (time_coordinate, + # vertical_coordinate) + var0 = field0[[0]*len(idp), :] + var1 = field1[[0]*len(idp), :] + frac_t_ = np.array(loc["frac_t"][idp]).reshape((len(idp), 1)).repeat(var0.shape[1], 1) + else: + raise ValueError("Can't deal with field with %d dimensions." % len(field0.shape)) + + # Interpolate in time + var_intrp_l.append(var0*frac_t_ + var1*(1. 
- frac_t_)) + + nc0.close() + nc1.close() + + return var_intrp_l + + @staticmethod + def read_sampling_coords(sampling_coords_file, id0=None, id1=None): + """Read in samples""" + + ncf = nc.Dataset(sampling_coords_file, "r") + if id0 is None: + id0 = 0 + if id1 is None: + id1 = len(ncf.dimensions['soundings']) + + dat = dict( + sounding_id=np.array(ncf.variables["sounding_id"][id0:id1]), + date=ncf.variables["date"][id0:id1], + latitude=np.array(ncf.variables["latitude"][id0:id1]), + longitude=np.array(ncf.variables["longitude"][id0:id1]), + latc_0=np.array(ncf.variables["latc_0"][id0:id1]), + latc_1=np.array(ncf.variables["latc_1"][id0:id1]), + latc_2=np.array(ncf.variables["latc_2"][id0:id1]), + latc_3=np.array(ncf.variables["latc_3"][id0:id1]), + lonc_0=np.array(ncf.variables["lonc_0"][id0:id1]), + lonc_1=np.array(ncf.variables["lonc_1"][id0:id1]), + lonc_2=np.array(ncf.variables["lonc_2"][id0:id1]), + lonc_3=np.array(ncf.variables["lonc_3"][id0:id1]), + prior=np.array(ncf.variables["prior"][id0:id1]), + prior_profile=np.array(ncf.variables["prior_profile"][id0:id1,]), + averaging_kernel=np.array(ncf.variables["averaging_kernel"][id0:id1]), + pressure_levels=np.array(ncf.variables["pressure_levels"][id0:id1]), + pressure_weighting_function=np.array(ncf.variables["pressure_weighting_function"][id0:id1]), + level_def=ncf.variables["level_def"][id0:id1], + psurf=np.array(ncf.variables["psurf"][id0:id1]) + ) + + ncf.close() + + # Convert level_def from it's weird nc format to string + dat["level_def"] = nc.chartostring(dat["level_def"]) + + # Convert date to datetime object + dat["time"] = [dt.datetime(*x) for x in dat["date"]] + + return dat + + @staticmethod + def write_simulated_columns(obs_id, simulated, nmembers, outfile): + """Write simulated observations to file.""" + + # Output format: see obs_xco2_fr + + f = io.CT_CDF(outfile, method="create") + + dimid = f.createDimension("sounding_id", size=None) + dimid = ("sounding_id",) + savedict = io.std_savedict.copy() + savedict["name"] = "sounding_id" + savedict["dtype"] = "int64" + savedict["long_name"] = "Unique_Dataset_observation_index_number" + savedict["units"] = "" + savedict["dims"] = dimid + savedict["comment"] = "Format as in input" + savedict["values"] = obs_id.tolist() + f.add_data(savedict, nsets=0) + + dimmember = f.createDimension("nmembers", size=nmembers) + dimmember = ("nmembers",) + savedict = io.std_savedict.copy() + savedict["name"] = "column_modeled" + savedict["dtype"] = "float" + savedict["long_name"] = "Simulated total column" + savedict["units"] = "??" 
+ savedict["dims"] = dimid + dimmember + savedict["comment"] = "Simulated model value created by WRFChemOO" + savedict["values"] = simulated.tolist() + f.add_data(savedict, nsets=0) + + f.close() + + @staticmethod + def save_file_with_timestamp(file_path, out_dir, suffix=""): + """ Saves a file to with a timestamp""" + nowstamp = dt.datetime.now().strftime("_%Y-%m-%d_%H:%M:%S") + new_name = os.path.basename(file_path) + suffix + nowstamp + new_path = os.path.join(out_dir, new_name) + shutil.copy2(file_path, new_path) + + +if __name__ == "__main__": + pass diff --git a/da/tools/wrfchem/wrfout_sampler.py b/da/tools/wrfchem/wrfout_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..03bc1fae0743e32259ab038fdbbe56f3b62d6218 --- /dev/null +++ b/da/tools/wrfchem/wrfout_sampler.py @@ -0,0 +1,246 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Created on Mon Jul 22 15:07:13 2019 + +@author: friedemann +""" + +# Samples wrfchem history files (wrfout*) for CTDAS + +# This is called as external executable from CTDAS +# to allow simple parallelization +# +# Usage: + +# wrfout_sampler.py --arg1 val1 --arg2 val2 ... +# Arguments: See parser in code below + +import os +import sys +#import itertools +#import bisect +import copy +import numpy as np +import netCDF4 as nc + +# Import some CTDAS tools +pd = os.path.pardir +inc_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), + pd, pd, pd) +inc_path = os.path.abspath(inc_path) +sys.path.append(inc_path) +from da.tools.wrfchem.wrfchem_helper import WRFChemHelper +from da.tools.wrfchem.utilities import utilities +import argparse + +########## Parse options +parser = argparse.ArgumentParser() +parser.add_argument("--nproc", type=int, + help="ID of this sampling process (0 ... nprocs-1)") +parser.add_argument("--nprocs", type=int, + help="Number of sampling processes") +parser.add_argument("--sampling_coords_file", type=str, + help="File with sampling coordinates as created " + \ + "by CTDAS column samples object") +parser.add_argument("--run_dir", type=str, + help="Directory with wrfout files") +parser.add_argument("--original_save_suffix", type=str, + help="Just leave this on .original") +parser.add_argument("--nmembers", type=int, + help="Number of tracer ensemble members") +parser.add_argument("--tracer_optim", type=str, + help="Tracer that was optimized (e.g. CO2 for " + \ + "ensemble members CO2_000 etc.)") +parser.add_argument("--outfile_prefix", type=str, + help="One process: output file. More processes: " + \ + "output file is <outfile_prefix>.<nproc>.slice") +parser.add_argument("--footprint_samples_dim", type=int, + help="Sample column footprint at n x n points") + +args = parser.parse_args() +settings = copy.deepcopy(vars(args)) + +# Start (stupid) logging - should be updated +wd = os.getcwd() +try: + os.makedirs("log") +except OSError: # Case when directory already exists. Will look nicer in python 3... + pass + +logfile = os.path.join(wd, "log/wrfout_sampler." 
+ str(settings['nproc']) + ".log") + +os.system("touch " + logfile) +os.system("rm " + logfile) +os.system("echo 'Process " + str(settings['nproc']) + " of " + str(settings['nprocs']) + ": start' >> " + logfile) +os.system("date >> " + logfile) + +########## Initialize wrfhelper +wrfhelper = WRFChemHelper(settings) +wrfhelper.validate_settings(['sampling_coords_file', + 'run_dir', + 'nproc', + 'nprocs', + 'original_save_suffix', # necessary for selecting filename + 'nmembers', # special case 0: sample 'tracer_optim' + 'tracer_optim', + 'outfile_prefix', + 'footprint_samples_dim']) + +cwd = os.getcwd() +os.chdir(wrfhelper.settings['run_dir']) + +#wrfhelper.namelist = wrfhelper.read_namelist(wrfhelper.settings['run_dir']) +wrfhelper.namelist = wrfhelper.read_namelist(".") + + +########## Figure out which samples to process +# Get number of samples +ncf = nc.Dataset(settings['sampling_coords_file'], "r") +nsamples = len(ncf.dimensions['soundings']) +ncf.close() + +id0, id1 = utilities.get_slicing_ids(nsamples, settings['nproc'], settings['nprocs']) + +os.system("echo 'id0=" + str(id0) + "' >> " + logfile) +os.system("echo 'id1=" + str(id1) + "' >> " + logfile) + +########## Read samples from coord file +dat = wrfhelper.read_sampling_coords(settings['sampling_coords_file'], id0, id1) + +os.system("echo 'Data read, len=" + str(len(dat['sounding_id'])) + "' >> " + logfile) + + +########## Locate samples in wrf domains + +# Take care of special case without ensemble +nmembers = settings['nmembers'] +if nmembers == 0: + # Special case: sample 'tracer_optim', don't add member suffix + member_names = [settings['tracer_optim']] + nmembers = 1 +else: + member_names = [settings['tracer_optim'] + "_%03d" % nm for nm in range(nmembers)] + + +# Keep a description of a small wrf file for each domain in memory to +# locate observations. +# This concept is probably obsolete - doesn't save time, and +# locate_domain is parallelized anyway +wrfhelper.open_wrf_location_files() + +if settings["footprint_samples_dim"]==1: + # Locate all observations in space + # This function Wouldn't work for moving nests. + id_xy_f, domain = wrfhelper.locate_domain(dat['latitude'], dat['longitude']) + # Assume box averages and don't interpolate horizontally + id_xy = np.round(id_xy_f).astype(int) +else: + # Return the whole thing (needed in wrfhelper.sample_total_columns) + raise NotImplementedError("To do: wrfhelper.get_footprint_sampling_points") + dat_fs = wrfhelper.get_footprint_sampling_points(dat) + # Locate (free) + id_xy_f_free, domain_free = wrfhelper.locate_domain(dat_fs['latitude'], dat_fs['longitude']) + id_xy_free = np.round(id_xy_f_free).astype(int) + # Determine max domain + domain_fs = None + raise NotImplementedError("To do: domain_fs") + # Sample again with domain restriction - no need to return it again + id_xy_f, _ = wrfhelper.locate_domain(dat_fs['latitude'], dat_fs['longitude'], domain_fs) + id_xy = np.round(id_xy_f).astype(int) + # Thin out domain_fs to pass it to determination of id_t and frac_t below + domain = domain_fs[::settings["footprint_samples_dim"]] + +wrfhelper.close_wrf_location_files() + +# Locate in time: Which file, time index, and temporal interpolation +# factor. +# MAYBE make this a function. See which quantities I need later. +id_t = np.zeros_like(domain) +frac_t = np.ndarray(id_t.shape, float) +frac_t[:] = float("nan") +# Add a little flexibility by doing this per domain - namelists allow +# different output frequencies per domain. 
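+# Worked example (hypothetical times): with hourly wrfout files, an
+# observation at 12:15 between output times t0 = 12:00 and t1 = 13:00 gets
+# frac_t = (13:00 - 12:15)/(13:00 - 12:00) = 0.75, and the sampled field is
+# 0.75*field(t0) + 0.25*field(t1).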
+wrfout_files = dict() +wrfout_times = dict() +wrfout_start_time_ids = dict() + +UD = list(set(domain)) +for dom in UD: + os.system("echo 'Processing domain " + str(dom) + "' >> " + logfile) + idd = np.where(domain == dom)[0] + # Get full time vector + wrfout_files[dom] = wrfhelper.get_wrf_filenames("wrfout_d%02d_*00" % dom) + wrfout_times[dom], wrfout_start_time_ids[dom] = wrfhelper.wrf_times(wrfout_files[dom]) + + # time id + for idd_ in idd: + # Look where it sorts in + tmp = [i + for i in range(len(wrfout_times[dom])-1) + if wrfout_times[dom][i] <= dat['time'][idd_] \ + and dat['time'][idd_] < wrfout_times[dom][i+1]] + # Catch the case that the observation took place exactly at the + # last timestep + if len(tmp) == 1: + id_t[idd_] = tmp[0] + time0 = wrfout_times[dom][id_t[idd_]] + time1 = wrfout_times[dom][id_t[idd_]+1] + frac_t[idd_] = (time1 - dat['time'][idd_]).total_seconds() / (time1 - time0).total_seconds() + else: # len must be 0 in this case + if len(tmp) > 1: + os.system("echo 'wat' >> " + logfile) + raise ValueError("wat") + if dat['time'][idd_] == wrfout_times[dom][-1]: + id_t[idd_] = len(wrfout_times[dom])-1 + frac_t[idd_] = 1 + else: + msg = "Sample %d, sounding_id %s: outside of simulated time."%(idd_, dat['sounding_id'][idd_]) + os.system("echo '" + msg + "' >> " + logfile) + raise ValueError(msg) + + +# Now read the data +# Input: id_xy, dom, id_t, wrfout_start_time_ids, fract_t +# Output: sampled columns +# All input related to location: +if settings["footprint_samples_dim"]>1: + domain = domain_fs + id_t = np.repeat(id_t, settings["footprint_samples_dim"]) + frac_t = np.repeat(frac_t, settings["footprint_samples_dim"]) + +loc_input = dict(id_xy=id_xy, domain=domain, + id_t=id_t, frac_t=frac_t, + files=wrfout_files, file_start_time_indices=wrfout_start_time_ids) + +ens_sim = wrfhelper.sample_total_columns(dat, loc_input, member_names) + +# Write results to file +obs_ids = dat['sounding_id'] +# Remove simulations that are nan (=not in domain) +if ens_sim.shape[0] > 0: + valid = np.apply_along_axis(lambda arr: not np.any(np.isnan(arr)), 1, ens_sim) + obs_ids_write = obs_ids[valid] + ens_sim_write = ens_sim[valid, :] +else: + obs_ids_write = obs_ids + ens_sim_write = ens_sim + +if settings['nprocs'] == 1: + outfile = settings['outfile_prefix'] +else: + # Create output files with the appendix ".<nproc>.slice" + # Format <nproc> so that they can later be easily sorted. + len_nproc = int(np.floor(np.log10(settings['nprocs']))) + 1 + outfile = settings['outfile_prefix'] + (".%0" + str(len_nproc) + "d.slice") % settings['nproc'] + +os.system("echo 'Writing output file '" + os.path.join(wrfhelper.settings['run_dir'], outfile) + " >> " + logfile) + +wrfhelper.write_simulated_columns(obs_id=obs_ids_write, + simulated=ens_sim_write, + nmembers=nmembers, + outfile=outfile) + +os.chdir(cwd) + +os.system("echo 'Done' >> " + logfile)
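+
+# Example invocation (illustrative values only; adjust paths and counts to
+# the actual run):
+#
+#   python wrfout_sampler.py --nproc 0 --nprocs 4 \
+#       --sampling_coords_file sampling_coordinates.nc \
+#       --run_dir /path/to/wrf/run_dir \
+#       --original_save_suffix .original \
+#       --nmembers 150 --tracer_optim CO2 \
+#       --outfile_prefix sampled_columns.nc \
+#       --footprint_samples_dim 1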