diff --git a/da/observations/obs_column_wrfchem.py b/da/observations/obs_column_wrfchem.py
new file mode 100644
index 0000000000000000000000000000000000000000..38fea18ba1ea85e96bbc28dec10fde4b8bd9fa42
--- /dev/null
+++ b/da/observations/obs_column_wrfchem.py
@@ -0,0 +1,566 @@
+"""CarbonTracker Data Assimilation Shell (CTDAS) Copyright (C) 2017 Wouter Peters.
+Users are recommended to contact the developers (wouter.peters@wur.nl) to receive
+updates of the code. See also: http://www.carbontracker.eu.
+
+This program is free software: you can redistribute it and/or modify it under the
+terms of the GNU General Public License as published by the Free Software Foundation,
+version 3. This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with this
+program. If not, see <http://www.gnu.org/licenses/>."""
+#!/usr/bin/env python
+# obs_column_wrfchem.py
+
+"""
+Author : liesbeth, freum
+
+Revision History:
+File created June 2019.
+
+Notes on this version (freum):
+- Handles column observations. Based on obs_column_xco2.py from mid-2019.
+- Since then, both versions have developed further. Because this also affects the
+  file format, it is left as is for now, for backwards compatibility with users of
+  the original Python 2 code.
+
+Most important revisions in this file relative to the original:
+- Added pressure weighting function
+- Added optional footprint corner coordinates to improve sampling of models with
+  resolutions finer than the satellite footprint. This functionality is not validated.
+
+Changes to (original) obs_column_xco2.py not yet implemented here as of 2021-07-10:
+- def __init__: New arguments h=0.0 (pressure weighting function), moderr=0.0, loc_L=0
+- Update to model-data-mismatch approach
+- Approach to localization
+- def add_observations(self, advance=False): new argument "advance"
+- dimlayers
+- self.additional_variables
+
+"""
+import os
+import sys
+import logging
+
+import datetime as dtm
+import numpy as np
+#from string import strip
+from numpy import array, logical_and, sqrt
+sys.path.append(os.getcwd())
+sys.path.append('../../')
+
+identifier = 'CarbonTracker total column-averaged mole fractions'
+version = '0.0'
+
+from da.observations.obs_baseclass import Observations
+import da.tools.io4 as io
+import da.tools.rc as rc
+
+
+################### Begin Class TotalColumnSample ###################
+
+class TotalColumnSample(object):
+    """
+        Holds the data that defines a total column sample in the data assimilation framework. So far, this includes all
+        attributes listed below in the __init__ method. Additional data types or project-specific sample objects
+        can be added as needed.
+        This file may contain OCO-2-specific parts...
+    """
+
+    def __init__(self, idx, codex, xdate, obs=0.0, simulated=0.0, lat=-999., lon=-999., mdm=None, prior=0.0, prior_profile=0.0, av_kernel=0.0, pressure=0.0, \
+                ##### freum vvvv
+                pressure_weighting_function=None,
+                ##### freum ^^^^
+                level_def = "pressure_boundary", psurf = float('nan'), resid=0.0, hphr=0.0, flag=0, species='co2', sdev=0.0, \
+                ##### freum vvvv
+                latc_0=None, latc_1=None, latc_2=None, latc_3=None, lonc_0=None, lonc_1=None, lonc_2=None, lonc_3=None \
+                ##### freum ^^^^
+                ):
+        self.id             = idx               # Sounding ID
+        self.code           = codex             # Retrieval ID
+        self.xdate          = xdate             # Date of obs
+        self.obs            = obs               # Value observed
+        self.simulated      = simulated         # Value simulated by model, fillvalue = -9999
+        self.lat            = lat               # Sample lat
+        self.lon            = lon               # Sample lon
+        ##### freum vvvv
+        self.latc_0         = latc_0            # Sample latitude corner
+        self.latc_1         = latc_1            # Sample latitude corner
+        self.latc_2         = latc_2            # Sample latitude corner
+        self.latc_3         = latc_3            # Sample latitude corner
+        self.lonc_0         = lonc_0            # Sample longitude corner
+        self.lonc_1         = lonc_1            # Sample longitude corner
+        self.lonc_2         = lonc_2            # Sample longitude corner
+        self.lonc_3         = lonc_3            # Sample longitude corner
+        ##### freum ^^^^
+        self.mdm            = mdm               # Model data mismatch
+        self.prior          = prior             # A priori column value used in retrieval
+        self.prior_profile  = prior_profile     # A priori profile used in retrieval
+        self.av_kernel      = av_kernel         # Averaging kernel
+        self.pressure       = pressure          # Pressure levels of retrieval
+        # freum vvvv
+        self.pressure_weighting_function       = pressure_weighting_function          # Pressure weighting function
+        # freum ^^^^
+        self.level_def      = level_def         # Whether prior and averaging kernel are defined on pressure levels or as layer averages
+        self.psurf          = psurf             # Surface pressure (only needed if level_def is "layer_average")
+        self.loc_L          = int(0)            # freum 2021-07-13: insert this dummy value so the code runs with the current version of CTDAS. *Should* not affect results if localizetype == "CT2007" as in all my runs. However, replace this file with the standard observation file, obs_column_xco2.py
+
+        self.resid          = resid             # Mole fraction residuals
+        self.hphr           = hphr              # Mole fraction prior uncertainty from fluxes (HPH) and model data mismatch (R)
+        self.may_localize   = True              # Whether sample may be localized in optimizer
+        self.may_reject     = True              # Whether sample may be rejected if outside threshold
+        self.flag           = flag              # Flag
+        self.sdev           = sdev              # standard deviation of ensemble
+        self.species        = species.strip()
+
+
+################### End Class TotalColumnSample ###################
+
+
+################### Begin Class TotalColumnObservations ###################
+
+class TotalColumnObservations(Observations):
+    """ An object that holds data + methods and attributes needed to manipulate column samples
+    """
+
+    def setup(self, dacycle):
+
+        self.startdate = dacycle['time.sample.start']
+        self.enddate   = dacycle['time.sample.end']
+
+        # Path to the input data (daily files)
+        sat_files   = dacycle.dasystem['obs.column.ncfile'].split(',')
+        sat_dirs    = dacycle.dasystem['obs.column.input.dir'].split(',')
+
+        self.sat_dirs    = []
+        self.sat_files   = []
+        for i in range(len(sat_dirs)):
+            if not os.path.exists(sat_dirs[i].strip()):
+                msg = 'Could not find the required satellite input directory (%s) ' % sat_dirs[i]
+                logging.error(msg)
+                raise IOError(msg)
+            else:
+                self.sat_dirs.append(sat_dirs[i].strip())
+                self.sat_files.append(sat_files[i].strip())
+        del i
+
+        # Get observation selection criteria (if present):
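+        # A hypothetical example of these rc entries (the variables must exist in
+        # the input files; each criterion is evaluated against its variable):
+        #   obs.column.selection.variables : quality_flag, latitude
+        #   obs.column.selection.criteria  : == 0, > -60.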
+        if 'obs.column.selection.variables' in dacycle.dasystem.keys() and 'obs.column.selection.criteria' in dacycle.dasystem.keys():
+            self.selection_vars     = dacycle.dasystem['obs.column.selection.variables'].split(',')
+            self.selection_criteria = dacycle.dasystem['obs.column.selection.criteria'].split(',')
+            logging.debug('Data selection criteria found: %s, %s' %(self.selection_vars, self.selection_criteria))
+        else:
+            self.selection_vars     = []
+            self.selection_criteria = []
+            logging.info('No observation selection criteria found, using all observations in file.')
+
+        # Model data mismatch approach
+        self.mdm_calculation = dacycle.dasystem.get('mdm.calculation')
+        logging.debug('mdm.calculation = %s' %self.mdm_calculation)
+        if self.mdm_calculation not in ['parametrization', 'empirical', 'no_transport_error']:
+            logging.warning('No valid model data mismatch method found. Valid options are \'parametrization\', \'empirical\' and \'no_transport_error\'. ' + \
+                            'Using a constant estimate for the model uncertainty of 1 ppm everywhere.')
+        else:
+            logging.info('Model data mismatch approach = %s' %self.mdm_calculation)
+
+        # Path to file with observation error settings for column observations
+        # Currently the same settings for all assimilated retrieval products: should this be one file per product?
+        if not os.path.exists(dacycle.dasystem['obs.column.rc']):
+            msg = 'Could not find the required column observation .rc input file (%s) ' % dacycle.dasystem['obs.column.rc']
+            logging.error(msg)
+            raise IOError(msg)
+        else:
+            self.obs_file = (dacycle.dasystem['obs.column.rc'])
+
+        self.datalist = []
+
+        # Switch to indicate whether simulated column samples are read from obsOperator output,
+        # or whether the sampling is done within CTDAS (in obsOperator class)
+        self.sample_in_ctdas = dacycle.dasystem['sample.in.ctdas'] if 'sample.in.ctdas' in dacycle.dasystem.keys() else False
+        logging.debug('sample.in.ctdas = %s' % self.sample_in_ctdas)
+
+
+
+    def get_samples_type(self):
+        return 'column'
+
+
+
+    def add_observations(self):
+        """ Reading of total column observations, and selection of observations that will be sampled and assimilated.
+
+        """
+
+        # Read observations from daily input files
+        for i in range(len(self.sat_dirs)):
+
+            logging.info('Reading observations from %s' %os.path.join(self.sat_dirs[i],self.sat_files[i]))
+
+            infile0 = os.path.join(self.sat_dirs[i], self.sat_files[i])
+            ndays = 0
+
+            while self.startdate+dtm.timedelta(days=ndays) <= self.enddate:
+
+                infile = infile0.replace("<YYYYMMDD>",(self.startdate+dtm.timedelta(days=ndays)).strftime("%Y%m%d"))
+
+                if os.path.exists(infile):
+
+                    logging.info("Reading observations for %s" % (self.startdate+dtm.timedelta(days=ndays)).strftime("%Y%m%d"))
+                    len_init = len(self.datalist)
+
+                    # get index of observations that satisfy selection criteria (based on variable names and values in system rc file, if present)
+                    ncf = io.ct_read(infile, 'read')
+                    if self.selection_vars:
+                        selvars = []
+                        for j in self.selection_vars:
+                            selvars.append(ncf.get_variable(j.strip()))
+                        del j
+                        criteria = []
+                        for j in range(len(self.selection_vars)):
+                            criteria.append(eval('selvars[j]'+self.selection_criteria[j]))
+                        del j
+                        #criteria  = [eval('selvars[i]'+self.selection_criteria[i]) for i in range(len(self.selection_vars))]
+                        subselect = np.logical_and.reduce(criteria).nonzero()[0]
+                    else:
+                        subselect = np.arange(ncf.get_variable('sounding_id').size)
+
+                    # retrieval attributes
+                    code      = ncf.get_attribute('retrieval_id')
+                    level_def = ncf.get_attribute('level_def')
+
+                    # only read good quality observations
+                    ids           = ncf.get_variable('sounding_id').take(subselect, axis=0)
+                    lats          = ncf.get_variable('latitude').take(subselect, axis=0)
+                    lons          = ncf.get_variable('longitude').take(subselect, axis=0)
+                    obs           = ncf.get_variable('obs').take(subselect, axis=0)
+                    unc           = ncf.get_variable('uncertainty').take(subselect, axis=0)
+                    dates         = ncf.get_variable('date').take(subselect, axis=0)
+                    dates         = array([dtm.datetime(*d) for d in dates])
+                    av_kernel     = ncf.get_variable('averaging_kernel').take(subselect, axis=0)
+                    prior_profile = ncf.get_variable('prior_profile').take(subselect, axis=0)
+                    pressure      = ncf.get_variable('pressure_levels').take(subselect, axis=0)
+
+                    prior         = ncf.get_variable('prior').take(subselect, axis=0)
+
+                    ##### freum vvvv
+                    pwf           = ncf.get_variable('pressure_weighting_function').take(subselect, axis=0)
+
+                    # Additional variable surface pressure in case the profiles are defined as layer averages
+                    if level_def == "layer_average":
+                        psurf = ncf.get_variable('surface_pressure').take(subselect, axis=0)
+                    else:
+                        psurf = [float('nan')]*len(ids)
+
+                    # Optional: footprint corners
+                    latc = dict(
+                        latc_0=[float('nan')]*len(ids),
+                        latc_1=[float('nan')]*len(ids),
+                        latc_2=[float('nan')]*len(ids),
+                        latc_3=[float('nan')]*len(ids))
+                    lonc = dict(
+                        lonc_0=[float('nan')]*len(ids),
+                        lonc_1=[float('nan')]*len(ids),
+                        lonc_2=[float('nan')]*len(ids),
+                        lonc_3=[float('nan')]*len(ids))
+                    # If one footprint corner variable is present, assume all
+                    # are there; that is the only case that makes sense.
+                    if 'latc_0' in list(ncf.variables.keys()):
+                        latc['latc_0']    = ncf.get_variable('latc_0').take(subselect, axis=0)
+                        latc['latc_1']    = ncf.get_variable('latc_1').take(subselect, axis=0)
+                        latc['latc_2']    = ncf.get_variable('latc_2').take(subselect, axis=0)
+                        latc['latc_3']    = ncf.get_variable('latc_3').take(subselect, axis=0)
+                        lonc['lonc_0']    = ncf.get_variable('lonc_0').take(subselect, axis=0)
+                        lonc['lonc_1']    = ncf.get_variable('lonc_1').take(subselect, axis=0)
+                        lonc['lonc_2']    = ncf.get_variable('lonc_2').take(subselect, axis=0)
+                        lonc['lonc_3']    = ncf.get_variable('lonc_3').take(subselect, axis=0)
+                    ###### freum ^^^^
+
+                    ncf.close()
+
+                    # Add samples to datalist
+                    # Note that the mdm is initialized here equal to the measurement uncertainty. This value is used in add_model_data_mismatch to calculate the mdm including model error
+                    for n in range(len(ids)):
+                        # Check for every sounding if time is between start and end time (relevant for first and last days of window)
+                        if self.startdate <= dates[n] <= self.enddate:
+                            self.datalist.append(TotalColumnSample(ids[n], code, dates[n], obs[n], None, lats[n], lons[n], unc[n], prior[n], prior_profile[n,:], \
+                                                                    av_kernel=av_kernel[n,:], pressure=pressure[n,:], pressure_weighting_function=pwf[n,:],level_def=level_def,psurf=psurf[n],
+                                                                    ##### freum vvvv
+                                                                    latc_0=latc['latc_0'][n], latc_1=latc['latc_1'][n], latc_2=latc['latc_2'][n], latc_3=latc['latc_3'][n],
+                                                                    lonc_0=lonc['lonc_0'][n], lonc_1=lonc['lonc_1'][n], lonc_2=lonc['lonc_2'][n], lonc_3=lonc['lonc_3'][n]
+                                                                    ##### freum ^^^^
+                                                                    ))
+
+                    logging.debug("Added %d observations to the Data list" % (len(self.datalist)-len_init))
+
+                ndays += 1
+
+        del i
+
+        if len(self.datalist) > 0:
+            logging.info("Observations list now holds %d values" % len(self.datalist))
+        else:
+            logging.info("No observations found for sampling window")
+
+
+
+    def add_model_data_mismatch(self, filename=None, advance=False):
+        """ This function is empty: model data mismatch calculation is done during sampling in observation operator (TM5) to enhance computational efficiency
+            (i.e. to prevent reading all soundings twice and writing large additional files)
+
+        """
+        obs_data = rc.read(self.obs_file)
+        self.rejection_threshold = int(obs_data['obs.rejection.threshold'])
+
+        # At this point mdm is set to the measurement uncertainty only, added in the add_observations function.
+        # Here this value is used to set the combined mdm by adding an estimate for the model uncertainty as a sum of squares.
+        if len(self.datalist) <= 1: return #== 0: return
+        for obs in self.datalist:
+            # parametrization as used by Frederic Chevallier
+            if self.mdm_calculation == 'parametrization':
+                obs.mdm = ( obs.mdm*obs.mdm + (0.8*np.exp((90.0+obs.lat)/300.0))**2 )**0.5
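+                # e.g. the model error term grows from 0.8 ppm at 90S
+                # (0.8*exp(0)) to about 1.46 ppm at 90N (0.8*exp(0.6)),
+                # and is added in quadrature to the measurement uncertainty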
+            # empirical approach of Andy Jacobson, TO BE IMPLEMENTED
+            # elif self.mdm_calculation == 'empirical':
+            #     obs.mdm = ...
+            elif self.mdm_calculation == 'no_transport_error':
+                pass
+            else: # assume general model uncertainty of 1 ppm (arbitrary value)
+                obs.mdm = ( obs.mdm*obs.mdm + 1**2 )**0.5
+        del obs
+
+        meanmdm = np.average(np.array( [obs.mdm for obs in self.datalist] ))
+        logging.debug('Mean MDM = %s' %meanmdm)
+
+
+
+    def add_simulations(self, filename, silent=False):
+        """ Adds observed and model simulated column values to the mole fraction objects
+            This function includes the add_observations and add_model_data_mismatch functionality for the sake of computational efficiency
+
+        """
+
+        if self.sample_in_ctdas:
+            logging.debug("CODE TO ADD SIMULATED SAMPLES TO DATALIST TO BE ADDED")
+
+        else:
+            # read simulated samples from file
+            if not os.path.exists(filename):
+                msg = "Sample output filename for observations could not be found : %s" % filename
+                logging.error(msg)
+                logging.error("Did the sampling step succeed?")
+                logging.error("...exiting")
+                raise IOError(msg)
+
+            ncf       = io.ct_read(filename, method='read')
+            ids       = ncf.get_variable('sounding_id')
+            simulated = ncf.get_variable('column_modeled')
+            ncf.close()
+            logging.info("Successfully read data from model sample file (%s)" % filename)
+
+            obs_ids = self.getvalues('id').tolist()
+
+            missing_samples = []
+
+            # Match read simulated samples with observations in datalist
+            logging.info("Adding %i simulated samples to the data list..." % len(ids))
+            for i in range(len(ids)):
+                # Assume samples are in same order in both datalist and file with simulated samples...
+                if ids[i] == obs_ids[i]:
+                    self.datalist[i].simulated = simulated[i]
+                # If not, find index of current sample
+                elif ids[i] in obs_ids:
+                    index = obs_ids.index(ids[i])
+                    # Only add simulated value to datalist if sample has not been filled before. Otherwise: exiting
+                    if self.datalist[index].simulated is not None:
+                        msg = 'Simulated and observed samples not in same order, and duplicate sample IDs found.'
+                        logging.error(msg)
+                        raise IOError(msg)
+                    else:
+                        self.datalist[index].simulated = simulated[i]
+                else:
+                    logging.debug('added %s to missing_samples, obs id = %s' %(ids[i],obs_ids[i]))
+                    missing_samples.append(ids[i])
+            del i
+
+            if not silent and missing_samples != []:
+                logging.warning('%i Model samples were found that did not match any ID in the observation list. Skipping them...' % len(missing_samples))
+
+            # if number of simulated samples < observations: remove observations without samples
+            if len(simulated) < len(self.datalist):
+                n_not_sampled = len(self.datalist) - len(simulated)
+                logging.warning('%i Observations were not sampled, removing them from datalist...' % n_not_sampled)
+                for index in reversed(list(range(len(self.datalist)))):
+                    if self.datalist[index].simulated is None:
+                        del self.datalist[index]
+                del index
+
+            logging.debug("%d simulated values were added to the data list" % (len(ids) - len(missing_samples)))
+
+
+
+    def write_sample_coords(self, obsinputfile):
+        """
+            Write empty sample_coords_file if soundings are present in time interval, just such that general pipeline code does not have to be changed...
+        """
+
+        if self.sample_in_ctdas:
+            return
+
+        if len(self.datalist) <= 1: #== 0:
+            logging.info("No observations found for this time period, no obs file written")
+            return
+
+        # write data required by observation operator for sampling to file
+        f = io.CT_CDF(obsinputfile, method='create')
+        logging.debug('Creating new observations file for ObservationOperator (%s)' % obsinputfile)
+
+        dimsoundings = f.add_dim('soundings', len(self.datalist))        
+        dimdate      = f.add_dim('epoch_dimension', 7)
+        dimchar      = f.add_dim('char', 20)
+        if len(self.datalist) == 1:
+            dimlevels = f.add_dim('levels', len(self.getvalues('pressure')))
+            # freum: inserted but commented Liesbeth's new code for layers for reference,
+            # but I handle them differently.
+            # if len(self.getvalues('av_kernel')) != len(self.getvalues('pressure')):
+            #     dimlayers = f.add_dim('layers',len(self.getvalues('av_kernel')))
+            #     layers = True
+            # else: layers = False
+        else:
+            dimlevels = f.add_dim('levels', self.getvalues('pressure').shape[1])
+            # if self.getvalues('av_kernel').shape[1] != self.getvalues('pressure').shape[1]:
+            #     dimlayers = f.add_dim('layers', self.getvalues('pressure').shape[1] - 1)
+            #     layers = True
+            # else: layers = False
+
+        savedict           = io.std_savedict.copy()
+        savedict['dtype']  = "int64"
+        savedict['name']   = "sounding_id"
+        savedict['dims']   = dimsoundings
+        savedict['values'] = self.getvalues('id').tolist()
+        f.add_data(savedict)
+
+        data = [[d.year, d.month, d.day, d.hour, d.minute, d.second, d.microsecond] for d in self.getvalues('xdate') ]
+        savedict           = io.std_savedict.copy()
+        savedict['dtype']  = "int"
+        savedict['name']   = "date"
+        savedict['dims']   = dimsoundings + dimdate
+        savedict['values'] = data
+        f.add_data(savedict)
+
+        savedict           = io.std_savedict.copy()
+        savedict['name']   = "latitude"
+        savedict['dims']   = dimsoundings
+        savedict['values'] = self.getvalues('lat').tolist()
+        f.add_data(savedict)
+
+        savedict           = io.std_savedict.copy()
+        savedict['name']   = "longitude"
+        savedict['dims']   = dimsoundings
+        savedict['values'] = self.getvalues('lon').tolist()
+        f.add_data(savedict)
+
+        savedict = io.std_savedict.copy()
+        savedict['name']   = "prior"
+        savedict['dims']   = dimsoundings
+        savedict['values'] = self.getvalues('prior').tolist()
+        f.add_data(savedict)
+
+        savedict = io.std_savedict.copy()
+        savedict['name']   = "prior_profile"
+        savedict['dims']   = dimsoundings + dimlevels
+        savedict['values'] = self.getvalues('prior_profile').tolist()
+        f.add_data(savedict)
+
+        savedict = io.std_savedict.copy()
+        savedict['name']   = "averaging_kernel"
+        savedict['dims']   = dimsoundings + dimlevels
+        savedict['values'] = self.getvalues('av_kernel').tolist()
+        f.add_data(savedict)
+
+        savedict = io.std_savedict.copy()
+        savedict['name']   = "pressure_levels"
+        savedict['dims']   = dimsoundings + dimlevels
+        savedict['values'] = self.getvalues('pressure').tolist()
+        f.add_data(savedict)
+
+        # freum vvvv
+        savedict = io.std_savedict.copy()
+        savedict['name']   = "pressure_weighting_function"
+        savedict['dims']   = dimsoundings + dimlevels
+        savedict['values'] = self.getvalues('pressure_weighting_function').tolist()
+        f.add_data(savedict)
+        
+        savedict           = io.std_savedict.copy()
+        savedict['name']   = "latc_0"
+        savedict['dims']   = dimsoundings
+        savedict['values'] = self.getvalues('latc_0').tolist()
+        f.add_data(savedict)
+        
+        savedict           = io.std_savedict.copy()
+        savedict['name']   = "latc_1"
+        savedict['dims']   = dimsoundings
+        savedict['values'] = self.getvalues('latc_1').tolist()
+        f.add_data(savedict)
+        
+        savedict           = io.std_savedict.copy()
+        savedict['name']   = "latc_2"
+        savedict['dims']   = dimsoundings
+        savedict['values'] = self.getvalues('latc_2').tolist()
+        f.add_data(savedict)
+        
+        savedict           = io.std_savedict.copy()
+        savedict['name']   = "latc_3"
+        savedict['dims']   = dimsoundings
+        savedict['values'] = self.getvalues('latc_3').tolist()
+        f.add_data(savedict)
+        
+        savedict           = io.std_savedict.copy()
+        savedict['name']   = "lonc_0"
+        savedict['dims']   = dimsoundings
+        savedict['values'] = self.getvalues('lonc_0').tolist()
+        f.add_data(savedict)
+
+        savedict           = io.std_savedict.copy()
+        savedict['name']   = "lonc_1"
+        savedict['dims']   = dimsoundings
+        savedict['values'] = self.getvalues('lonc_1').tolist()
+        f.add_data(savedict)
+
+        savedict           = io.std_savedict.copy()
+        savedict['name']   = "lonc_2"
+        savedict['dims']   = dimsoundings
+        savedict['values'] = self.getvalues('lonc_2').tolist()
+        f.add_data(savedict)
+
+        savedict           = io.std_savedict.copy()
+        savedict['name']   = "lonc_3"
+        savedict['dims']   = dimsoundings
+        savedict['values'] = self.getvalues('lonc_3').tolist()
+        f.add_data(savedict)
+        
+        # freum ^^^^
+
+        savedict = io.std_savedict.copy()
+        savedict['dtype']  = "char"
+        savedict['name']   = "level_def"
+        savedict['dims']   = dimsoundings + dimchar
+        savedict['values'] = self.getvalues('level_def').tolist()
+        f.add_data(savedict)
+
+        savedict = io.std_savedict.copy()
+        savedict['name']   = "psurf"
+        savedict['dims']   = dimsoundings
+        savedict['values'] = self.getvalues('psurf').tolist()
+        f.add_data(savedict)
+
+        f.close()
+
+
+
+
+
+################### End Class TotalColumnObservations ###################
+
+
+if __name__ == "__main__":
+    pass
diff --git a/da/obsoperators/observationoperator_wrfchem.py b/da/obsoperators/observationoperator_wrfchem.py
new file mode 100755
index 0000000000000000000000000000000000000000..a2e9bc2fe9ee0ab453d0fd8cea3d644d50394ae1
--- /dev/null
+++ b/da/obsoperators/observationoperator_wrfchem.py
@@ -0,0 +1,1400 @@
+"""CarbonTracker Data Assimilation Shell (CTDAS) Copyright (C) 2017 Wouter Peters.
+Users are recommended to contact the developers (wouter.peters@wur.nl) to receive
+updates of the code. See also: http://www.carbontracker.eu.
+
+This program is free software: you can redistribute it and/or modify it under the
+terms of the GNU General Public License as published by the Free Software Foundation,
+version 3. This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with this
+program. If not, see <http://www.gnu.org/licenses/>.
+
+Author : Friedemann Reum
+
+Revision History:
+File created on 26 March 2019.
+
+This module holds functions needed to use the WRF-Chem model within CTDAS. It
+is based on the TM5 observation operator by Wouter Peters.
+
+The input is an rc file that defines a source directory and a run directory.
+The source directory is a directory in which wrf.exe can be called
+(e.g. WRFV3/run/), i.e. it assumes that WRF-Chem has been compiled, that all
+preprocessing has been done with its outputs linked into that directory, and
+that the correct namelist.input is present there. namelist.input will be
+modified in the run directory (e.g. start and end times adapted to each
+cycle).
+"""
+
+# Instructions for pylint (e.g. for code analysis in Spyder):
+# pylint: disable=too-many-instance-attributes
+# pylint: disable=W0201
+# pylint: disable=C0301
+
+# Python libraries
+import os
+import logging
+import shutil
+import subprocess
+import glob
+import re
+import datetime as dtm
+import numpy as np
+import netCDF4 as nc
+import f90nml
+
+# CTDAS modules
+import da.tools.rc as rc
+from da.tools.general import create_dirs
+from da.obsoperators.observationoperator_baseclass import ObservationOperator
+from da.tools.wrfchem.wrfchem_helper import WRFChemHelper
+from da.tools.wrfchem.utilities import utilities
+
+identifier = "wrfchem"
+version = "1.0"
+
+class WRFChemOO(ObservationOperator):
+    """This class holds methods and variables that are needed to run the WRF-Chem model."""
+
+    ########## Public methods
+
+    def __init__(self, rc_filename, dacycle=None):
+        """Initialize WRFChemOO using settings in rc_filename."""
+        
+        # Identify yourself
+        self.ID = identifier
+        self.version = version
+        
+        # Load settings
+        self._load_rc(rc_filename)
+        self._validate_rc()
+
+        # A hardcoded setting (it doesn't make sense to let the user
+        # change it)
+        self.settings["original_save_suffix"] = ".original"
+        
+        # Instantiate a WRFChemHelper object
+        self.wrfhelper = WRFChemHelper(self.settings)
+        self.wrfhelper.validate_settings(["run_dir"])
+
+        logging.info("Observation Operator initialized: %s (%s)",
+                     self.ID, self.version)
+
+
+    def setup(self, dacycle):
+        """Prepare the observation operator for running.
+        
+        This includes reading settings and preparing the run directory.
+        """
+        
+        # Attach the CycleControl object (used for exchanging
+        # information between classes)
+        self.dacycle = dacycle
+        
+        # Infer whether initial and boundary conditions will be
+        # optimized. Only a constant offset can be optimized,
+        # hence the name.
+        self.dacycle.dasystem["do_offset"] = \
+            "sigma_offset" in list(self.dacycle.dasystem.keys())
+        
+        # Set up the directory where WRF is to be run.
+        # Only do this if you are the first data assimilation cycle. The
+        # WRF run directory is reused in subsequent runs to save both
+        # space and time.
+        if self.dacycle["time.restart"] is False:
+            logging.info("This is the first data assimilation cycle.")
+
+            # Do not overwrite existing files, so that if a run is
+            # restarted (e.g. after a failure), the existing files can
+            # be reused. This saves time, since setting up the directory
+            # can take long: WRF input files for ensemble runs are
+            # large.
+            overwrite = False
+
+            self._create_fresh_run_dir(overwrite)
+
+            logging.info("Fresh run dir for WRFChem created: %s",
+                         self.settings["run_dir"])
+
+        # Read WRF namelist (= configuration file)
+        self.namelist = self.wrfhelper.read_namelist(self.settings["run_dir"])
+        
+        # Also tell wrfhelper about the namelist.
+        self.wrfhelper.namelist = self.namelist
+
+        # Some setup tasks only for the first data assimilation cycle.
+        if self.dacycle["time.restart"] is False:
+            # Prepare fluxes files for modification by optimizer
+            self._prepare_flux_files(overwrite)
+            
+            # In case initial/boundary conditions are optimized, save
+            # original wrfinput and wrfbdy files. They serve as
+            # sources for prior ic/bc.
+            if self.dacycle.dasystem["do_offset"]:
+                wrfinput = self.wrfhelper.get_wrf_filenames("wrfinput_d*")
+                wrfbdy = self.wrfhelper.get_wrf_filenames("wrfbdy_d01")
+                self._save_original_files(wrfinput + wrfbdy, overwrite)
+                logging.info("Original initial- and boundary " + \
+                             "conditions files saved.")
+        
+        logging.info("Setup done.")
+
+
+    def get_initial_data(self):
+        """Nothing to do, all preparations moved to _prepare_run"""
+        pass
+
+
+    def run_forecast_model(self, samples=None):
+        """Run the transport model and sample simulated observations"""
+
+        self._prepare_run(samples)
+        self._validate_input()
+        self._run()
+        self._sample_simulated_obs(samples)
+        self._save_data()
+
+
+    def write_members_to_file(self, lag, outdir, statevector):#,endswith=".nc"):
+        """Write ensemble member information to file for WRFChem
+
+        This writes fluxes to wrfchemi* files for one lag, based on the
+        statevector object (passed as an argument), the regions file and
+        the namelist (for dealing with nests).
+
+        In case an offset is optimized, wrfinput and wrfbdy files are
+        also modified accordingly.
+
+        "outdir" is not used.        
+        """
+
+        # Some abbreviations:
+
+        # Data to write
+        members = statevector.ensemble_members[lag]        
+        add_flux_fix = "flux_fix" in list(self.settings.keys())
+        # Number of flux parameters per flux process.
+        # freum 2021-07-11: Use statevector.nparams_proc. This here
+        # would have been wrong if an offset was included and multiple
+        # emission processes used.
+        # nparams = int(self.dacycle.dasystem["nparameters"])
+        # nparams_proc = nparams/int(self.dacycle.dasystem["n_emis_proc"])
+        nparams_proc = statevector.nparams_proc
+        # Number of flux processes
+        n_emis_proc = int(self.dacycle.dasystem["n_emis_proc"])
+        # WRF configuration
+        run_dir = self.settings["run_dir"]
+        n_domains = self.namelist["domains"]["max_dom"]
+        nmember_format = "%03d"
+
+
+        # Get member parameter maps for each domain
+        param_map_mem_dom = list()
+        for nproc in range(n_emis_proc):
+            # New parameter map list for this flux process
+            param_map_mem_dom.append(list())            
+            for mem in members:
+                # Get full parameter map for this member
+                param_map_mem = statevector.vector2grid(
+                    mem.param_values[(nparams_proc*nproc):(nparams_proc*(nproc+1))])
+                # Get domain-specific parameter maps from full map
+                nm = mem.membernumber
+                param_map_mem_dom[nproc].append(list())
+                for domain in range(1, n_domains+1):
+                    param_map_mem_dom[nproc][nm].append(self._sample_domain_map(param_map_mem, domain))
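+        # Resulting structure: param_map_mem_dom[nproc][member][domain-1]
+        # holds the 2D map of flux scaling factors for that process,
+        # ensemble member and WRF domain.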
+
+        # Write fluxes for each domain:
+        
+        # Times that correspond to this lag
+        time_lag_start = self.dacycle["time.sample.start"]
+        time_lag_end = self.dacycle["time.sample.end"]
+        
+        for domain in range(1, n_domains+1): # WRF domain numbers start at 1
+            
+            # Get the start times of wrfchemi files that belong to this lag
+            # First, get all times
+            file_times = self.wrfhelper.wrf_filename_times("wrfchemi_d%02d_"%domain)
+            file_times.sort()
+            # Then filter for the current lag.
+            file_times_lag = [file_times[i] for i in range(len(file_times))
+                              if
+                                  file_times[i] >= time_lag_start
+                                  and file_times[i] < time_lag_end
+                             ]
+
+            # Loop over flux files
+            for file_time in file_times_lag:
+                # Get file path
+                fn = "".join(["wrfchemi_d%02d_" % domain,
+                              file_time.strftime("%Y-%m-%d_%H:%M:%S")])
+                fp = os.path.join(run_dir, fn)
+
+                # Path to original file
+                ofp = fp + self.settings["original_save_suffix"]
+
+                # Which times in file to modify?
+                # Warning: I always use flux files with only one time
+                # index, so this part isn't tested.
+
+                # First, get all times
+                src = nc.Dataset(ofp, "r")
+                flux_times = self.wrfhelper.times_in_wrf_file(src)
+                # Then filter for the current lag
+                indices = [i for i in range(len(flux_times))
+                           if
+                               flux_times[i] >= time_lag_start
+                               and flux_times[i] < time_lag_end
+                          ]
+                # And finally make indices for slicing.
+                # This requires the selected indices to be contiguous
+                # (which holds if the flux times are monotonically
+                # increasing), so check that.
+                if np.any(np.diff(indices) != 1):
+                    raise ValueError("Times in flux file " + \
+                                     "'%s' " % ofp + \
+                                     "are not monotonically increasing")
+                i0 = indices[0]
+                i1 = indices[-1] + 1
+                
+                logging.debug("Writing fluxes to file %s, " % fp + \
+                              "time index slice %d:%d" %(i0, i1))
+
+                # Open destination file
+                dst = nc.Dataset(fp, "r+")
+
+                # Read prior fluxes
+                prefix = self.settings["flux_optim"]
+                src_field = list()
+                for nproc in range(n_emis_proc):
+                    if n_emis_proc==1:
+                        src_varname = prefix
+                    else:
+                        src_varname = prefix + "_proc_" + str(nproc+1)
+                    src_var = src.variables[src_varname]
+                    src_field.append(src_var[i0:i1, :, :, :])
+
+                # Read fixed fluxes (if required)
+                if add_flux_fix:
+                    fix_var = src.variables[self.settings["flux_fix"]]
+                    fix_field = fix_var[i0:i1, :, :, :]
+
+                # Write fluxes for every ensemble member
+                for mem in members:
+
+                    # Test:
+                    # For testing the orientation of param_map_domain and
+                    # src_field, make the bottom right corner 1.
+                    # param_map_domain[0, -1] = 1
+                    # src_field[0, 0, 0, -1] = 1 # This was before n_emis_proc
+
+                    # Multiply prior fluxes with statevector map
+                    field_state = np.zeros_like(src_field[0])
+                    nm = mem.membernumber
+                    for nproc in range(n_emis_proc):
+                        for nt in range(field_state.shape[0]):
+                            for nz in range(field_state.shape[1]):
+                                field_state[nt, nz, :, :] = (
+                                    field_state[nt, nz, :, :]
+                                    + src_field[nproc][nt, nz, :, :]
+                                      * param_map_mem_dom[nproc][nm][domain-1][:]
+                                    )
+
+                                # For testing whether the domain map is correct:
+                                # field_state[nt, nz, :, :] = param_map_domain[:]
+
+                    # Add fixed fluxes
+                    if add_flux_fix:
+                        field_state += fix_field
+
+                    # Write to file handle (written to file when
+                    # dst.close() is called)
+                    varname = prefix + "_" + nmember_format % nm
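+                    # e.g. with flux_optim = "E_CO2" (a hypothetical name),
+                    # member 7 writes to the variable "E_CO2_007"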
+                    dst.variables[varname][i0:i1, :, :, :] = field_state[:]
+                
+                src.close()
+                dst.close()
+                
+                # For debugging, copy dst dataset
+                #path = os.path.join(self.dacycle["dir.da_run"], "output")
+                #time_lag_start = self.dacycle["time.sample.start"]
+                #timestamp = time_lag_start.strftime("_lag%Y%m%d")
+                #self.wrfhelper.save_file_with_timestamp(fp, path, timestamp)
+        
+        # Add offset to wrfinput and wrfbdy
+        # Note: Everything but a constant offset would impact _BTXE
+        # and so on in wrfbdy. So only implementing a constant offset.
+        if self.dacycle.dasystem["do_offset"]:
+            if self.dacycle["time.restart"]:
+                logging.warning(
+                    "do_offset = True and time.restart = True. Is " + \
+                    "this an advance step? If not, you are " + \
+                    "attempting to optimize initial and boundary " + \
+                    "conditions in a run with more than one cycle. " + \
+                    "This would produce an inconsistent result, " + \
+                    "because initial conditions also impact " + \
+                    "observations in previous cycles, but this " + \
+                    "isn't accounted for. Skipping."
+                )
+            else:
+                # Write wrfbdy
+                fp = os.path.join(self.settings["run_dir"], "wrfbdy_d01")
+                ofp = fp + self.settings["original_save_suffix"]
+                src = nc.Dataset(ofp, "r")
+                dst = nc.Dataset(fp, "r+")
+                quantities = ["BXS", "BXE", "BYS", "BYE"]
+                for mem in members:
+                    offset_value = mem.param_values[-1]
+                    var_names = [self.settings["tracer_optim"] + \
+                                 "_" + nmember_format % mem.membernumber + \
+                                 "_" + q for q in quantities]
+                    for var_name in var_names:
+                        var_src = src.variables[var_name][:]
+                        dst.variables[var_name][:] = var_src + offset_value
+                src.close()
+                dst.close()
+            
+                # Write wrfinput
+                for domain in range(1, n_domains+1):
+                    fp = os.path.join(self.settings["run_dir"],
+                                      "wrfinput_d%02d" %  domain)
+                    ofp = fp + self.settings["original_save_suffix"]
+                    src = nc.Dataset(ofp, "r")
+                    dst = nc.Dataset(fp, "r+")
+                    for mem in members:
+                        offset_value = mem.param_values[-1]
+                        var_name = self.settings["tracer_optim"] + \
+                                   "_" + nmember_format % mem.membernumber
+                        var_src = src.variables[var_name][:]
+                        dst.variables[var_name][:] = var_src + offset_value
+                    src.close()
+                    dst.close()
+        
+        logging.info("write_members_to_file done.")
+
+
+    ########## Private methods
+
+    def _prepare_run(self, samples):
+        """Prepare a forward model WRF-Chem run.
+
+        - Update WRF's configuration file (namelist.input)
+        - Define the file names where sampled simulated observations
+          will be stored.
+        """
+
+        # Update parameters for new run (i.e., start and stop time)
+        self._update_namelist_file()
+
+        # Define the file names of sampled simulated observations.
+        # Store a pattern for finding the right file in write_simulated*,
+        # because those methods don't know the integer sample indices.
+        self._sim_fpattern = "wrf_sampled_%%s_%s.nc" % self.dacycle["time.sample.stamp"]
+
+        # Also construct full filename because pipeline relies on
+        # integer indices
+        self.simulated_file = [None] * len(samples)
+        for i, sample in enumerate(samples):
+            self.simulated_file[i] = os.path.join(
+                self.settings["run_dir"],
+                self._sim_fpattern % sample.get_samples_type())
+
+        # Prepare the track input file - NOT USED
+        #self._prepare_wrfinput_track()
+
+
+
+    def _update_namelist_file(self):
+        """Update namelist.input file based on dacycle object
+
+        Updated values:
+        - start time, end time
+        - restart timestep
+        - restart flag
+
+        Note: format of namelist.input is slightly different after
+        running this:
+        - 19, 19, -> 19, 19
+        - 00 -> 0
+        - 2. -> 2.0
+        - "wrfchemi_d<domain>_<date>" -> "wrfchemi_d<domain>_<date>"
+        This could be adjusted in f90nml settings, but it doesn't
+        seem to matter to WRF.
+        """
+
+        # Set start and end time in namelist
+        self._set_namelist_time("start_", self.dacycle["time.sample.start"])
+        self._set_namelist_time("end_", self.dacycle["time.sample.end"])
+
+        # Set restart interval
+        # Compute it (as integer)
+        intrvl_timedelta = (self.dacycle["time.sample.end"]
+                                - self.dacycle["time.sample.start"])
+        intrvl_mins = intrvl_timedelta.total_seconds()/60.
+        if np.abs(intrvl_mins-np.floor(intrvl_mins)) > np.finfo(float).eps:
+            msg = "Cannot safely cast restart_interval in WRF " + \
+                  "namelist to integer. Without this casting, " + \
+                  "restart-files might not be created. Stopping."
+            raise ValueError(msg)
+        # Write to namelist
+        self.namelist["time_control"]["restart_interval"] = int(intrvl_mins)
+
+        # Set restart flag
+        # If this is a restart from a previous cycle, or a previous lag,
+        # the WRF model should start from restart files
+        # (time.sample.window is also used by TM5 observation operator to set
+        # the restart flag, so I guess this is the right value to use...)
+        if self.dacycle["time.restart"] or self.dacycle["time.sample.window"] != 0:
+            logging.info("Configuring WRF-Chem to perform restart.")
+            self.namelist["time_control"]["restart"] = True
+        else:
+            # Note: In case you ever want to start the first cycle with
+            # restart files, change this branch to "pass"
+
+            logging.info("Configuring WRF-Chem to perform cold start.")
+
+            self.namelist["time_control"]["restart"] = False
+
+        # Write namelist changes to file
+        fp = os.path.join(self.settings["run_dir"], "namelist.input")
+        fp_original = fp + self.settings["original_save_suffix"]
+        f90nml.patch(fp_original, self.namelist, fp)
+
+    def _set_namelist_time(self, prefix, date):
+        """Insert datetime object into namelist dictionary."""
+
+        if not isinstance(date, dtm.datetime):
+            msg = "_set_namelist_time: argument 'date' must be a " + \
+                  "datetime.datetime object"
+            logging.error(msg)
+            raise TypeError(msg)
+
+        # Incidentally, the names of the datetime components in WRF
+        # namelists are identical to those of datetime objects, so we can
+        # set them by iterating over their names.
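+        # For example, prefix="start_" and date=datetime(2019, 3, 26) set
+        # start_year = [2019, ...], start_month = [3, ...] and so on,
+        # with one entry per domain.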
+        time_components = ["year", "month", "day", "hour", "minute", "second"]
+        for time_component in time_components:
+            self.namelist["time_control"][prefix + time_component] = \
+                self._repeat_per_domain(getattr(date, time_component))
+
+
+    def _repeat_per_domain(self, value):
+        """Output: list of 'value' repeated max_dom times."""
+
+        n_domains = self.namelist["domains"]["max_dom"]
+        return [value] * n_domains
+
+
+    # def _prepare_wrfinput_track(self):
+    #     """Prepares wrfinput_track.txt for this run
+    #     
+    #     Original version of track_driver can't deal
+    #     with times outside of simulation period. So
+    #     I'm cutting the original file down here.
+    #     STATUS: NOT USED
+    #     """
+    # 
+    #     run_dir = self.settings["run_dir"]
+    #     fp = os.path.join(run_dir, "wrfinput_track.txt")
+    #     fp_original = fp + self.settings["original_save_suffix"]
+    # 
+    #     if not os.path.exists(fp_original):
+    #         return
+    # 
+    #     # Read original track
+    #     # Read all as str: have to parse times, and will keep format of
+    #     # locations
+    #     dat = np.loadtxt(fp_original, dtype=str)
+    # 
+    #     # Select lines that are in current lag
+    #     times = [dtm.datetime.strptime(t_chr, "%Y-%m-%d_%H:%M:%S") for t_chr in dat[:, 0]]
+    # 
+    #     time_lag_start = self.dacycle["time.sample.start"]
+    #     time_lag_end = self.dacycle["time.sample.end"]
+    # 
+    #     # Write track file for this run
+    #     # Have to exclude 1st and include last timestep here because that's the
+    #     # way track_driver needs it!
+    #     # This is unlike the other cases (fluxes and wrfout), where
+    #     # I include the start and exclude the end of the lag
+    #     f = open(fp, "w")
+    #     for n in range(dat.shape[0]):
+    #         if times[n] > time_lag_start and times[n] <= time_lag_end:
+    #             f.write(" ".join(dat[n, :]) + "\n")
+    #     f.close()
+
+
+    def _load_rc(self, name):
+        """Read settings from the observation operator's rc-file
+        
+        Based on TM5ObservationOperator.load_rc
+        """
+
+        self.rcfile = rc.RcFile(name)
+        self.settings = self.rcfile.values
+        self.rc_filename = name
+
+        logging.debug("rc-file %s loaded", name)
+
+    def _validate_rc(self):
+        """Check that some required values are given in the rc-file.
+
+        Based on TM5ObservationOperator.validate_rc
+        """
+
+        needed_rc_items = ["source_dir", "run_dir"]
+
+        for key in needed_rc_items:
+            if key not in self.settings:
+                msg = "Missing a required value in rc-file : %s" % key
+                logging.error(msg)
+                raise IOError(msg)
+        logging.debug("rc-file has been validated succesfully")
+
+    def _create_fresh_run_dir(self, overwrite):
+        """ Set up the  directory where WRF will run.
+
+        Steps:
+        - Create target_directory
+        - From self.settings["source_dir"] to target_directory,
+          - copy files that need to be modified during runtime
+          - link files that need not be modified
+        - Useful to set overwrite=False for resubmitting a
+          failed run. Note: namelist.input is _always_ copied
+          regardless of the value of overwrite.
+        """
+
+        # Create target_dir
+        target_dir = self.settings["run_dir"]
+        create_dirs(target_dir)
+
+        # By default, files are linked from source directory to
+        # target_dir. Define here which files need to be copied
+        # instead of linked, i.e., those that need to be modified
+        # during runtime.
+        # They are described by patterns that are compared to file
+        # names via re.match(pattern, filename)
+        copy_pattern_list = [r"^namelist\.input$",
+                             r"^wrfchemi_.*",
+                             # r"^wrfinput_track\.txt$" # NOT USED
+                            ]
+        
+        if self.dacycle.dasystem["do_offset"]:
+            copy_pattern_list.append("wrfinput_d.*")
+            copy_pattern_list.append("wrfbdy_d.*")
+                
+        # Combine all patterns into a single alternation so that one
+        # re.match() call covers them all.
+        copy_full_pattern = "(" + "|".join(copy_pattern_list) + ")"
+
+        # Skip some files
+        skip_pattern_list = [r"^wrfout.*",
+                             r"^wrfrst.*",
+                             r"namelist\.output",
+                             r".*~$",
+                             r"log",
+                             r"^rsl.error.*",
+                             r"^rsl.out.*",
+                             r"sampled_.*",
+                             r"met_em.*"
+                            ]
+        skip_full_pattern = "(" + "|".join(skip_pattern_list) + ")"
+
+        # Get list of all files in source directory
+        source_dir = self.settings["source_dir"]
+        source_files = os.listdir(source_dir)
+
+        # Copy/link files to target_dir
+        logging.info("Populating WRF run directory. This may take a while.")
+        for filename in source_files:
+            # Do nothing for files to skip
+            if re.match(skip_full_pattern, filename):
+                continue
+
+            # Copy/link files
+            fp_source = os.path.join(source_dir, filename)
+            fp_destination = os.path.join(target_dir, filename)
+            # Skip if the file is already there and overwrite is false
+            if (os.path.isfile(fp_destination)) and (not overwrite):
+                continue
+            if re.match(copy_full_pattern, filename):
+                # Copy files to copy (with overwrite)
+                shutil.copy2(fp_source, target_dir)
+            else:
+                # Link files to link (with overwrite)
+                if os.path.exists(fp_destination):
+                    os.remove(fp_destination)
+                os.symlink(fp_source, fp_destination)
+
+        # create a log dir
+        create_dirs(os.path.join(target_dir, "log"))
+
+        # Save original namelist.input - ignore overwrite.
+        # Note: this is useful in case you restart CTDAS with an updated namelist. 
+        fp_source = os.path.join(source_dir, "namelist.input")
+        shutil.copy2(fp_source, target_dir)
+        self._save_original_files([os.path.join(target_dir, "namelist.input")], True)
+
+        # # Save original wrfinput_track.txt (if it exists) - NOT USED
+        # fp_source = os.path.join(source_dir, "wrfinput_track.txt")
+        # if os.path.exists(fp_source):
+        #     shutil.copy2(fp_source, target_dir)
+        #     self._save_original_files([os.path.join(target_dir, "wrfinput_track.txt")], True)
+
+    
+    def _prepare_flux_files(self, overwrite=True):
+        """Prepare flux input files for WRF-Chem
+
+        - Save original flux files 
+        - Create fields for ensemble members if not yet present
+        
+        TODO: Saving the original files should eventually be dropped, and
+        creating the member fields is only practical in preprocessing,
+        because it is not parallelized here. So either parallelize this
+        step or remove it; it could be replaced with _validate_emissions.
+        """
+
+        # Flux files for this inversion period
+        all_files = self.wrfhelper.get_wrf_filenames("wrfchemi*00")
+        
+        ti = self.dacycle["time.start"]
+        days_plus = int(self.dacycle["time.nlag"])*int(self.dacycle["time.cycle"])
+        te = self.dacycle["time.finish"] + dtm.timedelta(days_plus)
+        logging.warning("Processing until: te = " + te.isoformat() + ". Too far?")
+        pattern_time = "%Y-%m-%d_%H:%M:%S"
+        len_tstamp = len(pattern_time) + 2
+        all_times = [dtm.datetime.strptime(f[-len_tstamp:], pattern_time)
+                     for f in all_files]
+        wrfchemi_files = [all_files[nf] for nf in range(len(all_files))
+                          if ti <= all_times[nf] <= te]
+
+        # Rename chemistry input files to wrfchemi*.original
+        self._save_original_files(wrfchemi_files, overwrite)
+        logging.info("Original flux files saved.")
+
+        # Write placeholder fields for optimized fluxes into wrfchemi files
+        # -> This takes a lot of time for a large ensemble because it's done
+        # serially. The only practical way is to do this in preprocessing,
+        # so that should become the only allowed way.
+        logging.info("Creating flux fields for ensemble in flux files.")
+        prefix = self.settings["flux_optim"]
+        self._create_member_fields(wrfchemi_files, prefix, "", "zeros", False)
+        logging.info("Flux files prepared.")
+
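A minimal sketch (with a made-up file name) of the timestamp handling used above: WRF-Chem file names end in a %Y-%m-%d_%H:%M:%S stamp, which occupies the last len(pattern_time) + 2 characters of the name because %Y expands to four digits:

    import datetime as dtm

    pattern_time = "%Y-%m-%d_%H:%M:%S"
    len_tstamp = len(pattern_time) + 2   # 17 format characters expand to 19

    fname = "wrfchemi_d01_2016-07-01_00:00:00"   # hypothetical file name
    t = dtm.datetime.strptime(fname[-len_tstamp:], pattern_time)
    print(t)   # 2016-07-01 00:00:00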
+
+    def _save_original_files(self, files, overwrite=True):
+        """ Make a copy of files with a predefined suffix
+
+        For development, add the option to skip (because it
+        takes a while).
+        files = list
+        """
+
+        suffix = self.settings["original_save_suffix"]
+        for file_ in files:
+            new_filename = file_ + suffix
+            if (os.path.isfile(new_filename)) and (not overwrite):
+                logging.debug("File %s exists, skipping", new_filename)
+                continue
+            shutil.copy2(file_, new_filename)
+            logging.debug("Copied %s to %s", file_, new_filename)
+
+
+    def _create_member_fields(self, files, prefix, suffix="",
+                              mode="zeros", overwrite=True):
+        """ Create fields for each ensemble member for the original prefix+suffix.
+
+        Since this takes a long time, I add the option to not overwrite in
+        case the fields are already there, i.e. created in preprocessing. 
+        This is actually the only practical way at the moment. Either edit
+        this function out or parallelize.
+
+        This assumes that original files were previously saved with
+        _self._save_original_files().
+
+        mode = "zeros": All zeros (will be used for emissions)
+        mode = "copy": Copy original (was for boundary- and initial
+                       conditions, but that's not done anymore.)
+        files = list
+        Attributes are copied in any case. The member number is added to the
+        "description" attribute.
+
+        """
+
+        # For each domain, and each file, write a field of 0's for each
+        # ensemble member (<prefix>_000, ...)
+        # Netcdf operations from https://stackoverflow.com/questions/15141563/python-netcdf-making-a-copy-of-all-variables-and-attributes-but-one
+
+        # Specify directory to operate in
+        run_dir = self.settings["run_dir"]
+        # suffix for original
+        orig_suf = self.settings["original_save_suffix"]
+        # Number of ensemble members
+        nmembers = int(self.dacycle["da.optimizer.nmembers"])
+        # Format of ensemble members
+        nmember_format = "%03d"
+        # Name of emission fields to copy = prefix+suffix
+        var_orig = prefix+suffix
+        # Create arrays of 0s for complete emission field in all files
+        have_src = False
+        for f in files:
+            # File path
+            fp = os.path.join(run_dir, f)
+            # Get variable that you want to emulate from original file
+            ofp = fp + orig_suf
+            src = nc.Dataset(ofp, "r")
+            src_var = src.variables[var_orig]
+            if mode == "zeros":
+                if not have_src:
+                    # Only create this once
+                    field_to_write = np.zeros(src_var.shape, dtype=src_var.dtype)
+                    have_src=True
+            elif mode == "copy":
+                field_to_write = src_var[:] # always need new source
+            else:
+                raise ValueError("Unknown mode: %s" % mode)
+
+            # Open netcdf handle in new file
+            dst = nc.Dataset(fp, "r+")
+
+            for n in range(nmembers):
+                nmember_formatted = nmember_format % n
+                # field name for this member
+                fn = "".join([prefix, "_", nmember_formatted, suffix])
+                # Skip if it should not be overwritten
+                if (fn in list(dst.variables.keys())) and (not overwrite):
+                    continue
+                # Create 0-field for this member
+                dst.createVariable(fn, src_var.dtype, src_var.dimensions)
+                # Write the data
+                dst.variables[fn][:] = field_to_write
+                # Copy variable attributes all at once via dictionary
+                dst.variables[fn].setncatts(src_var.__dict__)
+                # Add ensemble member number to description of variable
+                dst.variables[fn].setncattr(
+                    "description",
+                    dst.variables[fn].getncattr("description") + \
+                        " ensemble_member_" + nmember_formatted)
+            src.close()
+            dst.close()
+
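As a quick sanity check of the result, the per-member fields can be listed next to the original variable; a sketch assuming the tracer/flux naming used in the example rc files below (E_CO2 with a three-digit member suffix) and a made-up file name:

    import netCDF4 as nc

    # Hypothetical check that E_CO2_000, E_CO2_001, ... were created and that
    # the member number was appended to the "description" attribute.
    with nc.Dataset("wrfchemi_d01_2016-07-01_00:00:00", "r") as ds:
        member_vars = sorted(v for v in ds.variables if v.startswith("E_CO2_"))
        print(len(member_vars), "member fields found")
        print(ds.variables[member_vars[0]].getncattr("description"))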
+
+    def _validate_input(self):
+        """Validate that WRF has all the inputs it needs to run
+
+        Mostly only checks existence of files, not their content.
+        """
+
+        # In all cases: Validate parameters and emissions.
+        self._validate_parameters()
+        #self._validate_emissions() # NOT USED
+
+        # Validate different inputs needed for cold start and restart
+        if self.namelist["time_control"]["restart"]:
+            logging.info("Restart run: Validate for restart files,")
+            self._validate_restart_files()
+        else:
+            logging.info("Cold start: Validate initial and boundary " +
+                         "conditions.")
+            self._validate_boundary_conditions()
+            self._validate_initial_conditions()
+
+
+
+    def _validate_parameters(self):
+        """Checks namelist.input and number of allocated tasks
+        
+        namelist.input (WRF's configuration file) can contain
+        instructions on how many tasks to use. This function
+        checks whether namelist.input itself exists, and the
+        number of allocated tasks is compatible with values
+        in the namelist.
+        """
+
+        # Check that the WRF configuration file exists (somewhat redundant,
+        # since it has already been read in before this point).
+        self._confirm_file_exists("namelist.input")
+
+        # Confirm that the allocated number of tasks is sufficient
+        # in case the numbers of tasks were explicitly given in the
+        # namelist.
+        self._validate_ntasks()
+
+
+    def _nprocs_namelist(self):
+        """Compute how many tasks namelist.input requests"""
+
+        # Check variables in namelist that specify the number of tasks
+        got_x = "nproc_x" in list(self.namelist["domains"].keys())
+        got_y = "nproc_y" in list(self.namelist["domains"].keys())
+
+        got_quilt = "namelist_quilt" in list(self.namelist.keys())
+        if got_quilt:
+            got_iog = "nio_groups" in list(self.namelist["namelist_quilt"].keys())
+            got_iot = "nio_tasks_per_group" in list(self.namelist["namelist_quilt"].keys())
+        else:
+            got_iog = False
+            got_iot = False
+
+        # Compute requested number of compute processes
+        if got_x or got_y:
+            if got_x != got_y:
+                logging.error("nproc_x xor nproc_y specified in WRF " + \
+                              "namelist, don't know how to deal with that.")
+
+            nproc_x = int(self.namelist["domains"]["nproc_x"])
+            nproc_y = int(self.namelist["domains"]["nproc_y"])
+            ntasks_compute = nproc_x*nproc_y
+        else:
+            ntasks_compute = 0
+
+        # Compute requested number of IO processes
+        if got_iog or got_iot:
+            if got_iog != got_iot:
+                logging.error("nio_groups xor nio_tasks_per_group " + \
+                              "specified in WRF namelist, don't " + \
+                              "know how to deal with that.")
+            nio_groups = int(self.namelist["namelist_quilt"]["nio_groups"])
+            nio_tasks_per_group = int(self.namelist["namelist_quilt"]["nio_tasks_per_group"])
+            ntasks_io = nio_groups*nio_tasks_per_group
+        else:
+            ntasks_io = 0
+
+        # The number of requested processes is the sum of requested
+        # compute and I/O tasks
+        nprocs = ntasks_compute + ntasks_io
+        return nprocs
+
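A worked example of the bookkeeping in _nprocs_namelist(), with made-up namelist values:

    # Hypothetical namelist settings:
    #   &domains:        nproc_x = 8, nproc_y = 12
    #   &namelist_quilt: nio_groups = 2, nio_tasks_per_group = 4
    ntasks_compute = 8 * 12            # 96 compute tasks
    ntasks_io = 2 * 4                  # 8 I/O (quilting) tasks
    print(ntasks_compute + ntasks_io)  # 104 tasks requested in total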
+    def _validate_ntasks(self):
+        """ Confirm that the allocated number of tasks are enough
+        
+        In case numbers of tasks were explicitly given in the namelist.
+        """
+
+        # Number of tasks as specified in CycleControl (comes from
+        # rc-file passed to CTDAS)
+        ntasks_cc = int(self.dacycle["da.resources.ntasks"])
+
+        # Number of processes requested by namelist parameters
+        ntasks_namelist = self._nprocs_namelist()
+
+        # Compare the two.
+        if ntasks_namelist > ntasks_cc:
+            logging.error("WRF namelist requests more processes " + \
+                          "than available in the job.")
+        else:
+            logging.info("Number of allocated tasks OK for WRF namelist.")
+
+
+    def _validate_emissions(self):
+        """Does nothing
+
+        Should check if emissions files exist for this period. But
+        this would require diving into how emissions are saved (e.g.
+        per day, hour, ...), so I'm omitting it. If files are missing,
+        the run will crash later.
+        """
+
+        pass
+
+    def _validate_restart_files(self):
+        """Does nothing
+
+        Should check whether the restart files for this run exist,
+        but filenames depend on io_form_restart.
+        """
+
+        #ndomains = self.namelist["domains"]["max_dom"]
+        #time_format = self.dacycle["time.sample.start"].strftime("%Y-%m-%d_%H:%M:%S")
+        #for n_d in range(1, ndomains + 1):
+        #    restart_file = "wrfrst_d{domain:02d}_".format(domain=n_d) + time_format
+        #    self._confirm_file_exists(restart_file)
+        
+        pass
+
+
+    def _validate_boundary_conditions(self):
+        """Check if boundary condition file exists"""
+
+        # Check if boundary conditions file exist
+        self._confirm_file_exists("wrfbdy_d01")
+
+    def _validate_initial_conditions(self):
+        """Check if initial conditions files exist"""
+
+        ndomains = self.namelist["domains"]["max_dom"]
+        for n_d in range(1, ndomains + 1):
+            input_file = "wrfinput_d{:02d}".format(n_d)
+            self._confirm_file_exists(input_file)
+
+    def _confirm_file_exists(self, filename):
+        """Throw an error if <filename> is not present in run_dir"""
+
+        filepath = os.path.join(self.settings["run_dir"], filename)
+        if not os.path.exists(filepath):
+            msg = "File %s not found." % filename
+            logging.error(msg)
+            raise OSError(msg)
+
+        logging.debug("File %s found.", filename)
+
+
+    def _run(self):
+        """Run the WRF model by submitting wrf.exe to slurm.
+
+        Sources on how to do this:
+        - https://stackoverflow.com/questions/32594734/slurm-multiprocessing-python-job
+        - https://pythonspot.com/python-subprocess/
+        - submit_tm5_tools.py
+
+        It would be cleaner to replace "run_command" and so on with
+        calls to the platform module. However, running TM5 is handled
+        by submit_tm5 and submit_tm5_tools.py instead. On the machine
+        where the model was developed, only full nodes can be reserved
+        anyway. So I copy this behavior.
+        """
+
+        # Option to skip the run if output for this period already exists.
+        # Useful for debugging everything that comes after running
+        # the transport model.
+
+        if "no_rerun_wrf" in list(self.settings.keys()):
+            if self.settings["no_rerun_wrf"].lower() in ["true", "t", "1"]:
+                if self._has_wrfout():
+                    logging.info("no_rerun_wrf = True and transport " + \
+                                 "model output already present for " + \
+                                 "run period. Therefore, not rerunning " + \
+                                 "model and using existing output.")
+                    return
+            elif self.settings["no_rerun_wrf"].lower() in ["false", "f", "0"]:
+                pass
+            else:
+                msg = "Don't understand value " + \
+                      "'%s' " % self.settings["no_rerun_wrf"] + \
+                      "for rc-option 'no_rerun_wrf'." 
+                logging.error(msg)
+                raise ValueError(msg)
+
+        run_dir = self.settings["run_dir"]
+
+        # Change to wrf run directory
+        cwd = os.getcwd()
+        os.chdir(run_dir)
+        
+        # Build command
+        # Basic mpirun command given in rc-file
+        command_ = self.settings["run_command"]
+        # Get number of tasks
+        ntasks = int(self.dacycle["da.resources.ntasks"])
+        if self.settings["run_ntask_opt"]:
+            command_ += " " + self.settings["run_ntask_opt"] + str(ntasks)
+        # Request a runtime. Not that important, actually, it's just a job step.
+        if self.settings["run_t_opt"]:
+            command_ += " " + self.settings["run_t_opt"] + self.settings["run_t"]
+        # Add executable and redirection of output.
+        command = command_ + " ./wrf.exe >& log/wrf.log"
+        
+        # Run WRF model
+        logging.info("Starting WRF model with command: %s" %command)
+        p = subprocess.Popen(command.split(),
+                             stdout=subprocess.PIPE,
+                             stderr=subprocess.STDOUT)
+        # Wait for process to finish (may not be needed)
+        p.wait()
+        logging.info("WRF model run complete.")
+
+        # Check output
+        retcode = utilities.check_out_err(p)
+        if retcode != 0:
+            raise OSError("WRF returned errors. Check logs in %s" % run_dir)
+
+        # Switch back to previous working directory
+        os.chdir(cwd)
+
+    def _has_wrfout(self):
+        """Check if there are wrfout files for start and end of this lag
+
+        Purpose: Option to not rerun the transport model but go straight
+        to sampling wrfout files, used during development of the sampler.
+        """
+        
+        #pylint: disable=unreachable
+
+        run_dir = self.settings["run_dir"]
+        
+        t1 = self.dacycle["time.sample.start"]
+        t2 = self.dacycle["time.sample.end"]
+        
+        fpattern = "wrfout_d01_%Y-%m-%d_%H:%M:%S"
+        fnames = [t.strftime(fpattern) for t in [t1, t2]]
+        fpaths = [os.path.join(run_dir, fn) for fn in fnames]
+        files_exist = list(map(os.path.exists, fpaths))
+        
+        return all(files_exist)
+
+
+    def _sample_simulated_obs(self, samples):
+        """Sample simulated observations
+
+        Calls functions that sample 4D modeled mixing ratio fields
+        at the times and locations of observations.
+        """
+
+        for sample in samples:
+            sample_type = sample.get_samples_type()
+            if sample_type == "column":
+                logging.info("Starting _launch_wrfout_column_sampling")
+                self._launch_wrfout_column_sampling(sample)
+                logging.info("Finished _launch_wrfout_column_sampling")
+            elif sample_type == "point":
+                self._launch_wrfout_point_sampling(sample)
+            else:
+                logging.error("Unknown sample type: %s",
+                              sample.get_samples_type())
+
+
+    def _launch_wrfout_point_sampling(self, samples):
+        """Raises an error that says it hasn't been implemented.
+        
+        Should sample WRF output at coordinates of point observations.
+        Not implemented yet.
+        """
+
+        msg = "Point sampling not implemented."
+        logging.error(msg)
+        raise NotImplementedError(msg)
+
+
+    def _launch_wrfout_column_sampling(self, sample):
+        """Sample WRF output at coordinates of column observations."""
+
+        run_dir = self.settings["run_dir"]
+
+        sampling_coords_file = os.path.join(
+            self.dacycle["dir.input"],
+            sample.get_samples_type() + \
+            "_coordinates_%s.nc" % self.dacycle["time.sample.stamp"])
+
+        # Reconstruct self.simulated_file[i]
+        out_file = self._sim_fpattern % sample.get_samples_type()
+
+        # Remove intermediate files from a previous sampling job (they might
+        # still be there if that job failed).
+        # The file pattern is hardcoded in wrfout_sampler.
+        slicefile_pattern = out_file + ".*.slice"
+        for f in glob.glob(os.path.join(run_dir, slicefile_pattern)):
+            os.remove(f)
+
+        # Spawn multiple wrfchem_sampler instances,
+        # using at most all processes available
+        nprocs1 = int(self.dacycle["da.resources.ntasks"])
+
+        # Using that many processes is not worthwhile if there are few
+        # observations, because of overhead. Set a minimum number of
+        # observations per process and reduce the number of processes
+        # accordingly.
+        Nobs = len(sample.datalist)
+        if Nobs == 0:
+            logging.info("No observations, skipping sampling")
+            return
+        
+        # Minimum observations per process; this value has not been tuned.
+        nobs_min = 100
+        nprocs2 = max(1, int(float(Nobs)/float(nobs_min)))
+
+        # Number of processes to use:
+        nprocs = min(nprocs1, nprocs2)
+        
+        # Make run command
+        # For a task with 1 processor, specifically request -N1 because
+        # otherwise slurm may try to spread a single task over more than
+        # one node. See:
+        # https://stackoverflow.com/questions/24056961/running-slurm-script-with-multiple-nodes-launch-job-steps-with-1-task
+        command_ = self.settings["run_command_1task"]
+
+        # Specifically request memory per process (otherwise the first
+        # one would clog the node and you end up with serial execution)
+        if self.settings["run_mem_opt"]:
+            max_nprocs_per_node = min(int(self.settings["cpus_per_node"]), nprocs)
+            # The +1 leaves a bit of spare memory on the node
+            mb_per_process = np.floor(int(self.settings["mem_per_node"])*1024.
+                                      / float(1 + max_nprocs_per_node))
+            mem_str = str(int(mb_per_process)) + self.settings["mem_unit_str_MB"]
+            command_ += " " + self.settings["run_mem_opt"] + mem_str
+        
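A worked example of the two heuristics above (process count capped by a minimum number of observations per process, and memory split per process), with made-up resource numbers:

    import numpy as np

    # Hypothetical numbers: 24 allocated tasks, 550 observations, nodes with
    # 32 CPUs and mem_per_node = 96 (GB).
    nprocs1, nobs, nobs_min = 24, 550, 100
    nprocs = min(nprocs1, max(1, int(float(nobs) / float(nobs_min))))
    print(nprocs)                      # 5 sampler processes

    max_nprocs_per_node = min(32, nprocs)
    mb_per_process = np.floor(96 * 1024. / float(1 + max_nprocs_per_node))
    print(int(mb_per_process))         # 16384 MB requested per process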
+        # Request a runtime. Not that important, actually, it's just a
+        # job step.
+        if self.settings["run_t_opt"]:
+            command_ = command_ \
+                        + " " + self.settings["run_t_opt"] \
+                        + " " + self.settings["run_t"]
+
+        # Check if output slice files are already present.
+        # This shouldn't happen, because they are deleted a few lines
+        # above. But if for some reason (e.g. a crash) they are still
+        # here, this might lead to unexpected behavior.
+        if nprocs > 1:
+            output_files = glob.glob(slicefile_pattern)
+            if len(output_files) > 0:
+                msg = "Files that match the pattern of the " + \
+                      "sampler output are already present. Stopping."
+                logging.error(msg)
+                raise OSError(msg)
+
+        # Submit processes
+        procs = list()
+        for nproc in range(nprocs):
+            cmd = " ".join([
+                command_,
+                "python ./da/tools/wrfchem/wrfout_sampler.py",
+                "--nproc %d"                 % nproc,
+                "--nprocs %d"                % nprocs,
+                "--sampling_coords_file %s"  % sampling_coords_file,
+                "--run_dir %s"               % run_dir,
+                "--original_save_suffix %s"  % self.settings["original_save_suffix"],
+                "--nmembers %d"              % int(self.dacycle["da.optimizer.nmembers"]),
+                "--tracer_optim %s"          % self.settings["tracer_optim"],
+                "--outfile_prefix %s"        % out_file,
+                "--footprint_samples_dim %d" % int(self.settings['obs.column.footprint_samples_dim'])
+            ])
+            
+            procs.append(subprocess.Popen(cmd.split(),
+                                          stdout=subprocess.PIPE,
+                                          stderr=subprocess.STDOUT))
+
+        logging.info("Started %d sampling process(es).", nprocs)
+        logging.debug("Command of last process: %s", cmd)
+
+        # Wait for all processes to finish
+        for n in range(nprocs):
+            procs[n].wait()
+
+        # Check for errors
+        retcodes = []
+        for n in range(nprocs):
+            logging.debug("Checking errors in process %d", n)
+            retcodes.append(utilities.check_out_err(procs[n]))
+
+        if any([r != 0 for r in retcodes]):
+            raise RuntimeError("At least one sampling process " + \
+                               "finished with errors.")
+
+        logging.info("All sampling processes finished.")
+
+        # Join output files
+        logging.info("Joining output files.")
+        if nprocs > 1:
+            utilities.cat_ncfiles(run_dir, slicefile_pattern,
+                                  "sounding_id", out_file,
+                                  in_pattern=True)
+
+        logging.info("WRF column output sampled.")
+        logging.info("If samples object carried observations, output " + \
+                     "file written to %s", self.simulated_file)
+
+
+    def _save_data(self):
+        """Saves data from this lag
+
+        This doesn't *have* to do anything. WRF is always started from
+        the same directory and takes care of the restarts itself, so no
+        files have to be collected (which is what happens here for TM5).
+        However, if "save_wrf_step" is set to True in the rc-file, then
+        all outputs and the namelist are copied to a subfolder for
+        debugging.
+         """
+
+        # For compatibility:
+        self.restart_filelist = []
+        self.output_filelist = []
+
+
+        # Relevant wrf files that change each time wrf is run
+        if "save_wrf_step" in list(self.settings.keys()):
+            if self.settings["save_wrf_step"].lower() in ["true", "t", "1"]:
+                # Keep some files 
+                time_lag_start = self.dacycle["time.sample.start"]
+                timestamp = time_lag_start.strftime("_lag%Y%m%d")
+                
+                path = os.path.join(self.settings["run_dir"], "wrf"+timestamp)
+                logging.info("Saving wrf files to " + path)
+                
+                if not os.path.exists(path):
+                    os.makedirs(path)
+        
+                # NOT USED
+                # wrfout_track
+                # pattern = os.path.join(self.settings["run_dir"],
+                #                        "wrfout_track_*")
+                # files = glob.glob(pattern)
+                # for f in files:
+                #     self.wrfhelper.save_file_with_timestamp(f, path, timestamp)
+                
+                # Prepare timesteps
+                ti = self.dacycle["time.sample.start"]
+                te = self.dacycle["time.sample.end"]
+                pattern_time = "%Y-%m-%d_%H:%M:%S"
+                len_tstamp = len(pattern_time) + 2
+                
+                # List wrfout files
+                file_pattern = "wrfout*"
+                all_files = self.wrfhelper.get_wrf_filenames(file_pattern)
+                all_times = [dtm.datetime.strptime(f[-len_tstamp:], pattern_time)
+                             for f in all_files]
+                
+                # Save wrfout files that belong to this lag
+                for nf in range(len(all_files)):
+                    if ti <= all_times[nf] <= te:
+                        self.wrfhelper.save_file_with_timestamp(all_files[nf], path, timestamp)
+                
+                # Restart files
+                file_pattern = "wrfrst*"
+                all_files = self.wrfhelper.get_wrf_filenames(file_pattern)
+                all_times = [dtm.datetime.strptime(f[-len_tstamp:], pattern_time)
+                             for f in all_files]
+                for nf in range(len(all_files)):
+                    if all_times[nf] == te:
+                        self.wrfhelper.save_file_with_timestamp(all_files[nf], path, timestamp)
+                
+                # rsl files
+                file_pattern = "rsl.*"
+                all_files = self.wrfhelper.get_wrf_filenames(file_pattern)
+                for nf in range(len(all_files)):
+                    self.wrfhelper.save_file_with_timestamp(all_files[nf], path, timestamp)
+                
+                # Namelist
+                in_path = os.path.join(self.settings["run_dir"], "namelist.input")
+                self.wrfhelper.save_file_with_timestamp(in_path, path, timestamp)
+
+
+    # def _save_track_output(self):
+    #     """
+    #     Save output from track_driver by adding a timestamp.
+    #     STATUS: NOT USED
+    #     """
+    #     run_dir = self.settings["run_dir"]
+    #     ndomains = self.namelist["domains"]["max_dom"]
+    #     time_lag_start = self.dacycle["time.sample.start"]
+    #     timestamp = time_lag_start.strftime("%Y-%m-%d_%H:%M:%S")
+    #     # for testing purposes, also add system time
+    #     nowstamp = dtm.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
+    #     timestamp += "_"+nowstamp
+    # 
+    #     for domain in range(1, ndomains+1):
+    #         fn = "wrfout_track_d%02d" % domain
+    #         fp = os.path.join(run_dir, fn)
+    #         if os.path.exists(fp):
+    #             shutil.copy2(fp, fp + "_" + timestamp)
+
+    def _sample_domain_map(self, full_map, domain):
+        """Sample domain map
+
+        Takes as input a <full_map> with values for domain d01 in
+        resolution of the finest domain (= format of regions file,
+        output of vector2grid) and returns values for extent and
+        resolution of <domain>.
+        """
+
+        # Start indices (0-based)
+        start1 = self._start_in_1(full_map, domain)
+        start = [s1-1 for s1 in start1]
+        if any([s<0 for s in start]):
+            raise ValueError("Something is wrong with start indices")
+
+        # Step sizes
+        strides = self._strides(full_map, domain)
+
+        # Stop indices: add domain size to start
+        stop = start[:]
+        dom = self.namelist["domains"]
+        # -1 here to have #cells not #edges
+        stop[0] = start[0] + (dom["e_we"][domain-1]-1)*strides[0]
+        stop[1] = start[1] + (dom["e_sn"][domain-1]-1)*strides[1]
+
+        # Now sample
+        # Sampling indices as sequence
+        xseq = list(range(start[0], stop[0], strides[0]))
+        yseq = list(range(start[1], stop[1], strides[1]))
+        # Sampling indices as grid
+        xx, yy = np.meshgrid(xseq, yseq)
+
+        # Sample: ATTENTION: axis 0 refers to the latitude!
+        sampled_map = full_map[yy, xx]
+
+        return sampled_map
+
+    def _start_in_1(self, full_map, domain):
+        """Start index of domain in full_map, 1-based"""
+        domains = self._parent_domains(domain)
+        domains.append(domain)
+        istart = 1 # i refers to x (as e_we)
+        jstart = 1 # j refers to y (as e_sn)
+        for ndom in range(len(domains)-1):
+            this_domain = domains[ndom+1]
+            this_parent = domains[ndom]
+            istart += (self.namelist["domains"]["i_parent_start"][this_domain-1]-1) \
+                      *self._strides(full_map, this_parent)[0]
+            jstart += (self.namelist["domains"]["j_parent_start"][this_domain-1]-1) \
+                      *self._strides(full_map, this_parent)[1]
+        return [istart, jstart]
+
+    def _strides(self, full_map, domain):
+        """Step size in <full_map> for <domain>
+
+        I.e. the product of the parent grid ratios between
+        finest domain and <domain>
+        """
+        dom = self.namelist["domains"]
+        # On which resolution is full_map? stride of domain 1:
+        # (Attention: index 1 refers to e_we, index 0 refers to e_sn)
+        stride1_x = full_map.shape[1] / (dom["e_we"][0]-1)
+        stride1_y = full_map.shape[0] / (dom["e_sn"][0]-1)
+        # stride = product of parent grid ratios
+        if domain == 1:
+            ratio_dom = 1
+        else:
+            domains = self._parent_domains(domain)
+            domains.append(domain)
+            ratio_dom = np.array(dom["parent_grid_ratio"])[np.array(domains)-1].prod()
+        # round() converts to int more safely. This still assumes the
+        # namelist and regions file are compatible; if not, this
+        # might return wrong results.
+        stride_x = int(round(stride1_x/ratio_dom))
+        stride_y = int(round(stride1_y/ratio_dom))
+
+        return (stride_x, stride_y)
+
+    def _parent_domains(self, domain):
+        """Figure out the domain parents of this domain."""
+
+        if domain == 1:
+            # No parents
+            return []
+        else:
+            # Add parent domain to list
+            # "-1" because domain numbers start at 1
+            parent = self.namelist["domains"]["parent_id"][domain-1]
+            # Add parents of parent to list
+            parents_of_parent = self._parent_domains(parent)
+            parents_of_parent.append(parent)
+            # Done
+            return parents_of_parent
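A worked example of the nesting arithmetic in _start_in_1() and _strides(), assuming a hypothetical two-domain setup with parent_grid_ratio = [1, 3] and i_parent_start = [1, 20], and a regions file on the resolution of the finest domain (d02):

    # Stride of d01 in the full map is the product of parent grid ratios
    # between d01 and the finest domain, here 3; the stride of d02 is 1.
    stride_d01 = 3

    # d02 starts in parent cell 20 of d01, so its 1-based start index in the
    # full (finest-resolution) map is:
    i_parent_start_d02 = 20
    istart_d02 = 1 + (i_parent_start_d02 - 1) * stride_d01
    print(istart_d02)   # 58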
+
+
+    # Obsolete:
+    #def _get_ntasks_max(self):
+    #    """Get the number of tasks that can be spawned from this process."""
+    #    os_environ_ntask_var = self.settings["os_environ_ntask_var"]
+    #    if os_environ_ntask_var in os.environ.keys():
+    #        # I believe 1 process is taken up by this one, so -1
+    #        ntasks = int(os.environ[os_environ_ntask_var]) - 1
+    #    else:
+    #        ntasks = int(self.settings["ntasks_max_default"])
+    #        msg = "%s not detected in " % os_environ_ntask_var + \
+    #              "environment variables. Setting ntasks=%d" % ntasks
+    #        logging.warning(msg)
+    #
+    #    return ntasks
+    
+    
+if __name__ == "__main__":
+    pass
diff --git a/da/pipelines/pipeline_wrfchem.py b/da/pipelines/pipeline_wrfchem.py
new file mode 100755
index 0000000000000000000000000000000000000000..2cb6a2d5ac81028b84a735245f701e465c04a26c
--- /dev/null
+++ b/da/pipelines/pipeline_wrfchem.py
@@ -0,0 +1,469 @@
+"""CarbonTracker Data Assimilation Shell (CTDAS) Copyright (C) 2017 Wouter Peters. 
+Users are recommended to contact the developers (wouter.peters@wur.nl) to receive
+updates of the code. See also: http://www.carbontracker.eu. 
+
+This program is free software: you can redistribute it and/or modify it under the
+terms of the GNU General Public License as published by the Free Software Foundation, 
+version 3. This program is distributed in the hope that it will be useful, but 
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with this 
+program. If not, see <http://www.gnu.org/licenses/>."""
+#!/usr/bin/env python
+# pipeline.py
+
+"""
+.. module:: pipeline
+.. moduleauthor:: Wouter Peters 
+
+Revision History:
+
+2021-07-10   freum   Version for wrfchem
+This is the wrfchem version, which was started in 2019 and does not yet include
+all changes that were made to the original pipeline since then. They can probably
+be integrated easily, so that wrfchem can use the standard pipeline.
+Differences to original:
+- Everything related to "inversion.savestate.dir" is not included here
+- Instead of "if sample.get_samples_type() == 'flask': ", samples includes an
+  empty function write_sample_auxiliary
+
+File created on 06 Sep 2010.
+
+The pipeline module holds methods that execute consecutive tasks with each of the objects of the DA system. 
+
+"""
+import logging
+import os
+import sys
+import datetime
+import copy
+
+header = '\n\n    ***************************************   '
+footer = '    *************************************** \n  '
+
+
+def ensemble_smoother_pipeline(dacycle, platform, dasystem, samples, statevector, obsoperator, optimizer):
+    """ The main point of entry for the pipeline """
+    sys.path.append(os.getcwd())
+
+    samples = samples if isinstance(samples,list) else [samples]
+
+    logging.info(header + "Initializing current cycle" + footer)
+    start_job(dacycle, dasystem, platform, statevector, samples, obsoperator)
+
+    prepare_state(dacycle, statevector)
+
+    sample_state(dacycle, samples, statevector, obsoperator)
+
+    invert(dacycle, statevector, optimizer)
+
+    advance(dacycle, samples, statevector, obsoperator)
+
+    save_and_submit(dacycle, statevector)
+    logging.info("Cycle finished...exiting pipeline")
+
+
+
+def forward_pipeline(dacycle, platform, dasystem, samples, statevector, obsoperator):
+    """ The main point of entry for the pipeline """
+    sys.path.append(os.getcwd())
+
+    samples = samples if isinstance(samples,list) else [samples]
+
+    logging.info(header + "Initializing current cycle" + footer)
+    start_job(dacycle, dasystem, platform, statevector, samples, obsoperator)
+
+    if 'forward.savestate.exceptsam' in dacycle:
+        sam = (dacycle['forward.savestate.exceptsam'].upper() in ["TRUE","T","YES","Y"])
+    else:
+        sam = False
+
+    if 'forward.savestate.dir' in dacycle:
+        fwddir = dacycle['forward.savestate.dir']
+    else:
+        logging.debug("No forward.savestate.dir key found in rc-file, proceeding with self-constructed prior parameters")
+        fwddir = False
+
+    if 'forward.savestate.legacy' in dacycle:
+        legacy = (dacycle['forward.savestate.legacy'].upper() in ["TRUE","T","YES","Y"])
+    else:
+        legacy = False
+        logging.debug("No forward.savestate.legacy key found in rc-file")
+
+    if not fwddir:
+        # Simply make a prior statevector using the normal method
+        prepare_state(dacycle, statevector)  # LU: instead of this, rather create a new covariance and ensembles here, since the remaining steps are handled above and below.
+    else:
+        # Read prior information from another simulation into the statevector.
+        # This loads the results from another assimilation experiment into the current statevector
+
+        if sam:
+            filename = os.path.join(fwddir, dacycle['time.start'].strftime('%Y%m%d'), 'savestate_%s.nc'%dacycle['time.start'].strftime('%Y%m%d'))
+            #filename = os.path.join(fwddir, dacycle['time.start'].strftime('%Y%m%d'), 'savestate.nc')
+            statevector.read_from_file_exceptsam(filename, 'prior')
+        elif not legacy:
+            filename = os.path.join(fwddir, dacycle['time.start'].strftime('%Y%m%d'), 'savestate_%s.nc'%dacycle['time.start'].strftime('%Y%m%d'))
+            statevector.read_from_file(filename, 'prior')
+        else:
+            filename = os.path.join(fwddir, dacycle['time.start'].strftime('%Y%m%d'), 'savestate.nc')
+            statevector.read_from_legacy_file(filename, 'prior')
+
+
+    # We write this "prior" statevector to the restart directory, so we can later also populate it with the posterior statevector
+    # Note that we could achieve the same by just copying the wanted forward savestate.nc file to the restart folder of our current
+    # experiment, but then it would already contain a posterior field as well which we will try to write in save_and_submit.
+    # This could cause problems. Moreover, this method allows us to read older formatted savestate.nc files (legacy) and write them into
+    # the current format through the "write_to_file" method.
+
+    savefilename = os.path.join(dacycle['dir.restart'], 'savestate_%s.nc' % dacycle['time.start'].strftime('%Y%m%d'))
+    statevector.write_to_file(savefilename, 'prior')
+
+    # Now read optimized fluxes which we will actually use to propagate through the system
+
+    if not fwddir:
+        # if there is no forward dir specified, we simply run forward with unoptimized prior fluxes in the statevector
+        logging.info("Running forward with prior savestate from: %s"%savefilename)
+
+    else:
+        # Read posterior information from another simulation into the statevector.
+        # This loads the results from another assimilation experiment into the current statevector
+
+        if sam:
+            statevector.read_from_file_exceptsam(filename, 'opt')
+        elif not legacy:
+            statevector.read_from_file(filename, 'opt')
+        else:
+            statevector.read_from_legacy_file(filename, 'opt')
+
+        logging.info("Running forward with optimized savestate from: %s"%filename)
+
+    # Finally, we run forward with these parameters
+    advance(dacycle, samples, statevector, obsoperator)
+
+    # In save_and_submit, the posterior statevector will be added to the savestate.nc file, and it is added to the copy list.
+    # This way, we have both the prior and posterior data from another run copied into this assimilation, for later analysis.
+
+    save_and_submit(dacycle, statevector)
+
+    logging.info("Cycle finished...exiting pipeline")
+####################################################################################################
+
+def analysis_pipeline(dacycle, platform, dasystem, samples, statevector):
+    """ Main entry point for analysis of ctdas results """
+
+    from da.analysis.cteco2.expand_fluxes import save_weekly_avg_1x1_data, save_weekly_avg_state_data, save_weekly_avg_tc_data, save_weekly_avg_ext_tc_data, save_weekly_avg_agg_data
+    from da.analysis.cteco2.expand_molefractions import write_mole_fractions
+    from da.analysis.cteco2.summarize_obs import summarize_obs
+    from da.analysis.cteco2.time_avg_fluxes import time_avg
+
+    logging.info(header + "Starting analysis" + footer)
+
+    dasystem.validate()
+    dacycle.dasystem = dasystem
+    dacycle.daplatform = platform
+    dacycle.setup()
+    statevector.setup(dacycle)
+
+    logging.info(header + "Starting mole fractions" + footer)
+
+    write_mole_fractions(dacycle)
+    summarize_obs(dacycle['dir.analysis'])
+
+    logging.info(header + "Starting weekly averages" + footer)
+
+    save_weekly_avg_1x1_data(dacycle, statevector)
+    save_weekly_avg_state_data(dacycle, statevector)
+    save_weekly_avg_tc_data(dacycle, statevector)
+    save_weekly_avg_ext_tc_data(dacycle)
+    save_weekly_avg_agg_data(dacycle,region_aggregate='transcom')
+    save_weekly_avg_agg_data(dacycle,region_aggregate='transcom_extended')
+    save_weekly_avg_agg_data(dacycle,region_aggregate='olson')
+    save_weekly_avg_agg_data(dacycle,region_aggregate='olson_extended')
+    save_weekly_avg_agg_data(dacycle,region_aggregate='country')
+
+    logging.info(header + "Starting monthly and yearly averages" + footer)
+
+    time_avg(dacycle,'flux1x1')
+    time_avg(dacycle,'transcom')
+    time_avg(dacycle,'transcom_extended')
+    time_avg(dacycle,'olson')
+    time_avg(dacycle,'olson_extended')
+    time_avg(dacycle,'country')
+
+    logging.info(header + "Finished analysis" + footer)
+
+
+
+def archive_pipeline(dacycle, platform, dasystem):
+    """ Main entry point for archiving of output from one disk/system to another """
+
+    if not 'task.rsync' in dacycle:
+        logging.info('rsync task not found, not starting automatic backup...')
+        return
+    else:
+        logging.info('rsync task found, starting automatic backup...')
+
+    for task in dacycle['task.rsync'].split():
+        sourcedirs = dacycle['task.rsync.%s.sourcedirs'%task]
+        destdir = dacycle['task.rsync.%s.destinationdir'%task]
+
+        rsyncflags = dacycle['task.rsync.%s.flags'%task]
+
+        # file ID and names
+        jobid = dacycle['time.end'].strftime('%Y%m%d')
+        targetdir = os.path.join(dacycle['dir.exec'])
+        jobfile = os.path.join(targetdir, 'jb.rsync.%s.%s.jb' % (task,jobid) )
+        logfile = os.path.join(targetdir, 'jb.rsync.%s.%s.log' % (task,jobid) )
+        # Template and commands for job
+        jobparams = {'jobname':"r.%s" % jobid, 'jobnodes': '1', 'jobtime': '1:00:00', 'joblog': logfile, 'errfile': logfile}
+
+        if platform.ID == 'cartesius':
+            jobparams['jobqueue'] = 'staging'
+
+        template = platform.get_job_template(jobparams)
+        for sourcedir in sourcedirs.split():
+            execcommand = "\nrsync %s %s %s\n" % (rsyncflags, sourcedir,destdir,)
+            template += execcommand
+
+        # write and submit
+        platform.write_job(jobfile, template, jobid)
+        jobid = platform.submit_job(jobfile, joblog=logfile)
+
+
+
+def start_job(dacycle, dasystem, platform, statevector, samples, obsoperator):
+    """ Set up the job specific directory structure and create an expanded rc-file """
+
+    dasystem.validate()
+    dacycle.dasystem = dasystem
+    dacycle.daplatform = platform
+    dacycle.setup()
+    #statevector.dacycle = dacycle # also embed object in statevector so it can access cycle information for I/O etc
+    #samples.dacycle = dacycle # also embed object in samples object so it can access cycle information for I/O etc
+    #obsoperator.dacycle = dacycle # also embed object in obsoperator object so it can access cycle information for I/O etc
+    obsoperator.setup(dacycle)  # Setup Observation Operator
+    statevector.setup(dacycle)
+
+
+def prepare_state(dacycle, statevector):
+    """ Set up the input data for the forward model: obs and parameters/fluxes"""
+
+    # We now have an empty statevector object that we need to populate with data. If this is a continuation from a previous cycle, we can read
+    # the previous statevector values from a NetCDF file in the restart directory. If this is the first cycle, we need to populate the statevector
+    # with new values for each week. After we have constructed the statevector, it will be propagated by one cycle length so it is ready to be used
+    # in the current cycle
+
+    logging.info(header + "starting prepare_state" + footer)
+
+    if not dacycle['time.restart']:
+
+        # Fill each week from n=1 to n=nlag with a new ensemble
+        for n in range(statevector.nlag):
+            date = dacycle['time.start'] + datetime.timedelta(days=(n + 0.5) * int(dacycle['time.cycle']))
+            cov = statevector.get_covariance(date, dacycle)
+            statevector.make_new_ensemble(n, cov)
+
+    else:
+        # Read the statevector data from file
+        #saved_sv = os.path.join(dacycle['dir.restart.current'], 'savestate.nc')
+        saved_sv = os.path.join(dacycle['dir.restart'], 'savestate_%s.nc' % dacycle['da.restart.tstamp'].strftime('%Y%m%d'))
+        statevector.read_from_file(saved_sv) # by default will read "opt"(imized) variables, and then propagate
+
+        # Now propagate the ensemble by one cycle to prepare for the current cycle
+        statevector.propagate(dacycle)
+
+    # Finally, also write the statevector to a file so that we can always access the a-priori information
+    current_sv = os.path.join(dacycle['dir.restart'], 'savestate_%s.nc' % dacycle['time.start'].strftime('%Y%m%d'))
+    statevector.write_to_file(current_sv, 'prior')  # write prior info
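A small stand-alone check of the mid-cycle dates used when filling the lags with new ensembles above, assuming time.cycle = 7 days and a hypothetical start date:

    import datetime

    start = datetime.datetime(2016, 7, 1)
    cycle_days = 7
    for n in range(2):
        print(start + datetime.timedelta(days=(n + 0.5) * cycle_days))
    # 2016-07-04 12:00:00 and 2016-07-11 12:00:00, i.e. the middle of each lag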
+
+
+
+def sample_state(dacycle, samples, statevector, obsoperator):
+    """ Sample the filter state for the inversion """
+
+    # Before a forecast step, save all the data to a save/tmp directory so we can later recover it before the propagation step.
+    # This is especially important for:
+    #  (i) The transport model restart data which holds the background mole fractions. This is needed to run the model one cycle forward
+    #  (ii) The random numbers (or the seed for the random number generator) so we can recreate the ensembles if needed
+
+    #status   = dacycle.MoveSaveData(io_option='store',save_option='partial',filter=[])
+    #msg     = "All restart data have been copied to the save/tmp directory for future use"    ; logging.debug(msg)
+    logging.info(header + "starting sample_state" + footer)
+    nlag = int(dacycle['time.nlag'])
+    logging.info("Sampling model will be run over %d cycles" % nlag)
+
+    obsoperator.get_initial_data()
+
+    for lag in range(nlag):
+        logging.info(header + ".....Ensemble Kalman Filter at lag %d" % (lag + 1))
+
+        ############# Perform the actual sampling loop #####################
+
+        sample_step(dacycle, samples, statevector, obsoperator, lag)
+
+        logging.debug("statevector now carries %d samples" % statevector.nobs)
+
+
+
+def sample_step(dacycle, samples, statevector, obsoperator, lag, advance=False):
+    """ Perform all actions needed to sample one cycle """
+
+    # First set up the information for time start and time end of this sample
+    dacycle.set_sample_times(lag)
+
+    startdate = dacycle['time.sample.start']
+    enddate = dacycle['time.sample.end']
+    dacycle['time.sample.window'] = lag
+    dacycle['time.sample.stamp'] = "%s_%s" % (startdate.strftime("%Y%m%d%H"), enddate.strftime("%Y%m%d%H"))
+
+    logging.info("New simulation interval set : ")
+    logging.info("                  start date : %s " % startdate.strftime('%F %H:%M'))
+    logging.info("                  end   date : %s " % enddate.strftime('%F %H:%M'))
+    logging.info("                  file  stamp: %s " % dacycle['time.sample.stamp'])
+
+
+    # Implement something that writes the ensemble member parameter info to file, or manipulates them further into the
+    # type of info needed in your transport model
+
+    statevector.write_members_to_file(lag, dacycle['dir.input'], obsoperator=obsoperator)
+
+    for sample in samples:
+
+        sample.setup(dacycle)
+
+        # Read observations + perform observation selection
+        sample.add_observations()
+
+        # Add model-data mismatch to all samples, this *might* use output from the ensemble in the future??
+        sample.add_model_data_mismatch(dacycle.dasystem['obs.sites.rc'])
+
+        sampling_coords_file = os.path.join(dacycle['dir.input'], sample.get_samples_type()+'_coordinates_%s.nc' % dacycle['time.sample.stamp'])
+        logging.debug('sampling_coords_file = %s' % sampling_coords_file)
+        sample.write_sample_coords(sampling_coords_file)
+
+        # Write filename to dacycle, and to output collection list
+        dacycle['ObsOperator.inputfile.'+sample.get_samples_type()] = sampling_coords_file
+
+    del sample
+
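For reference, a sketch of the time stamp and sampling-coordinates file name constructed above, for a hypothetical one-week column sample:

    import datetime

    startdate = datetime.datetime(2016, 7, 1)
    enddate = datetime.datetime(2016, 7, 8)
    stamp = "%s_%s" % (startdate.strftime("%Y%m%d%H"),
                       enddate.strftime("%Y%m%d%H"))
    print("column_coordinates_%s.nc" % stamp)
    # column_coordinates_2016070100_2016070800.nc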
+    # Run the observation operator
+    obsoperator.run_forecast_model(samples)
+
+
+    # Read forecast model samples that were written to NetCDF files by each member. Add them to the existing
+    # Observation object for each sample loop. This data will be written to file in the output folder for each sample cycle.
+
+    # We retrieve all model samples from one output file written by the ObsOperator. If the ObsOperator creates
+    # one file per member, some logic needs to be included to merge all files!!!
+
+    for i in range(len(samples)):
+        if os.path.exists(dacycle['ObsOperator.inputfile.'+samples[i].get_samples_type()]):
+            samples[i].add_simulations(obsoperator.simulated_file[i])
+            logging.debug("Simulated values read, continuing pipeline")
+        else:
+            logging.warning("No simulations added, because sample input file does not exist")
+
+    # Now add the observations that need to be assimilated to the statevector.
+    # Note that obs will only be added to the statevector if either this is the first step (restart=False), or lag==nlag
+    # This is to make sure that the first step of the system uses all observations available, while the subsequent
+    # steps only optimize against the data at the front (lag==nlag) of the filter. This way, each observation is used only
+    # (and at least) once in the assimilation.
+
+
+    if not advance:
+        logging.debug('time.restart = %s' %dacycle['time.restart'])
+        if dacycle['time.restart'] == False or lag == int(dacycle['time.nlag']) - 1:
+            statevector.obs_to_assimilate += (copy.deepcopy(samples),)
+            for sample in samples:
+                statevector.nobs += sample.getlength()
+            del sample
+            logging.info('nobs = %i' %statevector.nobs)
+            logging.debug("Added samples from the observation operator to the assimilated obs list in the statevector")
+
+        else:
+            statevector.obs_to_assimilate += (None,)
+
+
+
+def invert(dacycle, statevector, optimizer):
+    """ Perform the inverse calculation """
+    logging.info(header + "starting invert" + footer)
+
+    if statevector.nobs <= 1: #== 0:
+        logging.info('List with observations to assimilate is empty, skipping invert step and continuing without statevector update...')
+        return
+
+    dims = (int(dacycle['time.nlag']),
+            int(dacycle['da.optimizer.nmembers']),
+            int(dacycle.dasystem['nparameters']),
+            statevector.nobs)
+
+    if 'opt.algorithm' not in dacycle.dasystem:
+        logging.info("There was no minimum least squares algorithm specified in the DA System rc file (key : opt.algorithm)")
+        logging.info("...using serial algorithm as default...")
+        optimizer.set_algorithm('Serial')
+    elif dacycle.dasystem['opt.algorithm'] == 'serial':
+        logging.info("Using the serial minimum least squares algorithm to solve ENKF equations")
+        optimizer.set_algorithm('Serial')
+    elif dacycle.dasystem['opt.algorithm'] == 'bulk':
+        logging.info("Using the bulk minimum least squares algorithm to solve ENKF equations")
+        optimizer.set_algorithm('Bulk')
+
+    optimizer.setup(dims)
+    optimizer.state_to_matrix(statevector)
+
+    diagnostics_file = os.path.join(dacycle['dir.output'], 'optimizer.%s.nc' % dacycle['time.start'].strftime('%Y%m%d'))
+
+    optimizer.write_diagnostics(diagnostics_file, 'prior')
+    optimizer.set_localization(dacycle['da.system.localization'])
+
+    if optimizer.algorithm == 'Serial':
+        optimizer.serial_minimum_least_squares()
+    else:
+        optimizer.bulk_minimum_least_squares()
+
+    optimizer.matrix_to_state(statevector)
+    optimizer.write_diagnostics(diagnostics_file, 'optimized')
+
+
+
+def advance(dacycle, samples, statevector, obsoperator):
+    """ Advance the filter state to the next step """
+
+    # This is the advance of the modeled CO2 state. Optionally, routines can be added to advance the state vector (mean+covariance)
+
+    # Then, restore model state from the start of the filter
+    logging.info(header + "starting advance" + footer)
+    logging.info("Sampling model will be run over 1 cycle")
+
+    obsoperator.get_initial_data()
+
+    sample_step(dacycle, samples, statevector, obsoperator, 0, True)
+
+    dacycle.restart_filelist.extend(obsoperator.restart_filelist)
+    dacycle.output_filelist.extend(obsoperator.output_filelist)
+    logging.debug("Appended ObsOperator restart and output file lists to dacycle for collection ")
+
+    for sample in samples:
+        dacycle.output_filelist.append(dacycle['ObsOperator.inputfile.'+sample.get_samples_type()])
+        logging.debug("Appended Observation filename to dacycle for collection: %s"%(dacycle['ObsOperator.inputfile.'+sample.get_samples_type()]))
+
+        sampling_coords_file = os.path.join(dacycle['dir.input'], sample.get_samples_type()+'_coordinates_%s.nc' % dacycle['time.sample.stamp'])
+        if os.path.exists(sampling_coords_file):
+            outfile = os.path.join(dacycle['dir.output'], sample.get_samples_type() + '_auxiliary_%s.nc' % dacycle['time.sample.stamp'])
+            sample.write_sample_auxiliary(outfile)
+        else: logging.warning("Sample auxiliary output not written, because input file does not exist (no samples found in obspack)")
+    del sample
+
+
+
+def save_and_submit(dacycle, statevector):
+    """ Save the model state and submit the next job """
+    logging.info(header + "starting save_and_submit" + footer)
+
+    filename = os.path.join(dacycle['dir.restart'], 'savestate_%s.nc' % dacycle['time.start'].strftime('%Y%m%d'))
+    statevector.write_to_file(filename, 'opt')
+
+    dacycle.output_filelist.append(filename)
+    dacycle.finalize()
diff --git a/da/rc/wrfchem/dasystem_wrfchem.rc b/da/rc/wrfchem/dasystem_wrfchem.rc
new file mode 100644
index 0000000000000000000000000000000000000000..fc85dcf7625b42d1b5d79cb2c66fc5370aa29b21
--- /dev/null
+++ b/da/rc/wrfchem/dasystem_wrfchem.rc
@@ -0,0 +1,77 @@
+! CarbonTracker Data Assimilation Shell (CTDAS) Copyright (C) 2017 Wouter Peters. 
+! Users are recommended to contact the developers (wouter.peters@wur.nl) to receive
+! updates of the code. See also: http://www.carbontracker.eu. 
+!
+! This program is free software: you can redistribute it and/or modify it under the
+! terms of the GNU General Public License as published by the Free Software Foundation, 
+! version 3. This program is distributed in the hope that it will be useful, but 
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 
+! FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+!
+! You should have received a copy of the GNU General Public License along with this 
+! program. If not, see <http://www.gnu.org/licenses/>. 
+
+!!! Info for the CarbonTracker data assimilation system
+
+!datadir         : /projects/0/ctdas/input/ctdas_2012/
+
+
+! For ObsPack
+! For WRF, this is only a dummy until sampling point observations is
+! implemented
+!obspack.input.id   : obspack_co2_1_GLOBALVIEWplus_v2.1_2016-09-02
+!obspack.input.dir  : ${datadir}/obspacks/${obspack.input.id}
+!obs.sites.rc       : ${obspack.input.dir}/summary/sites_weights_Dec2016.rc
+obs.sites.rc       : dummy
+
+! For column data
+! sample.in.ctdas is obsolete, should be removed
+sample.in.ctdas    : False
+! obs.column.rc here only contains "obs.rejection.threshold : <N>"
+obs.column.rc              : /path/to/xco2_weights.rc
+obs.column.input.dir       : /dir/with/column/observation/files
+obs.column.ncfile          : column_observation_file_pattern_<YYYYMMDD>.nc
+
+! Selection criteria based on variables stored in data file
+obs.column.selection.variables   : quality_flag
+obs.column.selection.criteria    : == 0
+
+! Any value but "parametrization" defaults to model uncertainty of 1
+! (in the units of the observations, so e.g. 1 ppm for CO2)
+mdm.calculation     : default
+
+! Covariances
+! Scaling factor sigma, no covariances
+! A value of 1 means 100% prior flux uncertainty.
+! For one flux process:
+! sigma_scale     : 1
+! For multiple flux processes (see n_emis_proc below):
+sigma_scale_1     : 0.3
+sigma_scale_2     : 0.5
+sigma_scale_3     : 0.5
+
+! Uncomment sigma_offset for optimizing a constant offset of WRF initial
+! and boundary conditions. Unit as in observations, so e.g. ppm for CO2.
+! This is only allowed in runs with only one cycle, because in
+! subsequent cycles, the initial conditions couldn't be optimized in a
+! consistent way.
+! sigma_offset    : 0.5
+
+! Seed for numpy random number generator?
+!random.seed     : 4385
+!random.seed.init: ${datadir}/randomseedinit.pickle
+random.seed.init: /path/to/randomseedinit.pickle
+
+
+!!!!!!!!!!!!!!!!!! STATEVECTOR  !!!!!!!!!!!!!!!!!!
+! nparameters = number of flux parameters plus 1 in case an offset is
+! optimized
+! Example:
+! For 25 state vector elements and 3 processes (e.g. GPP, respiration,
+! anthropogenic emissions), set nparameters to 75. If, in addition, an
+! initial/boundary condition offset is optimized by setting
+! sigma_offset, set nparameters to 76.
+
+nparameters       : 75
+n_emis_proc       : 3
+regionsfile       : /path/to/regionsfile.nc
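A small stand-alone check of the nparameters bookkeeping described in the comments above (not part of the rc file): with 25 state-vector elements per process, three flux processes and an optional offset:

    n_state_elements = 25
    n_emis_proc = 3
    optimize_offset = True             # corresponds to setting sigma_offset

    nparameters = n_state_elements * n_emis_proc + (1 if optimize_offset else 0)
    print(nparameters)                 # 76 (75 without the offset)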
diff --git a/da/rc/wrfchem/obsoper_wrfchem.rc b/da/rc/wrfchem/obsoper_wrfchem.rc
new file mode 100644
index 0000000000000000000000000000000000000000..d0cb1cc9be071069a7989acd8b5a1d25030f3d9c
--- /dev/null
+++ b/da/rc/wrfchem/obsoper_wrfchem.rc
@@ -0,0 +1,141 @@
+! CarbonTracker Data Assimilation Shell (CTDAS) Copyright (C) 2017 Wouter Peters. 
+! Users are recommended to contact the developers (wouter.peters@wur.nl) to receive
+! updates of the code. See also: http://www.carbontracker.eu. 
+!
+! This program is free software: you can redistribute it and/or modify it under the
+! terms of the GNU General Public License as published by the Free Software Foundation, 
+! version 3. This program is distributed in the hope that it will be useful, but 
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 
+! FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+!
+! You should have received a copy of the GNU General Public License along with this 
+! program. If not, see <http://www.gnu.org/licenses/>. 
+
+
+! source_dir
+! Directory like WRF/run/ containing everything that is necessary to run WRF,
+! having already run real.exe:
+! - wrf.exe
+! - namelist.input
+! - wrfbdy_d01 (already containing values for each ensemble member)
+! - wrfinput_dXX (already containing values for each ensemble member)
+! - wrfchemi_* (containing the prior fluxes in field flux_optim and
+!   flux_fix (if defined), and fields for ensemble members)
+
+source_dir : /dir/like/WRF/run/with/all/inputs/
+
+! run_dir
+! The directory where all WRF runs will be performed.
+! Files from source_dir will be copied/linked here as needed.
+run_dir : /dir/for/running/WRF/within/CTDAS/
+
+! tracer and flux to optimize (as in ensemble tracer definitions in WRF, but
+! without the _NNN suffix for the member number)
+tracer_optim : CO2
+flux_optim : E_CO2
+
+! Fixed flux to add to fluxes to optimize (comment if there are none)
+! flux_fix : E_FIX
+
+! obs.column.footprint_samples_dim (do not change)
+! For high resolution simulations, the satellite footprint can be bigger
+! than one model pixel. If this option is 1, the model atmosphere is
+! sampled at the center coordinates of the observation. If this option
+! is > 1, the model atmosphere is sampled at multiple points within the
+! footprint corners and the result is averaged. The sampling points are
+! arranged as an n x n rectangle.
+! Example: if obs.column.footprint_samples_dim=3, the model is sampled
+! at 3*3=9 points in the footprint area.
+! Must be set to 1. Code for larger values is in preparation.
+obs.column.footprint_samples_dim : 1
+
+
+! Option to not run WRF in case output files are already present
+! for the current time period. This is a development option that
+! is useful for testing something in CTDAS after the transport
+! model is run.
+! True/False (default if not set: False)
+! no_rerun_wrf : False
+
+! Option to save a copy of the wrfout*, wrfrst* and rsl* files each time
+! WRF is run. This is another development/debugging option and not meant
+! for production. Its purpose is to rerun CTDAS without actually running the
+! transport model, because WRF usually takes a long time to run. It can also
+! be used to inspect the wrfout contents of a cycle, because they get
+! overwritten by subsequent steps. It is useful, for example, to debug the
+! optimization step without rerunning WRF for each attempt. So far the
+! recovery step has to be done manually: for a new run, copy the saved wrfout
+! files back to the WRF run directory and set no_rerun_wrf (above) to True.
+! CTDAS will then see that the wrfout files for the period already exist and
+! skip running WRF. Note that this also happens in the advance step, because
+! wrfout files for its period already exist. If you want CTDAS to do a proper
+! advance step, wait for the optimization to start, then, while it is still
+! running, move the wrfout files out of the run directory again. When the
+! advance step starts, it will rerun WRF because it does not find wrfout files.
+save_wrf_step : False
+
+! WRF-Chem and sampling its output are implemented as subprocesses (job
+! steps). All remaining options in this file deal with starting them.
+! These options and definitions could be made part of the platform
+! object in a future update. However, in the original CTDAS, running the
+! transport model TM5 is also controlled by TM5-specific options.
+! The options here work with the SLURM scheduler on the Dutch national
+! supercomputer Cartesius.
+
+! The processes are started with the python module subprocess, which
+! takes a shell command string. The shell command for WRF is:
+! run_command + " " (+ run_ntask_opt + ntasks) (+ " " + run_t_opt + run_t) + " ./wrf.exe >& log/wrf.log"
+! The shell command for python scripts is:
+! run_command_1task + " " (+ run_mem_opt + <mem_str> + mem_unit_str_MB) (+ " " + run_t_opt + run_t) + " python <python script and options>"
+! Variables enclosed by "<>" are computed online. The others are set
+! below. Options in brackets are only appended to the command string if
+! they are not empty in the settings below.
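+!
+! Purely illustrative resulting commands, assuming the example settings in
+! this file, ntasks=24 and a computed memory of 2500 MB (these two numbers
+! are not prescribed anywhere, they are just placeholders):
+!   WRF:           srun -n24 -t0:20:00 ./wrf.exe >& log/wrf.log
+!   python script: srun -n1 -N1 --mem=2500M -t0:20:00 python <python script and options>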
+
+! Command to start external process
+run_command : srun
+
+! Some python scripts are launched that require only 1 task.
+! In this case, SLURM requires explicitly requesting -N1, because
+! otherwise it apparently sometimes tries to allocate a single task
+! to more than one node. See here:
+! https://stackoverflow.com/questions/24056961/running-slurm-script-with-multiple-nodes-launch-job-steps-with-1-task
+run_command_1task : srun -n1 -N1
+
+! Define how to pass options to processes started with run_command or
+! run_command_1task
+! Name of the option of run_command that defines requested memory
+! (total, not per cpu). If this is empty, memory limit will not be
+! specified
+run_mem_opt : --mem=
+
+! The operator computes memory in MB, might have to add a string
+mem_unit_str_MB : M
+
+! Name of the option of run_command that defines requested wallclock time
+! If this is empty, wallclock limit will not be specified.
+run_t_opt : -t
+
+! Name of the option of run_command that defines the number of requested tasks.
+! If this is empty, the number of tasks will not be specified.
+run_ntask_opt : -n
+
+! Time that a subprocess should request. This is probably not important,
+! because it runs within the same resource allocation as the parent,
+! but running without it has not been tested.
+run_t : 0:20:00
+
+! Number of tasks that can run on one node
+! This is used for computing how much memory to allocate per process
+! when calling wrfout_sampler.
+cpus_per_node : 24
+
+! Available memory per node (in GB)
+! On Cartesius this is technically 64, but at least in interactive
+! shells a bit of a buffer is needed. The value here was determined
+! with calls like "srun --mem=61G -n1 sleep 1" inside an interactive
+! shell.
+mem_per_node : 60
+
+
+
diff --git a/da/statevectors/statevector_wrfchem.py b/da/statevectors/statevector_wrfchem.py
new file mode 100644
index 0000000000000000000000000000000000000000..6236b22edf39732e2f25e62bf51c24967471ca9c
--- /dev/null
+++ b/da/statevectors/statevector_wrfchem.py
@@ -0,0 +1,332 @@
+"""CarbonTracker Data Assimilation Shell (CTDAS) Copyright (C) 2017 Wouter Peters. 
+Users are recommended to contact the developers (wouter.peters@wur.nl) to receive
+updates of the code. See also: http://www.carbontracker.eu. 
+
+This program is free software: you can redistribute it and/or modify it under the
+terms of the GNU General Public License as published by the Free Software Foundation, 
+version 3. This program is distributed in the hope that it will be useful, but 
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with this 
+program. If not, see <http://www.gnu.org/licenses/>."""
+#!/usr/bin/env python
+# ct_statevector_tools.py
+
+"""
+.. module:: statevector
+.. moduleauthor:: Friedemann Reum
+
+Revision History:
+File created 2019.
+
+This module is an extension to any CTDAS statevector that overwrites
+write_members_to_file to produce output in the format required by WRF-Chem.
+Thus, it can inherit from any other
+state vector. In this reference implementation, it inherits from 
+the baseclass statevector.
+    * :class:`~da.baseclasses.statevector.StateVector`
+    * :class:`~da.baseclasses.statevector.EnsembleMember`
+
+As usual, specific implementations of StateVector objects are created through inheritance from these baseclasses. For an example of designing
+your own StateVector, we refer to :ref:`tut_chapter5`.
+
+.. autoclass:: da.baseclasses.statevector.StateVector 
+
+.. autoclass:: da.baseclasses.statevector.EnsembleMember 
+
+"""
+
+import os
+import logging
+import numpy as np
+from datetime import timedelta
+import da.tools.io4 as io
+from da.statevectors.statevector_baseclass import StateVector, EnsembleMember
+
+identifier = 'Statevector for WRFChem'
+version = '0.0'
+
+################### Begin Class WRFChemStateVector ###################
+class WRFChemStateVector(StateVector):
+    """ 
+    This module is an extension to any CTDAS statevector that overwrites
+    write_members_to_file to produce output in the format required by WRF-Chem.
+    Thus, it can inherit from any other state vector. In this reference
+    implementation, it inherits from the baseclass StateVector.
+
+    .. automethod:: da.baseclasses.statevector.StateVector.write_members_to_file
+
+    """
+
+    def __init__(self):
+        self.ID = identifier
+        self.version = version
+        logging.info('Statevector object initialized: %s (%s)' % 
+                     (self.ID, self.version))
+        
+    def setup(self, dacycle):
+        """
+        Copied from baseclasses.statevector and
+        - Replaced transcom region with a dummy.
+        - Added do_offset
+        - Added overlaying processes
+        """
+
+        self.nlag = int(dacycle['time.nlag'])
+        self.nmembers = int(dacycle['da.optimizer.nmembers'])
+        self.nparams = int(dacycle.dasystem['nparameters'])
+        self.do_offset = "sigma_offset" in list(dacycle.dasystem.keys())
+        # >>> Overlaying processes:
+        self.n_emis_proc = int(dacycle.dasystem['n_emis_proc']) 
+        # Get number of flux parameters per process
+        nparams_proc = float(self.nparams - int(self.do_offset)) / float(self.n_emis_proc)
+        if int(nparams_proc)==nparams_proc:
+            self.nparams_proc = int(nparams_proc)
+        else:
+            raise ValueError("Number of emission processes doesn't divide number of flux parameters")
+        # <<< Overlaying processes
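+        # Illustrative example (numbers taken from the example dasystem rc
+        # file, not mandated by the code): nparameters = 76, sigma_offset set
+        # (so do_offset = True) and n_emis_proc = 3 give
+        # nparams_proc = (76 - 1) / 3 = 25 flux parameters per process.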
+        
+        self.nobs = 0
+        
+        self.obs_to_assimilate = ()  # empty container to hold observations to assimilate later on
+
+        # These list objects hold the data for each time step of lag in the system. Note that the ensembles for each time step consist
+        # of lists of EnsembleMember objects; we define member 0 as the mean of the distribution and n=1,...,nmembers as the spread.
+
+        self.ensemble_members = list(range(self.nlag))
+
+        for n in range(self.nlag):
+            self.ensemble_members[n] = []
+
+
+        # This specifies the file to read with the gridded mask at 1x1 degrees. Each gridbox holds a number that specifies the parameter member
+        # that maps onto it. From this map, a dictionary is created that allows a reverse look-up so that we can map parameters to a grid.
+
+        mapfile = os.path.join(dacycle.dasystem['regionsfile'])
+        ncf = io.ct_read(mapfile, 'read')
+        self.gridmap = ncf.get_variable('regions').astype(int)
+        # >>> edit freum: Not using transcom regions mapping in regional setup with wrfchem
+        #self.tcmap = ncf.get_variable('transcom_regions')
+        # <<< edit freum
+        ncf.close()
+
+        #logging.debug("A TransCom  map on 1x1 degree was read from file %s" % dacycle.dasystem['regionsfile'])
+        #logging.debug("A parameter map on 1x1 degree was read from file %s" % dacycle.dasystem['regionsfile'])
+        logging.debug("A parameter map was read from file %s" % dacycle.dasystem['regionsfile'])
+
+        # Create a dictionary for state <-> gridded map conversions
+
+        nparams = self.gridmap.max()
+        
+        # >>> do_offset
+        # Check that nparams is compatible with the one
+        # given in dasystem rc-file
+        
+        # if nparams + int(self.do_offset) != self.nparams:
+        if nparams*self.n_emis_proc + int(self.do_offset) != self.nparams: # overlaying parameters
+            raise ValueError("Number of parameters in regionsfile incompatible with nparameters in dasystem-rcfile.")
+        # <<< do_offset
+
+        self.griddict = {}
+        for r in range(1, int(nparams) + 1):
+            sel = np.nonzero(self.gridmap.flatten() == r)
+            if len(sel[0]) > 0: 
+                self.griddict[r] = sel
+
+        logging.debug("A dictionary to map grids to states and vice versa was created")
+
+        # Create a matrix for state <-> TransCom conversions
+        # >>> edit freum
+        # Regional simulations (wrfchem): only dummy tcmatrix. Would only be used in analysis pipeline, which I don't run for wrfchem.
+
+        self.tcmatrix = np.zeros((self.nparams, 23), 'float') 
+
+        # for r in range(1, self.nparams + 1):
+        #     sel = np.nonzero(self.gridmap.flatten() == r)
+        #     if len(sel[0]) < 1: 
+        #         continue
+        #     else:
+        #         n_tc = set(self.tcmap.flatten().take(sel[0]))
+        #         if len(n_tc) > 1: 
+        #             logging.error("Parameter %d seems to map to multiple TransCom regions (%s), I do not know how to handle this" % (r, n_tc))
+        #             raise ValueError
+        #         self.tcmatrix[r - 1, int(n_tc.pop()) - 1] = 1.0
+
+        # logging.debug("A matrix to map states to TransCom regions and vice versa was created")
+        logging.debug("A dummy matrix to map states to TransCom regions and vice versa was created")
+        # <<< edit freum
+        
+        # Create a mask for species/unknowns
+
+        self.make_species_mask()
+
+    def make_new_ensemble(self, lag, covariancematrix=None):
+        """
+        Copied from baseclass statevector, but switched to
+        fresh initialization of each new ensemble (instead of
+        with previous lags) and handling of self.do_offset.
+
+        """
+
+        if covariancematrix is None:
+            covariancematrix = np.identity(self.nparams)
+
+        # Make a cholesky decomposition of the covariance matrix
+
+
+        try:
+            _, s, _ = np.linalg.svd(covariancematrix)
+        except:
+            s = np.linalg.svd(covariancematrix, full_matrices=1, compute_uv=0) #Cartesius fix
+        dof = np.sum(s) ** 2 / sum(s ** 2)
+        C = np.linalg.cholesky(covariancematrix)
+
+        logging.debug('Cholesky decomposition has succeeded ')
+        logging.info('Appr. degrees of freedom in covariance matrix is %s' % (int(dof)))
+
+
+        # Create mean values
+
+        newmean = np.ones(self.nparams, float) # standard value for a new time step is 1.0
+
+        # >>> do_offset
+        if self.do_offset:
+            newmean[-1] = 0.
+        # <<< do_offset
+        
+        # If this is not the start of the filter, average previous two optimized steps into the mix
+
+        # freum vvvv
+        
+        # Switch off the initialization with previously optimized state vector
+        # for my project
+        #if lag == self.nlag - 1 and self.nlag >= 3:
+        #    newmean += self.ensemble_members[lag - 1][0].param_values + \
+        #                                   self.ensemble_members[lag - 2][0].param_values
+        #    newmean = newmean / 3.0
+        
+        # freum ^^^^
+
+        # Create the first ensemble member with a deviation of 0.0 and add to list
+
+        newmember = EnsembleMember(0)
+        newmember.param_values = newmean.flatten()  # no deviations
+        self.ensemble_members[lag].append(newmember)
+
+        # Create members 1:nmembers and add to ensemble_members list
+
+        for member in range(1, self.nmembers):
+            rands = np.random.randn(self.nparams)
+
+            newmember = EnsembleMember(member)
+            newmember.param_values = np.dot(C, rands) + newmean
+            self.ensemble_members[lag].append(newmember)
+
+        logging.debug('%d new ensemble members were added to the state vector # %d' % (self.nmembers, (lag + 1)))
+
+
+    def write_members_to_file(self, lag, outdir, endswith='.nc', obsoperator=None):
+        """ 
+           :param lag: Which lag step of the filter to write, must lie in range [1,...,nlag]
+           :param outdir: Directory where to write files
+           :param obsoperator: observationoperator object
+           :param endswith: Optional label to add to the filename, default is simply .nc
+           :rtype: None
+
+           Write ensemble member information to file for transport model.
+           If observation operator is None or its ID is anything except
+           "wrfchem", the method of the baseclass is called. If the
+           observation operator ID is "wrfchem", the method of the
+           observation operator is called. The reason to implement it
+           there was originally to avoid teaching the statevector
+           object about WRF domains.
+        """
+        
+        # Decide what to call based on obsoperator
+        if obsoperator is None:
+            # logging.info("Calling statevector._write_members_to_file_standard")
+            # self._write_members_to_file_standard(lag, outdir, endswith)
+            logging.info("Calling super().write_members_to_file")
+            super().write_members_to_file(lag, outdir, endswith, obsoperator)
+        else:
+            if obsoperator.ID=='wrfchem':
+                logging.info("Calling obsoperator.write_members_to_file")
+                obsoperator.write_members_to_file(lag, outdir, self)
+            else: # call the standard method
+                # logging.info("Calling statevector._write_members_to_file_standard")
+                # self._write_members_to_file_standard(lag, outdir, endswith)
+                logging.info("Calling super().write_members_to_file")
+                super().write_members_to_file(lag, outdir, endswith, obsoperator)
+            
+            
+    # def _write_members_to_file_standard(self, lag, outdir, endswith='.nc'):
+    #     """
+    #     Copy of baseclass write_members_to_file. Documented in baseclass
+    #     statevector.
+    #     Obsolete, replaced with calling super().write_members_to_file()
+    #     """
+    # 
+    #     members = self.ensemble_members[lag]
+    # 
+    #     for mem in members:
+    #         filename = os.path.join(outdir, 'parameters.%03d%s' % (mem.membernumber, endswith))
+    #         ncf = io.CT_CDF(filename, method='create')
+    #         dimparams = ncf.add_params_dim(self.nparams)
+    #         dimgrid = ncf.add_latlon_dim()
+    # 
+    #         data = mem.param_values
+    # 
+    #         savedict = io.std_savedict.copy()
+    #         savedict['name'] = "parametervalues"
+    #         savedict['long_name'] = "parameter_values_for_member_%d" % mem.membernumber
+    #         savedict['units'] = "unitless"
+    #         savedict['dims'] = dimparams 
+    #         savedict['values'] = data
+    #         savedict['comment'] = 'These are parameter values to use for member %d' % mem.membernumber
+    #         ncf.add_data(savedict)
+    # 
+    #         griddata = self.vector2grid(vectordata=data)
+    # 
+    #         savedict = io.std_savedict.copy()
+    #         savedict['name'] = "parametermap"
+    #         savedict['long_name'] = "parametermap_for_member_%d" % mem.membernumber
+    #         savedict['units'] = "unitless"
+    #         savedict['dims'] = dimgrid 
+    #         savedict['values'] = griddata.tolist()
+    #         savedict['comment'] = 'These are gridded parameter values to use for member %d' % mem.membernumber
+    #         ncf.add_data(savedict)
+    # 
+    #         ncf.close()
+    # 
+    #         logging.debug('Successfully wrote data from ensemble member %d to file (%s) ' % (mem.membernumber, filename))
+
+    def get_covariance(self, date, dacycle):
+        """
+        Return a prior statevector covariance matrix
+        Here, the covariance matrix is diagonal and the values
+        are the squares of the "sigma" parameters in the
+        dasystem rc file.
+        """
+        
+        if self.n_emis_proc==1:
+            sigma = float(dacycle.dasystem['sigma_scale'])
+            covariancematrix = np.identity(self.nparams) * sigma**2
+        else:
+            # overlaying parameters:
+            covariancematrix = np.identity(self.nparams)
+            for nproc in range(1, self.n_emis_proc+1):
+                sigma_nproc = float(dacycle.dasystem['sigma_scale_' + str(nproc)])
+                ind = list(range((nproc-1)*self.nparams_proc, nproc*self.nparams_proc))
+                covariancematrix[ind, ind] = sigma_nproc**2
+        if self.do_offset:
+            covariancematrix[-1, -1] = float(dacycle.dasystem["sigma_offset"])**2
+            
+        
+        return covariancematrix
+
+################### End Class WRFChemStateVector ###################
+
+if __name__ == "__main__":
+    pass
+
diff --git a/da/tools/wrfchem/__init__.py b/da/tools/wrfchem/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/da/tools/wrfchem/utilities.py b/da/tools/wrfchem/utilities.py
new file mode 100644
index 0000000000000000000000000000000000000000..157149ff00864e720276f86875b8b0ee96e154d9
--- /dev/null
+++ b/da/tools/wrfchem/utilities.py
@@ -0,0 +1,319 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+
+Created on Wed Sep 18 16:03:02 2019
+
+@author: friedemann
+"""
+
+import os
+import glob
+import logging
+import subprocess
+import tempfile
+import copy
+import netCDF4 as nc
+import numpy as np
+
+class utilities(object):
+    """
+    Collection of utilities for wrfchem observation operator
+    that do not depend on other CTDAS modules
+    """
+    
+    def __init__(self):
+        pass
+
+    @staticmethod
+    def get_slicing_ids(N, nproc, nprocs):
+        """
+        Purpose
+        -------
+        For parallel processing, figure out which samples to process
+        by this process.
+
+        Parameters
+        ----------
+        N : int
+            Length to slice
+        nproc : int
+            id of this process (0... nprocs-1)
+        nprocs : int
+            Number of processes that work on the task.
+
+        Output
+        ------
+        Slicing indices id0, id1
+        Usage
+        -----
+            .. code-block:: python
+
+            id0, id1 = get_slicing_ids(N, nproc, nprocs)
+            field[id0:id1, ...]
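+
+        Example
+        -------
+        For illustration, N=10 and nprocs=3 yield the slices
+        (0, 3), (3, 6) and (6, 10) for nproc = 0, 1, 2.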
+        """
+
+        f0 = float(nproc)/float(nprocs)
+        id0 = int(np.floor(f0*N))
+
+        f1 = float(nproc+1)/float(nprocs)
+        id1 = int(np.floor(f1*N))
+
+        if id0==id1:
+            raise ValueError("id0==id1. Probably too many processes.")
+        return id0, id1
+
+
+
+    @classmethod
+    def cat_ncfiles(cls, path, in_arg, cat_dim, out_file, in_pattern=False, rm_original=True):
+        """
+        Combine output of all processes into 1 file
+        If in_pattern is True, a glob pattern is provided instead of a file list.
+        This has the advantage that it can be expanded by the shell,
+        which avoids problems with overly long argument lists.
+
+        This calls ncrcat from the nco library. If nco is not available,
+        rewrite this function. Note: I first tried to do this with
+        "cdo cat", but it messed up sounding_id
+        (see https://code.mpimet.mpg.de/boards/1/topics/908)
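+
+        Illustrative command as assembled by this method (hypothetical
+        file names and dimension name, 1000 observations in total):
+        ncrcat -h -O -d obs,0,999 samples_part_*.nc samples_all.nc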
+        """
+
+        # To preserve dimension names, we start from one of the existing
+        # slice files instead of a new file.
+
+        # Do this in path to avoid long command line arguments and history
+        # entries in outfile.
+        cwd = os.getcwd()
+        os.chdir(path)
+
+        if in_pattern:
+            if not isinstance(in_arg, str):
+                raise TypeError("in_arg must be a string if in_pattern is True.")
+            file_pattern = in_arg
+            in_files = glob.glob(file_pattern)
+        else:
+            if not isinstance(in_arg, list):
+                raise TypeError("in_arg must be a list if in_pattern is False.")
+            in_files = in_arg
+
+        if len(in_files) == 0:
+            logging.error("Nothing to do.")
+            # Change back to previous directory
+            os.chdir(cwd)
+            return
+
+        # Sorting is important!
+        in_files.sort()
+
+        # ncrcat needs total number of soundings, count
+        Nobs = 0
+        for f in in_files:
+            ncf = nc.Dataset(f, "r")
+            Nobs += len(ncf.dimensions[cat_dim])
+            ncf.close()
+
+        # Cat files
+        cmd_ = "ncrcat -h -O -d " + cat_dim + ",0,%d"%(Nobs-1)
+        if in_pattern:
+            cmd = cmd_ + " " + file_pattern + " " + out_file
+            # If PIPE is used here, it gets clogged, and the process
+            # stops without error message (see also 
+            # https://thraxil.org/users/anders/posts/2008/03/13/Subprocess-Hanging-PIPE-is-your-enemy/)
+            # Hence, piping the output to a temporary file.
+            proc = subprocess.Popen(cmd, shell=True,
+                                    stdout=tempfile.TemporaryFile(),
+                                    stderr=tempfile.TemporaryFile())
+        else:
+            cmdsplt = cmd_.split() + in_files + [out_file]
+            proc = subprocess.Popen(cmdsplt, stdout=tempfile.TemporaryFile(), stderr=tempfile.TemporaryFile())
+            cmd = " ".join(cmdsplt)
+
+        proc.wait()
+
+        # This is probably useless since the output is piped to a
+        # tempfile.
+        retcode = cls.check_out_err(proc)
+
+        if retcode != 0:
+            msg = "Something went wrong in the sampling. Command: " + cmd
+            logging.error(msg)
+            raise OSError(msg)
+
+        # Delete slice files
+        if rm_original:
+            logging.info("Deleting slice files.")
+            for f in in_files:
+                os.remove(f)
+
+        logging.info("Sampled WRF output written to file.")
+
+        # Change back to previous directory
+        os.chdir(cwd)
+
+
+    @staticmethod
+    def check_out_err(process):
+        """Displays stdout and stderr, returns returncode of the
+        process.
+        """
+
+        # Get process messages
+        out, err = process.communicate()
+                
+        # Print output
+        def to_str(str_or_bytestr):
+            """If argument is of type str, return argument. If
+            argument is of type bytes, return decoded str"""
+            if isinstance(str_or_bytestr, str):
+                return str_or_bytestr
+            elif isinstance(str_or_bytestr, bytes):
+                return str(str_or_bytestr, 'utf-8')
+            else:
+                msg = "str_or_bytestr is " + str(type(str_or_bytestr)) + \
+                      ", should be str or bytestr."
+                raise TypeError(msg)
+
+        logging.debug("Subprocess output:")
+        if out is None:
+            logging.debug("No output.")
+        elif isinstance(out, list):
+            for line in out:
+                logging.debug(to_str(line.rstrip()))
+        else:
+            logging.debug(to_str(out.rstrip()))
+
+        # Handle errors
+        if process.returncode != 0:
+            logging.error("subprocess error")
+            logging.error("Returncode: %s", str(process.returncode))
+            logging.error("Message, if any:")
+            if err is not None:
+                if isinstance(err, list):
+                    for line in err:
+                        logging.error(to_str(line.rstrip()))
+                else:
+                    logging.error(to_str(err.rstrip()))
+
+        return process.returncode
+    
+    @classmethod
+    def get_index_groups(cls, *args):
+        """
+        Input:
+            numpy arrays with 1 dimension or lists, all same length
+        Output:
+            Dictionary of lists of indices that have the same
+            combination of input values.
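+
+        Illustrative example (made-up values):
+            get_index_groups([0, 0, 1], [1, 2, 1]) returns
+            {(0, 1): [0], (0, 2): [1], (1, 1): [2]}
+            (with pandas, the values are integer arrays rather than lists).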
+        """
+    
+        try:
+            # If pandas is available, it makes a pandas DataFrame and
+            # uses its groupby-function.
+            import pandas as pd
+    
+            args_array = np.array(args).transpose()
+            df = pd.DataFrame(args_array)
+            groups = df.groupby(list(range(len(args)))).indices
+    
+        except ImportError:
+            # If pandas is not available, use an own implementation of groupby.
+            # Recursive implementation. It's fast.
+            args_array = np.array(args).transpose()
+            groups = cls._group(args_array)
+    
+        return groups
+
+    @classmethod
+    def _group(cls, a):
+        """
+        Reimplementation of pandas.DataFrame.groupby.indices because
+        py 2.7 on cartesius isn't compatible with pandas.
+        Unlike the pandas function, this always uses all columns of the
+        input array.
+    
+        Parameters
+        ----------
+        a : numpy.ndarray (2D)
+            Array of indices. Each row is a combination of indices.
+    
+        Returns
+        -------
+        groups : dict
+            The keys are the unique combinations of indices (rows of a),
+            the values are the indices of the rows of a that equal the key.
+        """
+    
+        # This is a recursive function: It makes groups according to the
+        # first column, then calls itself with the remaining columns.
+        # Some index juggling.
+    
+        # Group according to first column
+        UI = list(set(a[:, 0]))
+        groups0 = dict()
+        for ui in UI:
+            # Key must be a tuple
+            groups0[(ui, )] = [i for i, x in enumerate(a[:, 0]) if x == ui]
+    
+        if a.shape[1] == 1:
+            # If the array only has one column, we're done
+            return groups0
+        else:
+            # If the array has more than one column, we group those.
+            groups = dict()
+            for ui in UI:
+                # Group according to the remaining columns
+                subgroups_ui = cls._group(a[groups0[(ui, )], 1:])
+                # Now the index juggling: Add the keys together and
+                # locate values in the original array.
+                for key in  list(subgroups_ui.keys()):
+                    # Get indices of bigger array
+                    subgroups_ui[key] = [groups0[(ui, )][n] for n in subgroups_ui[key]]
+                    # Add the keys together
+                    groups[(ui, ) + key] = subgroups_ui[key]
+    
+            return groups
+
+
+    @staticmethod
+    def apply_by_group(func, array, groups, grouped_args=None, *args, **kwargs):
+        """
+        Apply function 'func' to a numpy array by groups of indices.
+        'groups' can be a list of lists or a dictionary with lists as
+        values.
+    
+        If 'array' has more than 1 dimension, the indices in 'groups'
+        are for the first axis.
+    
+        If 'grouped_args' is not None, its members are added to
+        'kwargs' after slicing.
+    
+        *args and **kwargs are passed through to 'func'.
+    
+        Example:
+            apply_by_group(np.mean, np.array([0., 1., 2.]), [[0, 1], [2]])
+        Output:
+            array([0.5, 2. ])
+        """
+        
+        shape_in = array.shape
+        shape_out = list(shape_in)
+        shape_out[0] = len(groups)
+        array_out = np.ndarray(shape_out, dtype=array.dtype)
+    
+        if isinstance(groups, list):
+            # Make a dictionary
+            groups = {n: groups[n] for n in range(len(groups))}
+    
+        if grouped_args is not None:
+            kwargs0 = copy.deepcopy(kwargs)
+        for n in range(len(groups)):
+            k = list(groups.keys())[n]
+    
+            # Add additional arguments that need to be grouped to kwargs
+            if grouped_args is not None:
+                kwargs = copy.deepcopy(kwargs0)
+                for ka, v in grouped_args.items():
+                    kwargs[ka] = v[groups[k], ...]
+    
+            array_out[n, ...] = np.apply_along_axis(func, 0, array[groups[k], ...], *args, **kwargs)
+    
+        return array_out
+
diff --git a/da/tools/wrfchem/wrfchem_helper.py b/da/tools/wrfchem/wrfchem_helper.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4b399294aaeb2a0e2f8418890d674fa12cb45d1
--- /dev/null
+++ b/da/tools/wrfchem/wrfchem_helper.py
@@ -0,0 +1,990 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jul 22 15:03:02 2019
+
+This class contains helper functions mainly for sampling WRF-Chem, but a
+few are also needed by the WRF-Chem column observation operator.
+
+@author: friedemann
+"""
+
+# Instructions for pylint:
+# pylint: disable=too-many-instance-attributes
+# pylint: disable=W0201
+# pylint: disable=C0301
+# pylint: disable=E1136
+# pylint: disable=E1101
+
+
+import os
+import shutil
+import re
+import glob
+import bisect
+import copy
+import numpy as np
+import netCDF4 as nc
+import datetime as dt
+import wrf
+import f90nml
+import pickle
+
+# CTDAS modules
+import da.tools.io4 as io
+from da.tools.wrfchem.utilities import utilities
+
+
+class WRFChemHelper(object):
+    """Contains helper functions for sampling WRF-Chem"""
+    def __init__(self, settings):
+        self.settings = settings
+
+    def validate_settings(self, needed_items=[]):
+        """
+        This is based on WRFChemOO._validate_rc
+        """
+
+        if len(needed_items)==0:
+            return
+
+        for key in needed_items:
+            if key not in self.settings:
+                msg = "Missing a required value in settings: %s" % key
+                raise IOError(msg)
+
+    @staticmethod
+    def read_namelist(dirname):
+        """Read run settings from namelist.input file in dirname"""
+        nml_file = os.path.join(dirname, "namelist.input")
+        namelist = f90nml.read(nml_file)
+        # Some values are lists when running with nested domains, but
+        # not when running with only one domain. Ensure here those
+        # values are always lists.
+        list_vars = ["e_we",
+                     "e_sn", 
+                     "parent_id",
+                     "parent_grid_ratio",
+                     "i_parent_start",
+                     "j_parent_start"
+                    ]
+        for v in list_vars:
+            if not isinstance(namelist["domains"][v], list):
+                namelist["domains"][v] = [namelist["domains"][v]]
+
+        return namelist
+
+    @staticmethod
+    def get_pressure_boundaries_paxis(p_axis, p_surf):
+        """
+        Arguments
+        ---------
+        p_axis (:class:`array-like`)
+            Pressure at mid points of layers
+        p_surf (:class:`numeric`)
+            Surface pressure
+        Output
+        ------
+        Pressure at layer boundaries
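+
+        Example
+        -------
+        For illustration: p_axis = [900, 700, 500] and p_surf = 1000
+        give pb = [1000, 800, 600, 400].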
+        """
+
+        pb = np.array([float("nan")]*(len(p_axis)+1))
+        pb[0] = p_surf
+
+        for nl in range(len(pb)-1):
+            pb[nl+1] = pb[nl] + 2*(p_axis[nl] - pb[nl])
+
+        return pb
+
+    @staticmethod
+    def get_pressure_boundaries_znw(znw, p_surf, p_top):
+        """
+        Arguments
+        ---------
+        ZNW (:class:`ndarray`)
+            Eta coordinates of z-staggered WRF grid. For each
+            observation (2D)
+        p_surf (:class:`ndarray`)
+            Surface pressure (1D)
+        p_top (:class:`ndarray`)
+            Top-of-atmosphere pressure (1D)
+        Output
+        ------
+        Pressure at layer boundaries
+
+        CAVEATS
+        -------
+        Maybe I should rather use P_HYD? Well, Butler et al. 2018
+        (https://www.geosci-model-dev-discuss.net/gmd-2018-342/) used
+        znu and surface pressure to compute "WRF midpoint layer
+        pressure".
+
+        For WRF it would be more consistent to interpolate to levels.
+        See also comments in code.
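+
+        Example
+        -------
+        For illustration (single column): znw = [1.0, 0.5, 0.0],
+        p_surf = 1000 and p_top = 50 give boundaries [1000, 525, 50].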
+        """
+
+        return znw*(p_surf-p_top) + p_top
+
+
+    @staticmethod
+    def get_int_coefs(pb_ret, pb_mod, level_def):
+        """
+        Computes a coefficients matrix to transfer a model profile onto
+        a retrieval pressure axis.
+        
+        If level_def=="layer_average", this assumes that profiles are
+        constant in each layer of the retrieval, bound by the pressure
+        boundaries pb_ret. In this case, the WRF model layer is treated
+        in the same way, and coefficients integrate over the assumed
+        constant model layers. This works with non-staggered WRF
+        variables (on "theta" points). However, this is actually not how
+        WRF is defined, and the implementation should be changed to
+        z-staggered variables. Details for this change are in a comment
+        at the beginning of the code.
+
+        If level_def=="pressure_boundary" (IMPLEMENTATION IN PROGRESS),
+        assumes that profiles, kernel and pwf are defined at pressure
+        boundaries that don't have a thickness (this is how OCO-2 data
+        are defined, for example). In this case, the coefficients
+        linearly interpolate adjacent model level points. This is
+        incompatible with the treatment of WRF in the above-described
+        layer-average assumption, but is closer to how WRF is actually
+        defined. The exception is that pb_mod is still constructed and
+        non-staggered variables are not defined at psurf. This can only
+        be fixed by switching to z-staggered variables.
+    
+        In cases where retrieval surface pressure is higher than model
+        surface pressure, and in cases where retrieval top pressure is
+        lower than model top pressure, the model profile will be
+        extrapolated with constant tracer mixing ratios. In cases where
+        retrieval surface pressure is lower than model surface pressure,
+        and in cases where retrieval top pressure is higher than model
+        top pressure, only the parts of the model column that fall
+        within the retrieval pressure boundaries are sampled.
+    
+        Arguments
+        ---------
+        pb_ret (:class:`array_like`)
+            Pressure boundaries of the retrieval column
+        pb_mod (:class:`array_like`)
+            Pressure boundaries of the model column
+        level_def (:class:`string`)
+            "layer_average" or "pressure_boundary" (IMPLEMENTATION IN
+            PROGRESS). Refers to the retrieval profile.
+            
+            Note 2021-09-13: Inspected code for pressure_boundary.
+            Should be correct. Interpolates linearly between two model
+            levels.
+
+    
+        Returns
+        -------
+        coefs (:class:`array_like`)
+                Integration coefficient matrix. Each row sums to 1.
+    
+        Usage
+        -----
+             .. code-block:: python
+    
+                 import numpy as np
+                 pb_ret = np.linspace(900., 50., 5)
+                 pb_mod = np.linspace(1013., 50., 7)
+                 model_profile = 1. - np.linspace(0., 1., len(pb_mod)-1)**3
+                 coefs = get_int_coefs(pb_ret, pb_mod, "layer_average")
+                 retrieval_profile = np.matmul(coefs, model_profile)
+        """
+    
+        if level_def == "layer_average":
+            # This code assumes that WRF variables are constant in
+            # layers, but they are defined on levels. This can be seen
+            # for example by asking wrf.interplevel for the value of a
+            # variable that is defined on the mass grid ("theta points")
+            # at a pressure slightly higher than the pressure on its
+            # grid (wrf.getvar(ncf, "p")), it returns nan. So There is
+            # no extrapolation. There are no layers. There are only
+            # levels.
+            # In addition, this page here:
+            # https://www.openwfm.org/wiki/How_to_interpret_WRF_variables
+            # says that to find values at theta-points of a variable
+            # living on u-points, you interpolate linearly. That's the
+            # other way around from what I would do if I want to go from
+            # theta to staggered.
+            # WRF4.0 user guide:
+            # - ungrib can interpolate linearly in p or log p
+            # - real.exe comes with an extrap_type namelist option, that
+            #   extrapolates constantly BELOW GROUND.
+            # This would mean the correct way would be to integrate over
+            # a piecewise-linear function. It also means that I really
+            # want the value at surface level, so I'd need the CO2
+            # fields on the Z-staggered grid ("w-points")! Interpolate
+            # the vertical in p with wrf.interp1d, example:
+            # wrf.interp1d(np.array(rh.isel(south_north=1, west_east=0)),
+            #              np.array(p.isel(south_north=1, west_east=0)),
+            #              np.array(988, 970))
+            # (wrf.interp1d gives the same results as wrf.interplevel,
+            # but the latter just doesn't want to work with single
+            # columns (32,1,1), it wants a dim>1 in the horizontal
+            # directions)
+            # So basically, I can keep using pb_ret and pb_mod, but it
+            # would be more accurate to do the piecewise-linear
+            # interpolation and the output matrix will have 1 more
+            # value in each dimension.
+            
+            # Calculate integration weights by weighting with layer
+            # thickness. This assumes that both axes are ordered
+            # psurf to ptop.
+            coefs = np.ndarray(shape=(len(pb_ret)-1, len(pb_mod)-1))
+            coefs[:] = 0.
+    
+            # Extend the model pressure grid if retrieval encompasses
+            # more.
+            pb_mod_tmp = copy.deepcopy(pb_mod)
+    
+            # In case the retrieval pressure is higher than the model
+            # surface pressure, extend the lowest model layer.
+            if pb_mod_tmp[0] < pb_ret[0]:
+                pb_mod_tmp[0] = pb_ret[0]
+    
+            # In case the model doesn't extend as far as the retrieval,
+            # extend the upper model layer upwards.
+            if pb_mod_tmp[-1] > pb_ret[-1]:
+                pb_mod_tmp[-1] = pb_ret[-1]
+    
+            # For each retrieval layer, this loop computes which
+            # proportion falls into each model layer.
+            for nret in range(len(pb_ret)-1):
+    
+                # 1st model pressure boundary index = the one before the
+                # first boundary with lower pressure than high-pressure
+                # retrieval layer boundary.
+                model_lower = pb_mod_tmp < pb_ret[nret]
+                id_model_lower = model_lower.nonzero()[0]
+                id_min = id_model_lower[0]-1
+    
+                # Last model pressure boundary index = the last one with
+                # higher pressure than low-pressure retrieval layer
+                # boundary.
+                model_higher = pb_mod_tmp > pb_ret[nret+1]
+    
+                id_model_higher = model_higher.nonzero()[0]
+    
+                if len(id_model_higher) == 0:
+                    #id_max = id_min
+                    raise ValueError("This shouldn't happen. Debug.")
+                else:
+                    id_max = id_model_higher[-1]
+    
+                # By the way, in case there is no model level with
+                # higher pressure than the next retrieval level,
+                # id_max must be the same as id_min.
+    
+                # For each model layer, find out how much of it makes up this
+                # retrieval layer
+                for nmod in range(id_min, id_max+1):
+                    if (nmod == id_min) & (nmod != id_max):
+                        # Part of 1st model layer that falls within
+                        # retrieval layer
+                        coefs[nret, nmod] = pb_ret[nret] - pb_mod_tmp[nmod+1]
+                    elif (nmod != id_min) & (nmod == id_max):
+                        # Part of last model layer that falls within
+                        # retrieval layer
+                        coefs[nret, nmod] = pb_mod_tmp[nmod] - pb_ret[nret+1]
+                    elif (nmod == id_min) & (nmod == id_max):
+                        # id_min = id_max, i.e. model layer encompasses
+                        # retrieval layer
+                        coefs[nret, nmod] = pb_ret[nret] - pb_ret[nret+1]
+                    else:
+                        # Retrieval layer encompasses model layer
+                        coefs[nret, nmod] = pb_mod_tmp[nmod] - pb_mod_tmp[nmod+1]
+    
+                coefs[nret, :] = coefs[nret, :]/sum(coefs[nret, :])
+    
+            # I tested the code with many cases, but I'm only 99.9% sure
+            # it works for all input. Hence a test here that the
+            # coefficients sum to 1 and dump the data if not.
+            sum_ = np.abs(coefs.sum(1) - 1)
+            if np.any(sum_ > 2.*np.finfo(sum_.dtype).eps):
+                dump = dict(pb_ret=pb_ret,
+                            pb_mod=pb_mod,
+                            level_def=level_def)
+                fp = "int_coefs_dump.pkl"
+                with open(fp, "w") as f:
+                    pickle.dump(dump, f, 0)
+    
+                msg_fmt = "Something doesn't sum to 1. Arguments dumped to: %s"
+                raise ValueError(msg_fmt % fp)
+                
+        elif level_def=="pressure_boundary":
+            #msg = "level_def is pressure_boundary. Implementation not complete."
+            ##logging.error(msg)
+            #raise ValueError(msg)
+            # Note 2021-09-13: Inspected the code. Should be correct.
+
+            # Go back to pressure midpoints for model...
+            # Change this line to p_mod = pb_mod for z-staggered
+            # variables
+            p_mod = pb_mod[1:] - 0.5*np.diff(pb_mod)
+            
+            coefs = np.ndarray(shape=(len(pb_ret), len(pb_mod)-1))
+            coefs[:] = 0.
+    
+            # For each retrieval pressure level, compute linear
+            # interpolation coefficients
+            for nret in range(len(pb_ret)):
+                nmod_list = (p_mod < pb_ret[nret]).nonzero()[0]
+                if(len(nmod_list)>0):
+                    nmod = nmod_list[0] - 1
+                    if nmod==-1:
+                        # Constant extrapolation at surface
+                        nmod = 0
+                        coef = 1.
+                    else:
+                        # Normal case:
+                        coef = (pb_ret[nret]-p_mod[nmod+1])/(p_mod[nmod]-p_mod[nmod+1])
+                else:
+                    # Constant extrapolation at atmosphere top
+                    nmod = len(p_mod)-2
+                    coef=0.
+                    
+                coefs[nret, nmod] = coef
+                coefs[nret, nmod+1] = 1.-coef
+    
+        else:
+            msg = "Unknown level_def: " + level_def
+            raise ValueError(msg)
+            
+        return coefs
+
+    @staticmethod
+    def get_pressure_weighting_function(pressure_boundaries, rule):
+        """
+        Compute pressure weighting function according to 'rule'.
+        Valid rules are:
+            - simple (=layer thickness)
+            - connor2008 (not implemented)
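+
+        For illustration: pressure_boundaries = [1000, 600, 200] with
+        rule 'simple' gives pwf = [0.5, 0.5].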
+        """
+        if rule == 'simple':
+            pwf = np.abs(np.diff(pressure_boundaries)/np.ptp(pressure_boundaries))
+        else:
+            raise NotImplementedError("Rule %s not implemented" % rule)
+
+        return pwf
+
+    def locate_domain(self, lat, lon):
+        """
+        Input
+        -----
+        lat: Single values or lists or np.arrays
+        lon: Single values or lists or np.arrays
+        
+        Output
+        ------
+        - xy-coordinates in finest domain that contains the coordinates
+        - finest domain
+
+        NOTE
+        ----
+        self.open_wrf_location_files() must be run first
+        """
+        
+        if not hasattr(self, "_loc_files"):
+            raise RuntimeError("Must call open_wrf_location_files first.")
+            
+        # Work with arrays internally.
+        lat = np.array(lat, ndmin=1)
+        lon = np.array(lon, ndmin=1)
+
+        # Get coordinates of the observation in all domains
+        ndomains = self.namelist["domains"]["max_dom"]
+
+        # Get domain sizes in xy on mass (=unstaggered) grid (hence the -1)
+        dom_size_x = np.array(self.namelist['domains']['e_we'], dtype=float, ndmin=1) - 1
+        dom_size_y = np.array(self.namelist['domains']['e_sn'], dtype=float, ndmin=1) - 1
+
+        # Initialize  output
+        xy = np.zeros((len(lat), 2))
+        xy[:] = np.nan
+        finest_domain = np.zeros((len(lat), ), int)
+
+        # Since wrf.ll_to_xy takes very long, I save a bit of time here
+        # by starting in the finest domain and processing only
+        # observations that weren't previously found.
+        for n in range(ndomains-1, -1, -1):
+            # Get xy for this domain
+            # Only process what you haven't processed
+            sel = np.where(finest_domain == 0)[0]
+
+            # In case all domains were already set
+            if len(sel)==0:
+                break
+
+            x, y = wrf.ll_to_xy(wrfin=self._loc_files[n],
+                                latitude=lat[sel],
+                                longitude=lon[sel],
+                                meta=False,
+                                as_int=False)
+
+
+            # For each domain, check if the observation is inside the
+            # domain extent
+            # I put the edges on -0.5 and e_we/e_sn - 0.5.
+            # Reason is that wrf.ll_to_xy(..., as_int = True) returns
+            # -0.2 as 0, so it's inside the domain.
+            # Also, for e_we=99, e_sn = 92, ref_lon=9, ref_lat=51.5:
+            # wrf.ll_to_xy(wrfin=self._loc_files[0], latitude=51.5, longitude=9., meta=False, as_int=False)
+            # array([48.49996124, 45.00003737])
+            # This means that int indices denote exactly the mass-staggered point.
+
+            # To be able to iterate over x and y in case they're scalars:
+            x = np.array(x, ndmin=1)
+            y = np.array(y, ndmin=1)
+
+            # Test: inside domain?
+            # The -1 here are because of 0-based indices, and the -/+ 0.5 are
+            # because that's halfway to the next mass-staggered point and
+            # I treat the WRF grid as boxes.
+            x_in = [(-0.5 <= x_) and (x_ <= dom_size_x[n] - 1 + 0.5) for x_ in x]
+            y_in = [(-0.5 <= y_) and (y_ <= dom_size_y[n] - 1 + 0.5) for y_ in y]
+
+            # Save domain, x and y at these locations
+            in_this_dom = np.where(np.logical_and(x_in, y_in))[0]
+            finest_domain[sel[in_this_dom]] = n + 1
+            # If domain = 1, set _all_ xy to see where they end up
+            if n==0:
+                xy[sel, 0] = x
+                xy[sel, 1] = y
+            else:
+                xy[sel[in_this_dom], 0] = x[in_this_dom]
+                xy[sel[in_this_dom], 1] = y[in_this_dom]
+
+
+        # Return the indices and domain
+        return xy, finest_domain
+
+    def get_groups_space_time(self, dat, time_bins, only_in=False):
+        """
+        Returns a dictionary of lists of indices of observations in dat,
+        where keys are tuples of (time bin, x bin, y bin and wrf
+        domain), and values are indices of the observations that fall
+        within this bin.
+        
+        Arguments
+        ---------
+        
+        dat : list
+            List containing measurement locations:
+            date (same type as time_bins), latitude, longitude
+        time_bins : list
+            List of boundaries of time bins (I use datetime.datetime
+            objects)
+        only_in : boolean
+            If only_in is True, groups that fall outside the time_bins
+            and domain are not returned.
+
+        """
+    
+        # Time groups
+        id_t = np.array([bisect.bisect_right(time_bins, dat_date)
+                         for dat_date in dat['date']],
+                        int)
+    
+        # Spatial groups (indices and domain)
+        id_xy_f, dom = self.locate_domain(dat['latitude'], dat['longitude'])
+    
+        id_xy = np.round(id_xy_f).astype(int)
+    
+        
+    
+        # Version of only_in with sel: might be faster if sel is short -
+        # I don't know! But the results for my test case were identical
+        # (meaning 'domain' looked correct and identical) for both
+        # options for only_in.
+
+        # if only_in:
+        #     # Yes, 0<id_t is correct: in bisect_right, it means the
+        #     # value is below the lowest sequence value.
+        #     time_in = np.logical_and(0<id_t, id_t<len(time_bins))
+        #     space_in = dom != 0
+        #     sel = np.where(np.logical_and(time_in, space_in))[0]
+        # 
+        # else:
+        #     sel = range(len(id_t))
+        # 
+        # indices = get_index_groups(id_t[sel], id_xy[sel, 0], id_xy[sel, 1], dom[sel])
+        # 
+        # # Now have to account for sel again!
+        # if only_in:
+        #     for k, v in indices.iteritems():
+        #         indices[k] = sel[v]
+    
+        # Version of only_in without sel: might be faster if sel is
+        # long - I don't know! But the results for my test case were
+        # identical (meaning 'domain' looked correct and identical)
+    
+        # Indices for all groups:
+        indices = utilities.get_index_groups(id_t, id_xy[:, 0], id_xy[:, 1], dom)
+    
+        # Throw the out-of-domain ones out here already
+        if only_in:
+            # Remove the index groups where domain is 0 (= outside of
+            # domain). Need to iterate over a list, because with an
+            # iterator, python complains that the dictionary changed
+            # size during iteration.
+            for k in list(indices.keys()):
+                if k[3] == 0:
+                    del indices[k]
+            # Equivalent to the above 3 lines, don't know what's faster:
+            # groups = {k: v for k, v in groups.iteritems() if k[3] != 0}
+    
+        return indices
+
+
+    @staticmethod
+    def times_in_wrf_file(ncf):
+        """
+        Returns the times in netCDF4.Dataset ncf as datetime object
+        """
+        times_nc = ncf.variables["Times"]
+        times_chr = []
+        for nt in range(times_nc.shape[0]):
+            # freum 2021-07-11: with the migration to python3, the
+            # string conversion had to be replaced. Hopefully "utf-8"
+            # always works.
+            # times_chr.append(times_nc[nt, :].tostring())
+            times_chr.append(str(times_nc[nt, :], "utf-8"))
+
+        times_dtm = [dt.datetime.strptime(t_chr, "%Y-%m-%d_%H:%M:%S")
+                     for t_chr in times_chr]
+
+        return times_dtm
+
+    def wrf_times(self, file_list):
+        """Read all times in a list of wrf files
+
+        Output
+        ------
+        - 1D-array containing all times
+        - 1D-array containing start indices of each file
+        """
+
+        times = list()
+        start_indices = np.ndarray((len(file_list), ), int)
+        for nf in range(len(file_list)):
+            ncf = nc.Dataset(file_list[nf])
+            times_this = self.times_in_wrf_file(ncf)
+            start_indices[nf] = len(times)
+            times += times_this
+            ncf.close()
+
+        return times, start_indices
+
+    def open_wrf_location_files(self):
+        """
+        Keep a small wrf file (wrfinput) open for each domain in order
+        to locate observations.
+        Stores the open datasets in self._loc_files.
+        
+        Note: Should be edited out of the code.
+        """
+
+        ndomains = self.namelist["domains"]["max_dom"]
+        path = self.settings["run_dir"]
+        pattern = "wrfinput_d%02d"
+        self._loc_files = list()
+        for nd in range(1, ndomains+1):
+            fp = os.path.join(path, pattern % nd)
+            self._loc_files.append(nc.Dataset(fp, "r"))
+
+    def close_wrf_location_files(self):
+        """See _open_wrf_location_files"""
+        for loc_file in self._loc_files:
+            loc_file.close()
+
+    def wrf_filename_times(self, prefix):
+        """Get timestamps in wrf file names in run directory."""
+
+        # List all filenames
+        files = self.get_wrf_filenames(prefix + "*")
+        # Only use d01 files, pattern should be the same for all domains
+        pattern = os.path.join(self.settings["run_dir"], prefix)
+        files = [f for f in files if re.search(pattern, f)]
+        # Extract timestamp from filename
+        # Format is %Y-%m-%d_%H:%M:%S at the end of the filename
+        pattern_time = "%Y-%m-%d_%H:%M:%S"
+        len_tstamp = len(pattern_time) + 2
+        times = [dt.datetime.strptime(f[-len_tstamp:], pattern_time)
+                 for f in files]
+
+        return times
+
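+    # Worked example for wrf_filename_times (hypothetical file name): for
+    # "wrfout_d01_2019-07-22_15:00:00", the trailing
+    # len("%Y-%m-%d_%H:%M:%S") + 2 = 19 characters are "2019-07-22_15:00:00";
+    # the "+ 2" accounts for %Y expanding to four digits while occupying only
+    # two characters in the format string.
+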
+    def get_wrf_filenames(self, glob_pattern):
+        """
+        Gets the filenames in self.settings["run_dir"] that follow
+        glob_pattern, excluding those that end with
+        self.settings["original_save_suffix"]
+        """
+        path = self.settings["run_dir"]
+        # All files...
+        wfiles = glob.glob(os.path.join(path, glob_pattern))
+        # All originals
+        orig_suf = self.settings["original_save_suffix"]
+        opattern = glob_pattern + orig_suf
+        ofiles = glob.glob(os.path.join(path, opattern))
+
+        # All files except all originals:
+        files = [x for x in wfiles if x not in ofiles]
+
+        # I need this sorted too often to not do it here.
+        files = np.sort(files).tolist()
+        return files
+
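+    # Example for get_wrf_filenames (hypothetical file names): with
+    # original_save_suffix ".original" and glob_pattern "wrfout_d01*", a
+    # run_dir containing
+    #     wrfout_d01_2019-07-22_15:00:00
+    #     wrfout_d01_2019-07-22_15:00:00.original
+    # returns only the first file, since the saved original is excluded.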
+
+    def sample_total_columns(self, dat, loc, fields_list):
+        """
+        Sample total columns of the tracers in fields_list from the WRF
+        output in self.settings["run_dir"], at the locations id_xy in
+        their domains and at the (continuous) time indices id_t. Files
+        and the indices therein are identified via id_t and
+        file_start_time_indices.
+        All quantities needed to compute total columns from profiles
+        are in dat (averaging kernel, prior, ...).
+
+        Arguments
+        ---------
+        dat (:class:`dict`)
+            Result of wrfhelper.read_sampling_coords. Used here:
+            prior, prior_profile, averaging_kernel, psurf,
+            pressure_levels, level_def, pressure_weighting_function,
+            sounding_id.
+            If psurf or any of pressure_levels are nan, wrf's own
+            surface pressure is used and a pressure axis is constructed
+            from it and the number of averaging kernel levels. This
+            allows sampling with synthetic data that don't have
+            pressure information.
+            If pressure_weighting_function is absent or nan, a simple
+            one is constructed from the retrieval pressure boundaries.
+        loc (:class:`dict`)
+            A dictionary with all location-related input for sampling,
+            computed in wrfout_sampler. Keys:
+            id_xy, domain: Domain coordinates
+            id_t: Timestep (continuous throughout all files)
+            frac_t: Interpolation coefficient between id_t and id_t+1:
+                    t_obs = frac_t*t[id_t] + (1-frac_t)*t[id_t+1]
+            file_start_time_indices: Time index at which a new wrfout
+                                     file starts
+            files: names of wrfout files.
+        fields_list (:class:`list`)
+            The fields to sample total columns from.
+
+        Output
+        ------
+        sampled_columns (:class:`array`)
+            A 2D-array of sampled columns.
+            Shape: (len(dat["prior"]), len(fields_list))
+        """
+
+        # Initialize output
+        tc = np.ndarray(shape=(len(dat["prior"]), len(fields_list)), dtype=float)
+        tc[:] = float("nan")
+
+        # Process by domain
+        UD = list(set(loc["domain"]))
+        for dom in UD:
+            idd = np.nonzero(loc["domain"] == dom)[0]
+            # Process by id_t
+            UT = list(set(loc["id_t"][idd]))
+            for time_id in UT:
+                # Coordinates to process
+                idt = idd[np.nonzero(loc["id_t"][idd] == time_id)[0]]
+                # Get tracer ensemble profiles
+                profiles = self._read_and_intrp_v(loc, fields_list, time_id, idt)
+                # List, len=len(fields_list), shape of each: (len(idt),nz)
+                # Get pressure axis:
+                #paxis = self.read_and_intrp(wh_names, id_ts, frac_t, id_xy, "P_HYD")/1e2 # Pa -> hPa
+                psurf = self._read_and_intrp_v(loc, ["PSFC"], time_id, idt)[0]/1.e2 # Pa -> hPa
+                # Shape: (len(idt),)
+                ptop = float(self.namelist["domains"]["p_top_requested"])/1.e2
+                # Shape: (len(idt),)
+                znw = self._read_and_intrp_v(loc, ["ZNW"], time_id, idt)[0]
+                #Shape:(len(idt),nz)
+
+                # DONE reading from file.
+                # Here it starts to make sense to loop over individual observations
+                for nidt in range(len(idt)):
+                    nobs = idt[nidt]
+                    # Construct model pressure layer boundaries
+                    pb_mod = self.get_pressure_boundaries_znw(znw[nidt, :], psurf[nidt], ptop)
+
+                    if (np.diff(pb_mod) >= 0).any():
+                        msg = ("Model pressure boundaries for observation %d " + \
+                               "are not monotonically decreasing! Investigate.") % nobs
+                        raise ValueError(msg)
+
+                    # Construct retrieval pressure layer boundaries
+                    if dat["level_def"][nobs] == "layer_average":
+                        if np.any(np.isnan(dat["pressure_levels"][nobs])) \
+                           or np.isnan(dat["psurf"][nobs]):
+                            # Code for synthetic data without a pressure axis,
+                            # but with an averaging kernel:
+                            # Use wrf's surface and top pressure
+                            nlayers = len(dat["averaging_kernel"][nobs])
+                            pb_ret = np.linspace(psurf[nidt], ptop, nlayers+1)
+                        else:
+                            pb_ret = self.get_pressure_boundaries_paxis(
+                                    dat["pressure_levels"][nobs],
+                                    dat["psurf"][nobs])
+                    elif dat["level_def"][nobs] == "pressure_boundary":
+                        if np.any(np.isnan(dat["pressure_levels"][nobs])):
+                            # Code for synthetic data without a pressure axis,
+                            # but with an averaging kernel:
+                            # Use wrf's surface and top pressure
+                            nlevels = len(dat["averaging_kernel"][nobs])
+                            pb_ret = np.linspace(psurf[nidt], ptop, nlevels)
+                        else:
+                            pb_ret = dat["pressure_levels"][nobs]
+
+                    if (np.diff(pb_ret) >= 0).any():
+                        msg = ("Retrieval pressure boundaries for " + \
+                               "observation %d are not monotonically " + \
+                               "decreasing! Investigate.") % nobs
+                        raise ValueError(msg)
+
+                    # Get vertical integration coefficients (i.e. to
+                    # "interpolate" from model to retrieval grid)
+                    coef_matrix = self.get_int_coefs(pb_ret, pb_mod, dat["level_def"][nobs])
+
+                    # Model retrieval with averaging kernel and prior profile
+                    if "pressure_weighting_function" in list(dat.keys()):
+                        pwf = dat["pressure_weighting_function"][nobs]
+                    if (not "pressure_weighting_function" in list(dat.keys())) or np.any(np.isnan(pwf)):
+                        # Construct pressure weighting function from
+                        # pressure boundaries
+                        pwf = self.get_pressure_weighting_function(pb_ret, rule="simple")
+                        
+                    # Compute pressure-weighted averaging kernel
+                    avpw = pwf*dat["averaging_kernel"][nobs]
+
+                    # Get prior
+                    prior_col = dat["prior"][nobs]
+                    prior_profile = dat["prior_profile"][nobs]
+                    if np.isnan(prior_col): # compute prior
+                        prior_col = np.dot(pwf, prior_profile)
+
+                    # Compute total columns
+                    for nf in range(len(fields_list)):
+                        # Integrate model profile
+                        profile_intrp = np.matmul(coef_matrix, profiles[nf][nidt, :])
+
+                        # Model retrieval
+                        tc[nobs, nf] = prior_col + np.dot(avpw, profile_intrp - prior_profile)
+
+                        # Test phase: save pb_ret, pb_mod, coef_matrix,
+                        # one profile for manual checking
+
+                        # dat_save = dict(pb_ret=pb_ret,
+                        #                pb_mod=pb_mod,
+                        #                coef_matrix=coef_matrix,
+                        #                ens_profile=ens_profiles[0],
+                        #                profile_intrp=profile_intrp,
+                        #                id=dat.id)
+                        #
+                        #out = open("model_profile_%d.pkl"%dat.id, "w")
+                        #cPickle.dump(dat_save, out, 0)
+        # Average over footprint
+        if self.settings["footprint_samples_dim"] > 1:
+            indices = utilities.get_index_groups(dat["sounding_id"])
+            
+            # Sanity check: each footprint must have exactly
+            # footprint_samples_dim**2 samples.
+            lens = [len(group) for group in list(indices.values())]
+            correct_len = self.settings["footprint_samples_dim"]**2
+            if np.any([len_ != correct_len for len_ in set(lens)]):
+                raise ValueError("Not all footprints have %d samples" % correct_len)
+            # Paranoid mode: also confirm that the indices within each group
+            # are consecutive numbers (the peak-to-peak range of n consecutive
+            # integers is n - 1).
+            ranges = [np.ptp(group) for group in list(indices.values())]
+            if np.any([ptp != correct_len - 1 for ptp in set(ranges)]):
+                raise ValueError("Not all footprints have consecutive samples")
+            
+            tc_original = copy.deepcopy(tc)
+            tc = utilities.apply_by_group(np.average, tc_original, indices)
+            
+        return tc
+
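+    # Minimal sketch of the column operator applied in sample_total_columns
+    # (illustrative only; the symbols below are local to this comment): with
+    # pressure weighting function h, averaging kernel a, retrieval prior
+    # profile x_a, prior column c_a and the model profile m interpolated onto
+    # the retrieval layers,
+    #
+    #     column = c_a + sum_l h[l] * a[l] * (m[l] - x_a[l])
+    #
+    # e.g. h = [0.5, 0.5], a = [1.0, 0.8], x_a = [400., 400.], c_a = 400. and
+    # m = [402., 401.] give 400. + 0.5*1.0*2. + 0.5*0.8*1. = 401.4 (in the
+    # units of the input profiles).
+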
+    @staticmethod
+    def _read_and_intrp_v(loc, fields_list, time_id, idp):
+        """
+        Helper function for sample_total_columns: a vectorized version
+        of read_and_intrp. Reads the requested fields and interpolates
+        them linearly in time.
+
+        Arguments
+        ----------
+        loc (:class:`dict`)
+            Passed through from sample_total_columns, see there.
+        fields_list (:class:`list` of :class:`str`)
+            List of netcdf-variables to process.
+        time_id (:class:`int`)
+            Continuous time index (counted across all files in loc) to read.
+        idp (:class:`array` of :class:`int`)
+            Indices for id_xy, domain and frac_t in loc (i.e.
+            observations) to process.
+        
+        Output
+        ------
+        List of temporally interpolated fields, one entry per member of
+        fields_list.
+        """
+
+        var_intrp_l = list()
+
+        # Check we were really called with observations for just one domain
+        domains = set(loc["domain"][idp])
+        if len(domains) > 1:
+            raise ValueError("I can only operate on idp with identical domains.")
+        dom = domains.pop()
+
+        # Select input files
+        id_file0 = bisect.bisect_right(loc["file_start_time_indices"][dom], time_id) - 1
+        id_file1 = bisect.bisect_right(loc["file_start_time_indices"][dom], time_id+1) - 1
+        if id_file0 < 0 or id_file1 < 0:
+            raise ValueError("This shouldn't happen.")
+
+        # Get time id in file
+        id_t_file0 = time_id - loc["file_start_time_indices"][dom][id_file0]
+        id_t_file1 = time_id+1 - loc["file_start_time_indices"][dom][id_file1]
+
+        # Open files
+        nc0 = nc.Dataset(loc["files"][dom][id_file0], "r")
+        nc1 = nc.Dataset(loc["files"][dom][id_file1], "r")
+        # Per field to sample
+        for field in fields_list:
+            # Read input file
+            field0 = wrf.getvar(wrfin=nc0,
+                                varname=field,
+                                timeidx=id_t_file0,
+                                squeeze=False,
+                                meta=False)
+
+            field1 = wrf.getvar(wrfin=nc1,
+                                varname=field,
+                                timeidx=id_t_file1,
+                                squeeze=False,
+                                meta=False)
+
+            if len(field0.shape) == 4:
+                # Sample field at timesteps before and after observation
+                # They are ordered nt x nz x ny x nx
+                # var0 will have shape (len(idp),len(profile))
+                var0 = field0[0, :, loc["id_xy"][idp, 1], loc["id_xy"][idp, 0]]
+                var1 = field1[0, :, loc["id_xy"][idp, 1], loc["id_xy"][idp, 0]]
+                # Repeat frac_t for profile size
+                frac_t_ = np.array(loc["frac_t"][idp]).reshape((len(idp), 1)).repeat(var0.shape[1], 1)
+            elif len(field0.shape) == 3:
+                # var0 will have shape (len(idp),)
+                var0 = field0[0, loc["id_xy"][idp, 1], loc["id_xy"][idp, 0]]
+                var1 = field1[0, loc["id_xy"][idp, 1], loc["id_xy"][idp, 0]]
+                frac_t_ = np.array(loc["frac_t"][idp])
+            elif len(field0.shape) == 2:
+                # var0 will have shape (len(idp),len(profile))
+                # This is for ZNW, which is saved as (time_coordinate,
+                # vertical_coordinate)
+                var0 = field0[[0]*len(idp), :] 
+                var1 = field1[[0]*len(idp), :]
+                frac_t_ = np.array(loc["frac_t"][idp]).reshape((len(idp), 1)).repeat(var0.shape[1], 1)
+            else:
+                raise ValueError("Can't deal with field with %d dimensions." % len(field0.shape))
+
+            # Interpolate in time
+            var_intrp_l.append(var0*frac_t_ + var1*(1. - frac_t_))
+
+        nc0.close()
+        nc1.close()
+
+        return var_intrp_l
+
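+    # Example of the temporal interpolation in _read_and_intrp_v (illustrative):
+    # with frac_t as defined in sample_total_columns, a sample taken a quarter
+    # of the way from output time t0 towards t1 has frac_t = 0.75, so
+    #     var = 0.75*var(t0) + 0.25*var(t1).
+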
+    @staticmethod
+    def read_sampling_coords(sampling_coords_file, id0=None, id1=None):
+        """Read in samples"""
+
+        ncf = nc.Dataset(sampling_coords_file, "r")
+        if id0 is None:
+            id0 = 0
+        if id1 is None:
+            id1 = len(ncf.dimensions['soundings'])
+
+        dat = dict(
+            sounding_id=np.array(ncf.variables["sounding_id"][id0:id1]),
+            date=ncf.variables["date"][id0:id1],
+            latitude=np.array(ncf.variables["latitude"][id0:id1]),
+            longitude=np.array(ncf.variables["longitude"][id0:id1]),
+            latc_0=np.array(ncf.variables["latc_0"][id0:id1]),
+            latc_1=np.array(ncf.variables["latc_1"][id0:id1]),
+            latc_2=np.array(ncf.variables["latc_2"][id0:id1]),
+            latc_3=np.array(ncf.variables["latc_3"][id0:id1]),
+            lonc_0=np.array(ncf.variables["lonc_0"][id0:id1]),
+            lonc_1=np.array(ncf.variables["lonc_1"][id0:id1]),
+            lonc_2=np.array(ncf.variables["lonc_2"][id0:id1]),
+            lonc_3=np.array(ncf.variables["lonc_3"][id0:id1]),
+            prior=np.array(ncf.variables["prior"][id0:id1]),
+            prior_profile=np.array(ncf.variables["prior_profile"][id0:id1]),
+            averaging_kernel=np.array(ncf.variables["averaging_kernel"][id0:id1]),
+            pressure_levels=np.array(ncf.variables["pressure_levels"][id0:id1]),
+            pressure_weighting_function=np.array(ncf.variables["pressure_weighting_function"][id0:id1]),
+            level_def=ncf.variables["level_def"][id0:id1],
+            psurf=np.array(ncf.variables["psurf"][id0:id1])
+            )
+
+        ncf.close()
+
+        # Convert level_def from its NetCDF character-array format to strings
+        dat["level_def"] = nc.chartostring(dat["level_def"])
+
+        # Convert date to datetime object
+        dat["time"] = [dt.datetime(*x) for x in dat["date"]]
+
+        return dat
+
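+    # Hypothetical usage sketch of read_sampling_coords (file name and slice
+    # bounds are examples only):
+    #
+    #     dat = WRFChemHelper.read_sampling_coords("sampling_coords.nc", 0, 100)
+    #     print(dat["sounding_id"].shape, dat["time"][0])
+    #
+    # reads soundings 0..99 into numpy arrays plus a list of datetimes.
+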
+    @staticmethod
+    def write_simulated_columns(obs_id, simulated, nmembers, outfile):
+        """Write simulated observations to file."""
+
+        # Output format: see obs_xco2_fr
+
+        f = io.CT_CDF(outfile, method="create")
+
+        dimid = f.createDimension("sounding_id", size=None)
+        dimid = ("sounding_id",)
+        savedict = io.std_savedict.copy()
+        savedict["name"] = "sounding_id"
+        savedict["dtype"] = "int64"
+        savedict["long_name"] = "Unique_Dataset_observation_index_number"
+        savedict["units"] = ""
+        savedict["dims"] = dimid
+        savedict["comment"] = "Format as in input"
+        savedict["values"] = obs_id.tolist()
+        f.add_data(savedict, nsets=0)
+
+        dimmember = f.createDimension("nmembers", size=nmembers)
+        dimmember = ("nmembers",)
+        savedict = io.std_savedict.copy()
+        savedict["name"] = "column_modeled"
+        savedict["dtype"] = "float"
+        savedict["long_name"] = "Simulated total column"
+        savedict["units"] = "??"
+        savedict["dims"] = dimid + dimmember
+        savedict["comment"] = "Simulated model value created by WRFChemOO"
+        savedict["values"] = simulated.tolist()
+        f.add_data(savedict, nsets=0)
+
+        f.close()
+
+    @staticmethod
+    def save_file_with_timestamp(file_path, out_dir, suffix=""):
+        """ Saves a file to with a timestamp"""
+        nowstamp = dt.datetime.now().strftime("_%Y-%m-%d_%H:%M:%S")
+        new_name = os.path.basename(file_path) + suffix + nowstamp
+        new_path = os.path.join(out_dir, new_name)
+        shutil.copy2(file_path, new_path)
+    
+
+if __name__ == "__main__":
+    pass
diff --git a/da/tools/wrfchem/wrfout_sampler.py b/da/tools/wrfchem/wrfout_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..03bc1fae0743e32259ab038fdbbe56f3b62d6218
--- /dev/null
+++ b/da/tools/wrfchem/wrfout_sampler.py
@@ -0,0 +1,246 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jul 22 15:07:13 2019
+
+@author: friedemann
+"""
+
+# Samples wrfchem history files (wrfout*) for CTDAS
+
+# This is called as external executable from CTDAS
+# to allow simple parallelization
+#
+# Usage:
+
+# wrfout_sampler.py --arg1 val1 --arg2 val2 ...
+# Arguments: See parser in code below
+
+import os
+import sys
+#import itertools
+#import bisect
+import copy
+import numpy as np
+import netCDF4 as nc
+
+# Import some CTDAS tools
+pd = os.path.pardir
+inc_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                        pd, pd, pd)
+inc_path = os.path.abspath(inc_path)
+sys.path.append(inc_path)
+from da.tools.wrfchem.wrfchem_helper import WRFChemHelper
+from da.tools.wrfchem.utilities import utilities
+import argparse
+
+########## Parse options
+parser = argparse.ArgumentParser()
+parser.add_argument("--nproc", type=int,
+                    help="ID of this sampling process (0 ... nprocs-1)")
+parser.add_argument("--nprocs", type=int,
+                    help="Number of sampling processes")
+parser.add_argument("--sampling_coords_file", type=str,
+                    help="File with sampling coordinates as created " + \
+                         "by CTDAS column samples object")
+parser.add_argument("--run_dir", type=str,
+                    help="Directory with wrfout files")
+parser.add_argument("--original_save_suffix", type=str,
+                    help="Just leave this on .original")
+parser.add_argument("--nmembers", type=int,
+                    help="Number of tracer ensemble members")
+parser.add_argument("--tracer_optim", type=str,
+                    help="Tracer that was optimized (e.g. CO2 for " + \
+                         "ensemble members CO2_000 etc.)")
+parser.add_argument("--outfile_prefix", type=str,
+                    help="One process: output file. More processes: " + \
+                         "output file is <outfile_prefix>.<nproc>.slice")
+parser.add_argument("--footprint_samples_dim", type=int,
+                    help="Sample column footprint at n x n points")
+
+args = parser.parse_args()
+settings = copy.deepcopy(vars(args))
+
+# Minimal shell-based logging - should eventually be replaced by the logging module
+wd = os.getcwd()
+os.makedirs("log", exist_ok=True)
+
+logfile = os.path.join(wd, "log/wrfout_sampler." + str(settings['nproc']) + ".log")
+
+os.system("touch " + logfile)
+os.system("rm " + logfile)
+os.system("echo 'Process " + str(settings['nproc']) + " of " + str(settings['nprocs']) + ": start' >> " + logfile)
+os.system("date >> " + logfile)
+
+########## Initialize wrfhelper
+wrfhelper = WRFChemHelper(settings)
+wrfhelper.validate_settings(['sampling_coords_file',
+                             'run_dir',
+                             'nproc',
+                             'nprocs',
+                             'original_save_suffix', # necessary for selecting filename
+                             'nmembers', # special case 0: sample 'tracer_optim'
+                             'tracer_optim',
+                             'outfile_prefix',
+                             'footprint_samples_dim'])
+
+cwd = os.getcwd()
+os.chdir(wrfhelper.settings['run_dir'])
+
+#wrfhelper.namelist = wrfhelper.read_namelist(wrfhelper.settings['run_dir'])
+wrfhelper.namelist = wrfhelper.read_namelist(".")
+
+
+########## Figure out which samples to process
+# Get number of samples
+ncf = nc.Dataset(settings['sampling_coords_file'], "r")
+nsamples = len(ncf.dimensions['soundings'])
+ncf.close()
+
+id0, id1 = utilities.get_slicing_ids(nsamples, settings['nproc'], settings['nprocs'])
+
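+# Illustrative note on the slicing assumed here: get_slicing_ids is expected to
+# split nsamples roughly evenly, e.g. nsamples=10, nprocs=3 might yield the
+# half-open ranges (0, 4), (4, 7) and (7, 10) for nproc 0, 1 and 2.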
+os.system("echo 'id0=" + str(id0) + "' >> " + logfile)
+os.system("echo 'id1=" + str(id1) + "' >> " + logfile)
+
+########## Read samples from coord file
+dat = wrfhelper.read_sampling_coords(settings['sampling_coords_file'], id0, id1)
+
+os.system("echo 'Data read, len=" + str(len(dat['sounding_id'])) + "' >> " + logfile)
+
+
+########## Locate samples in wrf domains
+
+# Take care of special case without ensemble
+nmembers = settings['nmembers']
+if nmembers == 0:
+    # Special case: sample 'tracer_optim', don't add member suffix
+    member_names = [settings['tracer_optim']]
+    nmembers = 1
+else:
+    member_names = [settings['tracer_optim'] + "_%03d" % nm for nm in range(nmembers)]
+
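+# For example, tracer_optim="CO2" with nmembers=3 gives
+# member_names = ["CO2_000", "CO2_001", "CO2_002"].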
+
+# Keep a description of a small wrf file for each domain in memory to
+# locate observations.
+# This concept is probably obsolete - doesn't save time, and
+# locate_domain is parallelized anyway
+wrfhelper.open_wrf_location_files()
+
+if settings["footprint_samples_dim"]==1:
+    # Locate all observations in space
+    # This function Wouldn't work for moving nests.
+    id_xy_f, domain = wrfhelper.locate_domain(dat['latitude'], dat['longitude'])
+    # Assume box averages and don't interpolate horizontally
+    id_xy = np.round(id_xy_f).astype(int)
+else:
+    # Return the whole thing (needed in wrfhelper.sample_total_columns)
+    raise NotImplementedError("To do: wrfhelper.get_footprint_sampling_points")
+    dat_fs = wrfhelper.get_footprint_sampling_points(dat)
+    # Locate (free)
+    id_xy_f_free, domain_free = wrfhelper.locate_domain(dat_fs['latitude'], dat_fs['longitude'])
+    id_xy_free = np.round(id_xy_f_free).astype(int)
+    # Determine max domain
+    domain_fs = None
+    raise NotImplementedError("To do: domain_fs")
+    # Sample again with domain restriction - no need to return it again
+    id_xy_f, _ = wrfhelper.locate_domain(dat_fs['latitude'], dat_fs['longitude'], domain_fs)
+    id_xy = np.round(id_xy_f).astype(int)
+    # Thin out domain_fs to pass it to determination of id_t and frac_t below
+    domain = domain_fs[::settings["footprint_samples_dim"]]
+    
+wrfhelper.close_wrf_location_files()
+
+# Locate in time: Which file, time index, and temporal interpolation
+# factor.
+# MAYBE make this a function. See which quantities I need later.
+id_t = np.zeros_like(domain)
+frac_t = np.ndarray(id_t.shape, float)
+frac_t[:] = float("nan")
+# Add a little flexibility by doing this per domain - namelists allow
+# different output frequencies per domain.
+wrfout_files = dict()
+wrfout_times = dict()
+wrfout_start_time_ids = dict()
+
+UD = list(set(domain))
+for dom in UD:
+    os.system("echo 'Processing domain " + str(dom) + "' >> " + logfile)
+    idd = np.where(domain == dom)[0]
+    # Get full time vector
+    wrfout_files[dom] = wrfhelper.get_wrf_filenames("wrfout_d%02d_*00" % dom)
+    wrfout_times[dom], wrfout_start_time_ids[dom] = wrfhelper.wrf_times(wrfout_files[dom])
+
+    # time id
+    for idd_ in idd:
+        # Look where it sorts in
+        tmp = [i
+               for i in range(len(wrfout_times[dom])-1)
+               if wrfout_times[dom][i] <= dat['time'][idd_] \
+                  and dat['time'][idd_] < wrfout_times[dom][i+1]]
+        # Catch the case that the observation took place exactly at the
+        # last timestep
+        if len(tmp) == 1:
+            id_t[idd_] = tmp[0]
+            time0 = wrfout_times[dom][id_t[idd_]]
+            time1 = wrfout_times[dom][id_t[idd_]+1]
+            frac_t[idd_] = (time1 - dat['time'][idd_]).total_seconds() / (time1 - time0).total_seconds()
+        else: # len(tmp) should be 0 here; > 1 would mean non-monotonic wrfout times
+            if len(tmp) > 1:
+                msg = "Sample %d matches more than one wrfout interval - investigate." % idd_
+                os.system("echo '" + msg + "' >> " + logfile)
+                raise ValueError(msg)
+            if dat['time'][idd_] == wrfout_times[dom][-1]:
+                id_t[idd_] = len(wrfout_times[dom])-1
+                frac_t[idd_] = 1
+            else:
+                msg = "Sample %d, sounding_id %s: outside of simulated time."%(idd_, dat['sounding_id'][idd_])
+                os.system("echo '" + msg + "' >> " + logfile)
+                raise ValueError(msg)
+
+
+# Now read the data
+# Input: id_xy, domain, id_t, wrfout_start_time_ids, frac_t
+# Output: sampled columns
+# All input related to location:
+if settings["footprint_samples_dim"]>1:
+    domain = domain_fs
+    id_t = np.repeat(id_t, settings["footprint_samples_dim"])
+    frac_t = np.repeat(frac_t, settings["footprint_samples_dim"])
+
+loc_input = dict(id_xy=id_xy, domain=domain,
+                 id_t=id_t, frac_t=frac_t,
+                 files=wrfout_files, file_start_time_indices=wrfout_start_time_ids)
+
+ens_sim = wrfhelper.sample_total_columns(dat, loc_input, member_names)
+
+# Write results to file
+obs_ids = dat['sounding_id']
+# Remove simulations that are nan (=not in domain)
+if ens_sim.shape[0] > 0:
+    valid = np.apply_along_axis(lambda arr: not np.any(np.isnan(arr)), 1, ens_sim)
+    obs_ids_write = obs_ids[valid]
+    ens_sim_write = ens_sim[valid, :]
+else:
+    obs_ids_write = obs_ids
+    ens_sim_write = ens_sim
+
+if settings['nprocs'] == 1:
+    outfile = settings['outfile_prefix']
+else:
+    # Create output files with the appendix ".<nproc>.slice"
+    # Format <nproc> so that they can later be easily sorted.
+    len_nproc = int(np.floor(np.log10(settings['nprocs']))) + 1
+    outfile = settings['outfile_prefix'] + (".%0" + str(len_nproc) + "d.slice") % settings['nproc']
+
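+# Example (prefix name is hypothetical): outfile_prefix="columns.nc" with
+# nprocs=16 and nproc=7 gives outfile "columns.nc.07.slice" (len_nproc=2).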
+os.system("echo 'Writing output file '" + os.path.join(wrfhelper.settings['run_dir'], outfile) + " >> " + logfile)
+
+wrfhelper.write_simulated_columns(obs_id=obs_ids_write,
+                                  simulated=ens_sim_write,
+                                  nmembers=nmembers,
+                                  outfile=outfile)
+
+os.chdir(cwd)
+
+os.system("echo 'Done' >> " + logfile)