Commit 8f88ead7 authored by Peters, Wouter's avatar Peters, Wouter
Browse files

Slowly getting closer to the new design. Observations and StateVector are now...

Slowly getting closer to the new design. Observations and StateVector are now class based, but objects are getting more elaborate each time. One remaining problem is the relation between the StateVector, the ensemble members, and the observations. These are all linked but no object of one type can easily contain the other. 

For instance, does a model mixing ratio sample belong to an observation object because that is what it represents and is derived from? Or is it part of an ensemble member object because each member creates one sample set? And does a member object belong to the state vector and if so, how does it play together with the other time steps in the filter? Are these all separate StateVector objects or is there only one StateVector with nlag sets of data??

The issue with nlag is not nicely resolved now.

Next task is to build the optimizer such that it takes all info from the different objects and performs a LSQM.
parent cfc6bb67
...@@ -12,6 +12,7 @@ import os ...@@ -12,6 +12,7 @@ import os
import sys import sys
import rc import rc
import logging import logging
import datetime
identifier = 'CarbonTracker CO2' identifier = 'CarbonTracker CO2'
...@@ -23,247 +24,502 @@ needed_rc_items = ['obs.input.dir', ...@@ -23,247 +24,502 @@ needed_rc_items = ['obs.input.dir',
'deltaco2.prefix', 'deltaco2.prefix',
'regtype'] 'regtype']
################### Begin Class DaInfo ###################

class DaInfo():
    """
    Information on the data assimilation system used. For CarbonTracker, this is an
    rc-file with settings. The settings list includes at least the values named in
    the module-level ``needed_rc_items``.
    """

    def __init__(self, rcfilename):
        """Initialization occurs from a passed rc-file name."""
        self.LoadRc(rcfilename)
        self.ValidateRC()

    def __str__(self):
        """String representation of a DaInfo object (prints a banner, returns '')."""
        msg = "==============================================================="
        print(msg)
        msg = "DA System Info rc-file is %s" % self.RcFileName
        print(msg)
        msg = "==============================================================="
        print(msg)
        return ""

    def LoadRc(self, RcFileName):
        """
        Load a DA System Info rc-file with settings for this simulation.

        Sets self.da_settings (dict), self.RcFileName, and self.DaRcLoaded.
        """
        self.da_settings = rc.read(RcFileName)
        self.RcFileName = RcFileName
        self.DaRcLoaded = True

        msg = 'DA System Info rc-file (%s) loaded successfully' % self.RcFileName
        logging.info(msg)
        return True

    def ValidateRC(self):
        """
        Validate the contents of the rc-file against the required keys.

        Raises IOError when a key from needed_rc_items is missing.
        """
        # Convert string booleans from the rc-file into real Python booleans
        for k, v in self.da_settings.items():
            if v == 'True':
                self.da_settings[k] = True
            if v == 'False':
                self.da_settings[k] = False

        for key in needed_rc_items:
            if key not in self.da_settings:
                msg = 'Missing a required value in rc-file : %s' % key
                logging.error(msg)
                raise IOError(msg)

        msg = 'DA System Info settings have been validated succesfully'
        logging.debug(msg)
        return None

################### End Class DaInfo ###################
else:
################### Begin Class MixingRatioSample ###################

class MixingRatioSample():
    """
    Holds the data that defines a mixing ratio sample in the data assimilation
    framework. So far, this includes all attributes listed below in the __init__
    method. One can additionally make more types of data, or make new objects
    for specific projects.
    """

    def __init__(self, xdate, code='XXX', obs=0.0, simulated=0.0, resid=0.0,
                 hphr=0.0, mdm=0.0, flag=0, height=0.0, lat=-999., lon=-999.,
                 evn='0000', sdev=0.0):
        # Identification of the sample
        self.code = code.strip()    # Site code (whitespace stripped)
        self.xdate = xdate          # Date of obs (datetime object)
        self.id = evn               # Event number
        # Observed and modeled values plus statistics
        self.obs = obs              # Value observed
        self.simulated = simulated  # Value simulated by model
        self.resid = resid          # Mixing ratio residual
        self.hphr = hphr            # Prior uncertainty from fluxes (HPH) and model-data mismatch (R)
        self.mdm = mdm              # Model data mismatch
        self.sdev = sdev            # Standard deviation of ensemble
        self.flag = flag            # Quality flag
        # Sample location
        self.height = height        # Sample height
        self.lat = lat              # Sample latitude
        self.lon = lon              # Sample longitude
        # Height reference: either above sea level or above ground, never both
        self.masl = True            # Sample height is in Meters Above Sea Level
        self.mag = not self.masl    # Sample height is in Meters Above Ground

    def __str__(self):
        """One-line summary: site, timestamp, observed value, flag, event id."""
        stamp = self.xdate.strftime('%Y-%m-%d %H:%M:%S')
        fields = [self.code, stamp, self.obs, self.flag, self.id]
        return ' '.join(str(f) for f in fields)

################### End Class MixingRatioSample ###################
################### Begin Class MixingRatioList ###################

class MixingRatioList(list):
    """
    A special type of list that holds MixingRatioSample objects. It has methods
    to extract data from such a list easily. All selection methods return a new
    MixingRatioList so that selections can be chained.
    """
    from numpy import array

    def getvalues(self, name, constructor=array):
        """Return attribute `name` of every sample, packed by `constructor` (numpy array by default)."""
        return constructor([getattr(o, name) for o in self])

    def unflagged(self):
        """Return a new MixingRatioList holding only samples with flag == 0."""
        # Bug fix: the selection methods previously wrapped the result in
        # MixingRatioSample(...) instead of MixingRatioList(...)
        return MixingRatioList([o for o in self if o.flag == 0])

    def flagged(self):
        """Return a new MixingRatioList holding only samples with a non-zero flag."""
        return MixingRatioList([o for o in self if o.flag != 0])

    def selectsite(self, site='mlo'):
        """Return a new MixingRatioList holding only samples from site code `site`."""
        return MixingRatioList([o for o in self if o.code == site])

    def selecthours(self, hours=range(1, 24)):
        """Return a new MixingRatioList holding only samples whose hour-of-day is in `hours`."""
        return MixingRatioList([o for o in self if o.xdate.hour in hours])

################### End Class MixingRatioList ###################
################### Begin Class CtObservations ###################

class CtObservations():
    """An object that holds data + methods and attributes needed to manipulate mixing ratio values."""

    def __init__(self, DaInfo):
        """
        Locate the observation input file for this cycle using the DA system settings.

        Raises IOError when the expected file does not exist.
        """
        sfname = DaInfo.da_settings['obs.input.fname'] + '.%s' % ('20050305') + '.nc'
        msg = 'This filename is hardcoded but needs replacement'
        logging.warning(msg)

        filename = os.path.join(DaInfo.da_settings['obs.input.dir'], sfname)

        if not os.path.exists(filename):
            msg = 'Could not find the required observation input file (%s) ' % filename
            logging.error(msg)
            raise IOError(msg)
        else:
            self.ObsFilename = filename

        # MrData holds a MixingRatioList once AddObs() has been called
        self.MrData = None

    def __str__(self):
        """Prints a list of MixingRatioSample objects"""
        return self.ObsFilename

    def AddObs(self):
        """Fill self.MrData with a MixingRatioList of MixingRatioSample objects for all obs in the file."""
        import pyhdf.SD as HDF
        import datetime as dtm

        ncf = HDF.SD(self.ObsFilename)
        idates = ncf.select('date_components').get()
        ids = ncf.select('id').get()
        sites = ncf.select('site').get()
        # Site names come in as arrays of characters; fuse and normalize them
        sites = [''.join(s).lower().strip() for s in sites]
        lats = ncf.select('lat').get()
        lons = ncf.select('lon').get()
        alts = ncf.select('alt').get()
        obs = ncf.select('obs').get()
        species = ncf.select('species').get()
        date = ncf.select('date').get()
        window = ncf.select('sampling_strategy').get()
        flags = ncf.select('NOAA_QC_flags').get()
        flags = [''.join(s).strip() for s in flags]
        dummy = ncf.end()
        msg = "Successfully read data from obs file (%s)" % self.ObsFilename
        logging.debug(msg)

        dates = [dtm.datetime(*d) for d in idates]

        if self.MrData is None:
            self.MrData = MixingRatioList([])

        for n in range(len(dates)):
            self.MrData.append(MixingRatioSample(dates[n], sites[n], obs[n], 0.0, 0.0, 0.0, 0.0,
                                                 flags[n], alts[n], lats[n], lons[n], ids[n], 0.0))

        msg = "Added %d observations to the MrData list" % len(dates)
        logging.info(msg)

    def AddSimulations(self, filename):
        """Adds model simulated values to the mixing ratio objects."""
        import pyhdf.SD as HDF

        ncf = HDF.SD(filename)
        ids = ncf.select('id').get()
        simulated = ncf.select('sampled_mole_frac').get() * 1e6  # convert mol/mol to ppm
        dummy = ncf.end()
        msg = "Successfully read data from model sample file (%s)" % filename
        logging.debug(msg)

        # Bug fix: getvalues() returns a numpy array by default, which has no
        # .index() method; ask for a plain list so lookups by value work
        obs_ids = self.MrData.getvalues('id', constructor=list)
        for n in range(len(ids)):
            try:
                index = obs_ids.index(ids[n])
            except ValueError:
                msg = 'A model sample was found that did not match any ID in the existing observation list. Skipping...'
                logging.warning(msg)
                continue
            self.MrData[index].simulated = simulated[n]

        msg = "Added %d simulated values to the MrData list" % len(ids)
        logging.info(msg)

################### End Class CtObservations ###################
################### Begin Class CtMember ###################

class CtMember():
    """
    An ensemble member for CarbonTracker. This consists of
       * parameter values
       * a CtObservations object with sampled mixing ratios
       * file names to write/read the data for this member
    """

    def __init__(self, membernumber):
        # Identity of this member within the ensemble
        self.membernumber = membernumber        # the member number
        self.randomstate = (None,)              # random generator state needed to reproduce the random sequence
        # Data carried by this member (filled in later by the state vector machinery)
        self.ModelSamples = None                # CtObservations object holding model samples for this member
        self.ParameterValues = None             # parameter values of this member
        # File bookkeeping
        self.ParameterOutputFile = None         # file to which the member info should be written
        self.SampleInputFile = None             # file from which the sample info can be read

    def __str__(self):
        """Zero-padded three-digit member number, e.g. '007'."""
        return f"{self.membernumber:03d}"

################### End Class CtMember ###################
################### Begin Class CtStateVector ###################

class CtStateVector():
    """An object that holds data + methods and attributes needed to manipulate state vector values."""

    def __init__(self, CycleInfo, DaInfo):
        self.CycleInfo = CycleInfo
        self.DaInfo = DaInfo
        self.nlag = int(CycleInfo.da_settings['time.nlag'])
        self.nmembers = int(CycleInfo.da_settings['forecast.nmembers'])
        self.nparams = int(DaInfo.da_settings['nparameters'])
        self.isOptimized = False

        # One (initially empty) list of ensemble members per lag time step.
        # Bug fix: the original assigned list values into a range object, which
        # fails on immutable ranges; build independent lists instead.
        self.EnsembleMembers = [[] for n in range(self.nlag)]

    def __str__(self):
        return "This is a CarbonTracker state vector object"

    def MakeNewEnsemble(self, nlag):
        """
        Make a new ensemble from the specified covariance matrices and append the
        members to self.EnsembleMembers[nlag]. `nlag` is the lag index (0-based).

        Raises IOError when a covariance file is missing.
        """
        import pyhdf.SD as hdf
        import numpy as np
        import matplotlib.pyplot as plt

        # Get the needed matrices from the specified covariance files
        file_ocn_cov = self.DaInfo.da_settings['ocn.covariance']
        file_bio_cov = self.DaInfo.da_settings['bio.covariance']

        for file in [file_ocn_cov, file_bio_cov]:
            if not os.path.exists(file):
                msg = "Cannot find the specified file %s" % file
                logging.error(msg)
                raise IOError(msg)
            else:
                msg = "Using covariance file: %s" % file
                logging.info(msg)

        f_ocn = hdf.SD(file_ocn_cov)
        f_bio = hdf.SD(file_bio_cov)

        cov_ocn = f_ocn.select('qprior').get()
        cov_bio = f_bio.select('qprior').get()

        dummy = f_ocn.end()
        dummy = f_bio.end()
        dummy = logging.debug("Succesfully closed files after retrieving prior covariance matrices")

        # Once we have the matrices, we can start to make the full covariance
        # matrix: bio block first, then the ocean block on the diagonal
        fullcov = np.zeros((self.nparams, self.nparams), float)
        nocn = cov_ocn.shape[0]
        nbio = cov_bio.shape[0]
        fullcov[0:nbio, 0:nbio] = cov_bio
        fullcov[nbio:nbio + nocn, nbio:nbio + nocn] = cov_ocn

        # Visually inspect full covariance matrix if verbose
        if self.CycleInfo.da_settings['verbose']:
            plt.imshow(fullcov)
            plt.colorbar()
            plt.savefig('fullcovariancematrix.png')
            plt.close('all')

        # Make a Cholesky decomposition of the covariance matrix; the SVD is
        # only used to estimate the degrees of freedom for the log message
        U, s, Vh = np.linalg.svd(fullcov)
        dof = np.sum(s) ** 2 / sum(s ** 2)
        C = np.linalg.cholesky(fullcov)

        msg = 'Cholesky decomposition has succeeded offline'
        logging.info(msg)
        msg = 'Appr. degrees of freedom in covariance matrix is %s' % (int(dof))
        logging.info(msg)

        if self.CycleInfo.da_settings['verbose']:
            plt.plot(s)
            plt.savefig('eigenvaluespectrum.png')
            plt.close('all')

        # Now create a set of random instances of the given covariance structure
        dummy = np.random.seed()

        # Member 0 is the ensemble mean; it does not hold data yet.
        # Bug fix: the mean was allocated with shape (nparams, nmembers) while
        # all other members carry a (nparams,) vector — use a consistent shape.
        NewMember = CtMember(0)
        NewMember.ParameterValues = np.ones(self.nparams, float)
        dummy = self.EnsembleMembers[nlag].append(NewMember)

        # Create members 1:nmembers and add to EnsembleMembers list; store the
        # generator state so each member's random draw can be reproduced
        for member in range(1, self.nmembers):
            randstate = np.random.get_state()
            rands = np.random.randn(self.nparams)

            NewMember = CtMember(member)
            NewMember.ParameterValues = np.dot(C, rands)
            NewMember.randomstate = randstate
            dummy = self.EnsembleMembers[nlag].append(NewMember)

        msg = '%d new ensemble members were added to the state vector' % (self.nmembers)
        logging.info(msg)

################### End Class CtStateVector ###################
################### Begin Class CtOptimizer ###################

class CtOptimizer():
    """
    This creates an instance of a CarbonTracker optimization object. It handles the minimum
    least squares optimization of the CT state vector given a set of CT sample objects. Two
    routines will be implemented: one where the optimization is sequential and one where it
    is the equivalent matrix solution. The choice can be made based on considerations of
    speed and efficiency.
    """

    def __init__(self, CycleInfo, DaInfo):
        # Problem dimensions come from the cycle and DA-system settings
        self.nlag = int(CycleInfo.da_settings['time.nlag'])
        self.nmembers = int(CycleInfo.da_settings['forecast.nmembers'])
        self.nparams = int(DaInfo.da_settings['nparameters'])

        self.CreateMatrices()

        return None

    def CreateMatrices(self):
        """Create matrix space needed in the optimization routine."""
        import numpy as np

        # Bug fix: np.array((shape,), 'float') builds a one-element array holding
        # the shape value itself; np.zeros allocates the intended matrix space.
        self.X_prior = np.zeros((self.nlag * self.nparams,), 'float')
        self.X_prior_prime = np.zeros((self.nmembers, self.nlag * self.nparams,), 'float')

    def FillMatrices(self, StateVector):
        # Not yet implemented: will fill the matrices from the StateVector contents
        return None

################### End Class CtOptimizer ###################
################# Stand alone methods that manipulate the objects above ##############

def PrepareObs(CycleInfo, type='forecast'):
    """ PrepareObs """
    import shutil

    # First, we make a CarbonTracker Observation object which holds all the
    # information related to the mixing ratios going in and coming out of the
    # model. This object also holds the observations themselves, and the
    # simulated value + statistics
    DaSystem = DaInfo(CycleInfo.da_settings['da.system.rc'])
    samples = CtObservations(DaSystem)
    dummy = samples.AddObs()

    # Next should follow a section where the observations that are read are
    # further processed, and written to a NetCDF file for the transport model
    # to use for x,y,z,t sampling.
    #
    # For carbontracker the observations already come in netcdf format thanks
    # to Andy and Ken. We will therefore simply copy the observations.nc file
    # to the pre-cooked obs files

    filename = samples.ObsFilename
    saveas = os.path.join(CycleInfo.da_settings['dir.input'], 'observations.nc')
    dummy = shutil.copy2(filename, saveas)
    msg = 'Successfully copied file (%s) to (%s) ' % (filename, saveas)
    logging.info(msg)

    return samples
def PrepareState(CycleInfo):
"""
Prepare the ensemble of parameters needed by the forecast model. There are two possible pathways:
(A) Construct a brand new ensemble from a covariance matrix
(B) Get an existing ensemble and propagate it