obs.py 21.9 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
#!/usr/bin/env python
# obs.py

"""
Author : peters 

Revision History:
File created on 28 Jul 2010.
Adapted by super004 on 18 May 2017.

"""
import os
import sys
import logging
        
import datetime as dtm
from string import strip
from numpy import array, logical_and
import numpy as np
sys.path.append(os.getcwd())
sys.path.append('../../')
from pylab import *

identifier = 'Rotterdam CO2 mole fractions'
version = '0.0'

from da.baseclasses.obs import Observations
import da.tools.io4 as io
import da.tools.rc as rc
################### Begin Class RdamObservations ###################

class RdamObservations(Observations):
    """ an object that holds data + methods and attributes needed to manipulate mole fraction values """

    def setup(self, dacycle):

        self.startdate = dacycle['time.sample.start']
        self.enddate = dacycle['time.sample.end']

        op_id = dacycle.dasystem['obs.input.id']
        op_dir = dacycle.dasystem['datadir']
        self.nrloc = dacycle.dasystem['obs.input.nr']

        if not os.path.exists(op_dir):
            msg = 'Could not find  the required ObsPack distribution (%s) ' % op_dir
            logging.error(msg)
            raise IOError, msg
        else:
            self.obspack_dir = op_dir
            self.obspack_id = op_id

        self.datalist = []
        self.tracer_list = []
        # self.cnt = []

    def add_observations(self, dacycle):
        """ Returns a MoleFractionList holding individual MoleFractionSample objects for all obs in a file
      
            The ObsPack mole fraction files are provided as time series per site with all dates in sequence. 
            We will loop over all site files in the ObsPackage, and subset each to our needs
            
        """

        infile = os.path.join(self.obspack_dir, self.obspack_id)
        f = open(infile, 'r')
        lines = f.readlines()
        f.close()

        ncfilelist = []
        for line in lines:
            if line.startswith('#'): continue # header

            dum = line.split(",")
            ncfile = [dum[1]]

            if not ncfile[0] in ncfilelist:
                ncfilelist.extend(ncfile)
        
        for ncfile in ncfilelist:
            infile = os.path.join(self.obspack_dir, ncfile + '.nc')
            ncf = io.ct_read(infile, 'read')
            idates = ncf.get_variable('Times')
            dates = array([dtm.datetime(int(''.join(d[:4])),int(''.join(d[5:7])),int(''.join(d[8:10])),int(''.join(d[11:13])),0) for d in idates])
            
            subselect = logical_and(dates >= self.startdate , dates <= self.enddate).nonzero()[0]

            dates = dates.take(subselect, axis=0)
            
            datasetname = ncfile  # use full name of dataset to propagate for clarity
            logging.info(datasetname)
            lats = ncf.get_variable('lat').take(subselect, axis=0)
            lons = ncf.get_variable('lon').take(subselect, axis=0)
            alts = ncf.get_variable('alt').take(subselect, axis=0)
            obs = ncf.get_variable('obs').take(subselect, axis=0)
            ids = ncf.get_variable('ids').take(subselect, axis=0)
            spcs = ncf.get_variable('species').take(subselect, axis=0)
            ncf.close()
            for n in range(len(dates)):
                spc = spcs[n,0]+spcs[n,1]+spcs[n,2]
                self.datalist.append(MoleFractionSample(ids[n], dates[n], datasetname, obs[n], 0.0, 0.0, 0.0, 0.0, 0, alts[n], lats[n], lons[n], '001', spc, 1, 0.0, infile))
            
            logging.debug("Added %d observations from file (%s) to the Data list" % (len(dates), datasetname)) 

        logging.info("Observations list now holds %d values" % len(self.datalist))

    def add_simulations(self, filename, silent=False):
        """ Adds model simulated values to the mole fraction objects """

        if not os.path.exists(filename):
            msg = "Sample output filename for observations could not be found : %s" % filename 
            logging.error(msg)
            logging.error("Did the sampling step succeed?")
            logging.error("...exiting")
            raise IOError, msg

        ncf = io.ct_read(filename, method='read')
        ids = ncf.get_variable('obs_num')
        simulated = ncf.get_variable('model')
        ncf.close()
        logging.info("Successfully read data from model sample file (%s)" % filename)

        obs_ids = self.getvalues('id').tolist()
        ids = map(int, ids)

        missing_samples = []

        for idx, val in zip(ids, simulated): 
            if idx in obs_ids:
                index = obs_ids.index(idx)
                self.datalist[index].simulated = val  # in mol/mol
            else:     
                missing_samples.append(idx)

        if not silent and missing_samples != []:
            logging.warning('Model samples were found that did not match any ID in the observation list. Skipping them...')

        logging.debug("Added %d simulated values to the Data list" % (len(ids) - len(missing_samples)))
    

    def write_sample_coords(self, obsinputfile):
        """ 
            Write the information needed by the observation operator to a file. Return the filename that was written for later use

        """

        if len(self.datalist) == 0:
            logging.debug("No observations found for this time period, nothing written to obs file")
        else:
            f = io.CT_CDF(obsinputfile, method='create')
            logging.debug('Creating new observations file for ObservationOperator (%s)' % obsinputfile)

            dimid = f.add_dim('obs', len(self.datalist))
            dim200char = f.add_dim('string_of200chars', 200)
            dim10char = f.add_dim('string_of10chars', 10)
            dimcalcomp = f.add_dim('calendar_components', 6)

            data = self.getvalues('id')

            savedict = io.std_savedict.copy() 
            savedict['name'] = "obs_num"
            savedict['dtype'] = "int"
            savedict['long_name'] = "Unique_Dataset_observation_index_number"
            savedict['units'] = ""
            savedict['dims'] = dimid
            savedict['values'] = data.tolist()
            savedict['comment'] = "Unique index number within this dataset ranging from 0 to UNLIMITED."
            f.add_data(savedict)

            data = [[d.year, d.month, d.day, d.hour, d.minute, d.second] for d in self.getvalues('xdate') ]

            savedict = io.std_savedict.copy() 
            savedict['dtype'] = "int"
            savedict['name'] = "date_components"
            savedict['units'] = "integer components of UTC date/time"
            savedict['dims'] = dimid + dimcalcomp
            savedict['values'] = data
            savedict['missing_value'] = -9
            savedict['comment'] = "Calendar date components as integers. Times and dates are UTC." 
            savedict['order'] = "year, month, day, hour, minute, second"
            f.add_data(savedict)

            data = self.getvalues('lat')

            savedict = io.std_savedict.copy() 
            savedict['name'] = "latitude"
            savedict['units'] = "degrees_north"
            savedict['dims'] = dimid
            savedict['values'] = data.tolist()
            savedict['missing_value'] = -999.9
            f.add_data(savedict)

            data = self.getvalues('lon')

            savedict = io.std_savedict.copy() 
            savedict['name'] = "longitude"
            savedict['units'] = "degrees_east"
            savedict['dims'] = dimid
            savedict['values'] = data.tolist()
            savedict['missing_value'] = -999.9
            f.add_data(savedict)

            data = self.getvalues('height')

            savedict = io.std_savedict.copy() 
            savedict['name'] = "altitude"
            savedict['units'] = "meters_above_sea_level"
            savedict['dims'] = dimid
            savedict['values'] = data.tolist()
            savedict['missing_value'] = -999.9
            f.add_data(savedict)

            data = np.array(self.tracer_list)

            savedict = io.std_savedict.copy() 
            savedict['dtype'] = "int"
            savedict['name'] = "source_type"
            savedict['units'] = "NA"
            savedict['dims'] = dimid
            savedict['values'] = data.tolist()
            savedict['missing_value'] = -9
            f.add_data(savedict)

            data = self.getvalues('evn')

            savedict = io.std_savedict.copy() 
            savedict['dtype'] = "char"
            savedict['name'] = "obs_id"
            savedict['units'] = "ObsPack datapoint identifier"
            savedict['dims'] = dimid + dim200char
            savedict['values'] = data
            savedict['missing_value'] = '!'
            f.add_data(savedict)

	    data = self.getvalues('obs')

	    savedict = io.std_savedict.copy()
	    savedict['name'] = "observed"
	    savedict['long_name'] = "observedvalues"
	    savedict['units'] = "mol mol-1"
	    savedict['dims'] = dimid
	    savedict['values'] = data.tolist()
	    savedict['comment'] = 'Observations used in optimization'
	    f.add_data(savedict)
    
	    data = self.getvalues('mdm')
    
	    savedict = io.std_savedict.copy()
	    savedict['name'] = "modeldatamismatch"
	    savedict['long_name'] = "modeldatamismatch"
	    savedict['units'] = "[mol mol-1]"
	    savedict['dims'] = dimid
	    savedict['values'] = data.tolist()
	    savedict['comment'] = 'Standard deviation of mole fractions resulting from model-data mismatch'
	    f.add_data(savedict)
            f.close()

            logging.debug("Successfully wrote data to obs file")
            logging.info("Sample input file for obs operator now in place [%s]" % obsinputfile)        

    def add_model_data_mismatch(self, filename):
        """ 
            Get the model-data mismatch values for this cycle.

                (1) Open a sites_weights file
                (2) Parse the data
                (3) Compare site list against data
                (4) Take care of double sites, etc

        """    

        if not os.path.exists(filename):
            msg = 'Could not find  the required sites.rc input file (%s) ' % filename
            logging.error(msg)
            raise IOError, msg
        else:
            self.sites_file = filename

        sites_weights = rc.read(self.sites_file)

        self.rejection_threshold = int(sites_weights['obs.rejection.threshold'])
        self.global_R_scaling = float(sites_weights['global.R.scaling'])
        self.n_site_categories = int(sites_weights['n.site.categories'])

        logging.debug('Model-data mismatch rejection threshold: %d ' % self.rejection_threshold)
        logging.warning('Model-data mismatch scaling factor     : %f ' % self.global_R_scaling)
        logging.debug('Model-data mismatch site categories    : %d ' % self.n_site_categories)
   
        cats = [k for k in sites_weights.keys() if 'site.category' in k] 

        site_categories = {}
        for key in cats:
            name, error, may_localize, may_reject = sites_weights[key].split(';')
            name = name.strip().lower()
            error = float(error)
            may_reject = ("TRUE" in may_reject.upper())
            may_localize = ("TRUE" in may_localize.upper())
            site_categories[name] = {'category': name, 'error': error, 'may_localize': may_localize, 'may_reject': may_reject}

        site_info = {}
        site_move = {}
        site_hourly = {}   # option added to include only certain hours of the day (for e.g. PAL) IvdL
        site_incalt = {} # option to increase sampling altitude for sites specified in sites and weights file 
        for key, value in sites_weights.iteritems():
            if 'obsfile' in key:  # to be fixed later, do not yet know how to parse valid keys from rc-files yet.... WP
                sitename, sitecategory = key, value
                sitename = sitename.strip()
                sitecategory = sitecategory.split()[0].strip().lower()
                site_info[sitename] = site_categories[sitecategory]
            if 'site.move' in key:
                identifier, latmove, lonmove = value.split(';')
                site_move[identifier.strip()] = (float(latmove), float(lonmove))
            if 'site.hourly' in key:
                identifier, hourfrom, hourto = value.split(';')
                site_hourly[identifier.strip()] = (int(hourfrom), int(hourto))
            if 'site.incalt' in key:
                identifier, incalt = value.split(';')
                site_incalt[identifier.strip()] = (int(incalt))

        for obs in self.datalist:  # loop over all available data points

            obs.mdm = 1000.0  # default is very high model-data-mismatch, until explicitly set by script
            obs.flag = 99  # default is do-not-use , until explicitly set by script
            exclude_hourly = False # default is that hourly values are not included

            identifier = obs.code
            # species, site, method, lab, datasetnr = identifier.split('_')

            if site_info.has_key(identifier):
                if site_hourly.has_key(identifier):
                    obs.samplingstrategy = 2
                    hourf, hourt = site_hourly[identifier]
                    if int(obs.xdate.hour) >= hourf and int(obs.xdate.hour) <= hourt:
                        logging.warning("Observations in hourly dataset INCLUDED, while sampling time %s was between %s:00-%s:00"%(obs.xdate.time(),hourf,hourt))
                        obs.flag = 0
                    else:
                        logging.warning("Observation in hourly dataset EXCLUDED, while sampling time %s was outside %s:00-%s:00"%(obs.xdate.time(),hourf,hourt))
                        exclude_hourly = True
                if site_info[identifier]['category'] == 'do-not-use' or exclude_hourly:
                    logging.warning("Observation found (%s, %d), but not used in assimilation !!!" % (identifier, obs.id))
                    obs.mdm = site_info[identifier]['error'] * self.global_R_scaling
                    obs.may_localize = site_info[identifier]['may_localize']
                    obs.may_reject = site_info[identifier]['may_reject']
                    obs.flag = 99
                else:
                    logging.debug("Observation found (%s, %d)" % (identifier, obs.id))
                    obs.mdm = site_info[identifier]['error'] * self.global_R_scaling
                    obs.may_localize = site_info[identifier]['may_localize']
                    obs.may_reject = site_info[identifier]['may_reject']
                    obs.flag = 0

            else:
                logging.warning("Observation NOT found (%s, %d), please check sites.rc file (%s)  !!!" % (identifier, obs.id, self.sites_file))

            if site_move.has_key(identifier):

                movelat, movelon = site_move[identifier]
                obs.lat = obs.lat + movelat
                obs.lon = obs.lon + movelon

                logging.warning("Observation location for (%s, %d), is moved by %3.2f degrees latitude and %3.2f degrees longitude" % (identifier, obs.id, movelat, movelon))

            if site_incalt.has_key(identifier):

                incalt = site_incalt[identifier]
                obs.height = obs.height + incalt

                logging.warning("Observation location for (%s, %d), is moved by %3.2f meters in altitude" % (identifier, obs.id, incalt))


        # Add site_info dictionary to the Observations object for future use

        self.site_info = site_info
        self.site_move = site_move
        self.site_hourly = site_hourly
        self.site_incalt = site_incalt

        logging.debug("Added Model Data Mismatch to all samples ")

    def write_sample_auxiliary(self, auxoutputfile, filename):
        """ 
            Write selected information contained in the Observations object to a file. 

        """
        
        if not os.path.exists(filename):
            msg = "Sample output filename for observations could not be found : %s" % filename 
            logging.error(msg)
            logging.error("Did the sampling step succeed?")
            logging.error("...exiting")
            raise IOError, msg

        ncf = io.ct_read(filename, method='read')
        ids = ncf.get_variable('obs_num')
        simulatedensemble = ncf.get_variable('model')
        ncf.close()
        logging.info("Successfully read data from model sample file (%s)" % filename)

        f = io.CT_CDF(auxoutputfile, method='create')
        logging.debug('Creating new auxiliary sample output file for postprocessing (%s)' % auxoutputfile)

        dimid = f.add_dim('obs', len(self.datalist))
        dim200char = f.add_dim('string_of200chars', 200)
        dim10char = f.add_dim('string_of10chars', 10)
        dimcalcomp = f.add_dim('calendar_components', 6)

        if len(self.datalist) == 0:
            f.close()
            #return outfile

        for key, value in self.site_move.iteritems():
            msg = "Site is moved by %3.2f degrees latitude and %3.2f degrees longitude" % value 
            f.add_attribute(key, msg)

        data = self.getvalues('id')

        savedict = io.std_savedict.copy() 
        savedict['name'] = "obs_num"
        savedict['dtype'] = "int"
        savedict['long_name'] = "Unique_Dataset_observation_index_number"
        savedict['units'] = ""
        savedict['dims'] = dimid
        savedict['values'] = data.tolist()
        savedict['comment'] = "Unique index number within this dataset ranging from 0 to UNLIMITED."
        f.add_data(savedict)

        data = [[d.year, d.month, d.day, d.hour, d.minute, d.second] for d in self.getvalues('xdate')]

        savedict = io.std_savedict.copy() 
        savedict['dtype'] = "int"
        savedict['name'] = "date_components"
        savedict['units'] = "integer components of UTC date/time"
        savedict['dims'] = dimid + dimcalcomp
        savedict['values'] = data
        savedict['missing_value'] = -9
        savedict['comment'] = "Calendar date components as integers. Times and dates are UTC." 
        savedict['order'] = "year, month, day, hour, minute, second"
        f.add_data(savedict)

        data = self.getvalues('obs')

        savedict = io.std_savedict.copy()
        savedict['name'] = "observed"
        savedict['long_name'] = "observedvalues"
        savedict['units'] = "mol mol-1"
        savedict['dims'] = dimid
        savedict['values'] = data.tolist()
        savedict['comment'] = 'Observations used in optimization'
        f.add_data(savedict)

        data = self.getvalues('mdm')

        savedict = io.std_savedict.copy()
        savedict['name'] = "modeldatamismatch"
        savedict['long_name'] = "modeldatamismatch"
        savedict['units'] = "[mol mol-1]"
        savedict['dims'] = dimid
        savedict['values'] = data.tolist()
        savedict['comment'] = 'Standard deviation of mole fractions resulting from model-data mismatch'
        f.add_data(savedict)

        data = simulatedensemble

        dimmembers = f.add_dim('members', data.shape[1])

        savedict = io.std_savedict.copy()
        savedict['name'] = "modelsamples"
        savedict['long_name'] = "modelsamples for all ensemble members"
        savedict['units'] = "mol mol-1"
        savedict['dims'] = dimid + dimmembers
        savedict['values'] = data.tolist()
        savedict['comment'] = 'simulated mole fractions based on optimized state vector'
        f.add_data(savedict)

        data = self.getvalues('fromfile') 

        savedict = io.std_savedict.copy()
        savedict['name'] = "inputfilename"
        savedict['long_name'] = "name of file where original obs data was taken from"
        savedict['dtype'] = "char"
        savedict['dims'] = dimid + dim200char
        savedict['values'] = data
        savedict['missing_value'] = '!'
        f.add_data(savedict)

        f.close()

        logging.debug("Successfully wrote data to auxiliary sample output file (%s)" % auxoutputfile)

        #return outfile



################### End Class CtObservations ###################



################### Begin Class MoleFractionSample ###################

class MoleFractionSample(object):
    """ 
        Holds the data that defines a mole fraction Sample in the data assimilation framework. Sor far, this includes all
        attributes listed below in the __init__ method. One can additionally make more types of data, or make new
        objects for specific projects.

    """

    def __init__(self, idx, xdate, code='XXX', obs=0.0, simulated=0.0, resid=0.0, hphr=0.0, mdm=0.0, flag=0, height=0.0, lat= -999., lon= -999., evn='0000', species='co2', samplingstrategy=1, sdev=0.0, fromfile='none.nc'):
        self.code = code.strip()      # dataset identifier, i.e., co2_lef_tower_insitu_1_99
        self.xdate = xdate             # Date of obs
        self.obs = obs               # Value observed
        self.simulated = simulated         # Value simulated by model
        self.resid = resid             # Mole fraction residuals
        self.hphr = hphr              # Mole fraction prior uncertainty from fluxes and (HPH) and model data mismatch (R)
        self.mdm = mdm               # Model data mismatch
        self.may_localize = True           # Whether sample may be localized in optimizer
        self.may_reject = True              # Whether sample may be rejected if outside threshold
        self.flag = flag              # Flag
        self.height = height            # Sample height in masl
        self.lat = lat               # Sample lat
        self.lon = lon               # Sample lon
        self.id = idx               # Obspack ID within distrution (integer), e.g., 82536
        self.evn = evn               # Obspack Number within distrution (string), e.g., obspack_co2_1_PROTOTYPE_v0.9.2_2012-07-26_99_82536
        self.sdev = sdev              # standard deviation of ensemble
        self.masl = True              # Sample is in Meters Above Sea Level
        self.mag = not self.masl     # Sample is in Meters Above Ground
        self.species = species.strip()
        self.samplingstrategy = samplingstrategy
        self.fromfile = fromfile   # netcdf filename inside ObsPack distribution, to write back later

################### End Class MoleFractionSample ###################


if __name__ == "__main__":
    pass