summarize_obs.py

#!/usr/bin/env python
import sys
sys.path.append('../../')
import os
import numpy as np
import string
import datetime as dt
import logging
import re
import da.tools.io4 as io

fontsize = 10

def nice_lat(cls):
    #
    # Convert latitude from decimal to cardinal
    #
    if cls > 0:
        h = 'N'
    else:
        h = 'S'

    dec, deg = np.math.modf(cls)

    return string.strip('%2d %2d\'%s' % (abs(deg), round(abs(60 * dec), 0), h))

def nice_lon(cls):
    #
    # Convert longitude from decimal to cardinal
    #
    if cls > 0:
        h = 'E'
    else:
        h = 'W'

    dec, deg = np.math.modf(cls)

    return string.strip('%3d %2d\'%s' % (abs(deg), round(abs(60 * dec), 0), h))

def nice_alt(cls):
    #
    # Reformat elevation or altitude
    #
    return string.strip('%10.1f masl' % round(cls, -1))


def summarize_obs(dacycle, printfmt='html'):
    """***************************************************************************************
    Call example:

    python summarize_obs.py

    Option printfmt    : [tex,scr,html] print summary table in latex, terminal, or html format

    Other options are all those needed to create a dacycle object

    OR:

    call directly from a python script as:

    q=summarize_obs(dacycle,printfmt='html')

    ***************************************************************************************"""

    sumdir = os.path.join(dacycle['dir.analysis'], 'summary')
    if not os.path.exists(sumdir):
        logging.info("Creating new directory " + sumdir)
        os.makedirs(sumdir)

    mrdir = os.path.join(dacycle['dir.analysis'], 'data_molefractions')
    if not os.path.exists(mrdir):
        logging.error("Input directory does not exist (%s), exiting... " % mrdir)
        return None

    mrfiles = os.listdir(mrdir)
    infiles = [os.path.join(mrdir, f) for f in mrfiles if f.endswith('.nc')]

    if printfmt == 'tex':
        print '\\begin{tabular*}{\\textheight}{l l l l r r r r}'
        print 'Code &  Name & Lat, Lon, Elev & Lab &  N (flagged) & $\\sqrt{R}$  &Inn \\XS &Bias\\\\'
        print '\hline\\\\ \n\multicolumn{8}{ c }{Semi-Continuous Surface Samples}\\\\[3pt] '
        fmt = '%8s  & ' + ' %55s  & ' + '%20s &' + '%6s &' + ' %4d (%d)  & ' + ' %5.2f  & ' + ' %5.2f & ' + '%+5.2f  \\\\'
    elif printfmt == 'html':
        tablehead = \
              "<TR>\n <TH> Site code </TH> \
                   <TH> Sampling Type </TH> \
                   <TH> Lab. </TH> \
                   <TH> Country </TH> \
                   <TH> Lat, Lon, Elev. (m ASL) </TH> \
                   <TH> No. Obs. Avail. </TH>  \
                   <TH> &#8730;R (&mu;mol mol<sup>-1</sup>) </TH>  \
                   <TH> &#8730;HPH  (&mu;mol mol<sup>-1</sup>) </TH> \
                   <TH> H(x)-y (&mu;mol mol<sup>-1</sup>) </TH> \
                   <TH> H(x)-y (JJAS) (&mu;mol mol<sup>-1</sup>) </TH>  \
                   <TH> H(x)-y (NDJFMA) (&mu;mol mol<sup>-1</sup>) </TH> \n \
               </TR>\n"

        fmt = """<TR> \n \
                <TD><a href='javascript:LoadCO2Tseries("%s")'>%s </a></TD>\
                <TD>%s</TD>\
                <TD>%s</TD>\
                <TD>%40s</TD>\
                <TD>%s</TD>\
                <TD>%d</TD>\
                <TD>%+5.2f</TD>\
                <TD>%+5.2f</TD>\
                <TD>%+5.2f&plusmn;%5.2f</TD>\
                <TD>%+5.2f&plusmn;%5.2f</TD>\
                <TD>%+5.2f&plusmn;%5.2f</TD>\n \
               </TR>\n"""
    elif printfmt == 'scr':
        print 'Code   Site     NObs flagged  R  Inn X2'
        fmt = '%8s ' + ' %55s  %s %s' + ' %4d ' + ' %4d ' + ' %5.2f ' + ' %5.2f'

    table = []

    for infile in infiles:

            #logging.debug( infile )
            f = io.CT_CDF(infile, 'read')
            date = f.get_variable('time')
            obs = f.get_variable('value') * 1e6
            mdm = f.get_variable('modeldatamismatch') * 1e6
            simulated = f.get_variable('modelsamplesmean_forecast') * 1e6
            simulated_std = f.get_variable('modelsamplesstandarddeviation_forecast') * 1e6

            pydates = [dt.datetime(1970, 1, 1) + dt.timedelta(seconds=int(d)) for d in date]

            summer = [i for i, d in enumerate(pydates) if d.month in [6, 7, 8, 9] ]
            winter = [i for i, d in enumerate(pydates) if d.month in [11, 12, 1, 2, 3, 4] ]

            diff = ((simulated - obs).mean())
            diffsummer = ((simulated - obs).take(summer).mean())
            diffwinter = ((simulated - obs).take(winter).mean())
            diffstd = ((simulated - obs).std())
            diffsummerstd = ((simulated - obs).take(summer).std())
            diffwinterstd = ((simulated - obs).take(winter).std())

            location = nice_lat(f.site_latitude) + ', ' + nice_lon(f.site_longitude) + ', ' + nice_alt(f.site_elevation)

            if printfmt == 'html':
                ss = (f.site_code.upper(),
                    f.site_code.upper(),
                    f.dataset_project,
                    f.lab_abbr,
                    f.site_country,
                    location,
                    len(np.ma.compressed(mdm)),
                    mdm.mean(),
                    np.sqrt((simulated_std ** 2).mean()),
                    diff, diffstd,
                    diffsummer, diffsummerstd,
                    diffwinter, diffwinterstd)

            table.append(ss)
            f.close()

    if printfmt == 'tex':
        saveas = os.path.join(sumdir, 'site_table.tex')
        f = open(saveas, 'w')
    elif printfmt == 'html':
        saveas = os.path.join(sumdir, 'site_table.html')
        f = open(saveas, 'w')
        txt = "<meta http-equiv='content-type' content='text/html;charset=utf-8' />\n"
        f.write(txt)
        txt = "<table border=1 cellpadding=2 cellspacing=2 width='100%' bgcolor='#EEEEEE'>\n"
        f.write(txt)

    f.write(tablehead)

    for i, ss in enumerate(table):

        f.write(fmt % ss)
        if (i + 1) % 15 == 0:
            f.write(tablehead)

    if printfmt == 'tex':
        f.write('\cline{2-8}\\\\')
        f.write('\hline \\\\')
        f.write('\end{tabular*}')
    else:
        txt = "\n</table>"
        f.write(txt)
    f.close()

    logging.info("File written with summary: %s" % saveas)

def summarize_stats(dacycle):
    """
    Summarize the statistics of the observations for this cycle
    This includes X2 statistics, RMSD, and others for both forecast and
    final fluxes
    """


    sumdir = os.path.join(dacycle['dir.analysis'], 'summary')
    if not os.path.exists(sumdir):
        logging.info("Creating new directory " + sumdir)
        os.makedirs(sumdir)

    # get forecast data from optimizer.ddddd.nc

    startdate = dacycle['time.start']
    dacycle['time.sample.stamp'] = "%s" % (startdate.strftime("%Y%m%d"),)
    infile = os.path.join(dacycle['dir.output'], 'optimizer.%s.nc' % dacycle['time.sample.stamp'])

    if not os.path.exists(infile):
        logging.error("File not found: %s" % infile)
        raise IOError

    f = io.CT_CDF(infile, 'read')
    sites = f.get_variable('sitecode')
    y0 = f.get_variable('observed') * 1e6
    hx = f.get_variable('modelsamplesmean_prior') * 1e6
    dF = f.get_variable('modelsamplesdeviations_prior') * 1e6
    HPHTR = f.get_variable('totalmolefractionvariance').diagonal() * 1e6 * 1e6
    R = f.get_variable('modeldatamismatchvariance').diagonal() * 1e6 * 1e6
    flags = f.get_variable('flag')
    f.close()

    HPHT = dF.dot(np.transpose(dF)).diagonal() / (dF.shape[1] - 1.0)
    rejected = (flags == 2.0)

    sitecodes = [string.join(s.compressed(), '').strip() for s in sites]


    # calculate X2 per observation for this time step

    x2 = []
    for i, site in enumerate(sitecodes):

        x2.append((y0[i] - hx[i]) ** 2 / HPHTR[i])

    x2 = np.ma.masked_where(HPHTR == 0.0, x2)

    # calculate X2 per site
    saveas = os.path.join(sumdir, 'x2_table_%s.html' % dacycle['time.sample.stamp'])
    logging.info("Writing HTML tables for this cycle (%s)" % saveas)
    f = open(saveas, 'w')
    txt = "<meta http-equiv='content-type' content='text/html;charset=utf-8' />\n"
    f.write(txt)
    txt = "<table border=1 cellpadding=2 cellspacing=2 width='100%' bgcolor='#EEEEEE'>\n"
    f.write(txt)
    tablehead = \
          "<TR>\n <TH> Site code </TH> \
               <TH> N<sub>obs</sub> </TH>  \
               <TH> N<sub>rejected</sub> </TH>  \
               <TH> &#8730;R (&mu;mol mol<sup>-1</sup>) </TH>  \
               <TH> &#8730;HPH<sup>T</sup> (&mu;mol mol<sup>-1</sup>) </TH>  \
               <TH> H(x)-y (&mu;mol mol<sup>-1</sup>) </TH> \n \
               <TH> X2 </TH> \n \
           </TR>\n"

    fmt = """<TR> \n \
            <TD>%s</TD>\
            <TD>%d</TD>\
            <TD>%d</TD>\
            <TD>%+5.2f</TD>\
            <TD>%+5.2f</TD>\
            <TD>%+5.2f&plusmn;%5.2f</TD>\
            <TD>%5.2f</TD>\n \
           </TR>\n"""

    f.write(tablehead)

    set_sites = set(sitecodes)
    set_sites = np.sort(list(set_sites))

    for i, site in enumerate(set_sites):
        sel = [i for i, s in enumerate(sitecodes) if s == site]
        ss = (site, len(sel), rejected.take(sel).sum(), np.sqrt(R.take(sel)[0]), np.sqrt(HPHT.take(sel).mean()), (hx - y0).take(sel).mean(), (hx - y0).take(sel).std(), x2.take(sel).mean(),)
        #print site,sel,x2.take(sel)

        f.write(fmt % ss)
        if (i + 1) % 15 == 0:
            f.write(tablehead)

    txt = "\n</table>"
    f.write(txt)
    f.close()

    # Now summarize for each site across time steps

    if not dacycle['time.start'] >= dt.datetime(2008, 12, 29):
        return

    logging.info("Writing HTML tables for each site")
    for site in set_sites:
        saveas = os.path.join(sumdir, '%s_x2.html' % site)
        f = open(saveas, 'w')
        logging.debug(saveas)
        txt = "<meta http-equiv='content-type' content='text/html;charset=utf-8' />\n"
        f.write(txt)
        txt = "<table border=1 cellpadding=2 cellspacing=2 width='100%' bgcolor='#EEEEEE'>\n"
        f.write(txt)
        tablehead = \
              "<TR>\n <TH> From File </TH> \
                   <TH> Site </TH>  \
                   <TH> N<sub>obs</sub> </TH>  \
                   <TH> N<sub>rejected</sub> </TH>  \
                   <TH> &#8730;R (&mu;mol mol<sup>-1</sup>) </TH>  \
                   <TH> &#8730;HPH<sup>T</sup> (&mu;mol mol<sup>-1</sup>) </TH>  \
                   <TH> H(x)-y (&mu;mol mol<sup>-1</sup>) </TH> \n \
                   <TH> X2 </TH> \n \
               </TR>\n"
        f.write(tablehead)

        files = os.listdir(sumdir)
        x2_files = [fil for fil in files if fil.startswith('x2')]
        for htmlfile in x2_files:
            lines = grep(site, os.path.join(sumdir, htmlfile))
            for line in lines:
                f.write('<TR>\n')
                f.write('<TD>' + htmlfile + '</TD>')
                f.write(line + '\n')
                f.write('</TR>\n')

        txt = "\n</table>"
        f.write(txt)
        f.close()


def grep(pattern, fil):
    fileObj = open(fil, 'r')
    r = []
    for line in fileObj:
        if re.search(pattern, line):
            r.append(line)
    return r

# main body if called as script

if __name__ == '__main__':    # started as script
    from da.tools.initexit import CycleControl
    from da.carbondioxide.dasystem import CO2DaSystem

    sys.path.append('../../')

    logging.root.setLevel(logging.DEBUG)

    dacycle = CycleControl(args={'rc':'../../ctdas-od-gfed2-glb6x4-obspack-full.rc'})
    dacycle.initialize()
    dacycle.parse_times()

    dasystem = CO2DaSystem('../rc/carbontracker_ct09_opf.rc')
    dasystem.initialize()

    dacycle.dasystem = dasystem

    q = summarize_obs(dacycle)

    while dacycle['time.start'] < dacycle['time.finish']:
        q = summarize_stats(dacycle)
        dacycle.advance_cycle_times()

    sys.exit(0)