Again improved analysis

bca41cf6 · Roelofsen, Hans · 22325d0a · bca41cf6 · bca41cf6
Commit bca41cf6 authored 5 years ago by Roelofsen, Hans
--- a/b_analysis/clo_1543_analysis.py
+++ b/b_analysis/clo_1543_analysis.py
@@ -33,9 +33,13 @@ sub_soortlijst = ['SNL']  # SNL, Bijlage 1 of EcoSysLijst?
 snl_gebieden_wel = ['N1900']  # 250m cellen met welk SNL type?
 snl_gebieden_niet = []  # 250m cellen met welk SNL type?

-# dif_cats = [range(-1000, -1), range(-1, 2), range(2, 1000)]
+# Definieer de numerieke range voor de categorieën Afname, Stabiel, Toename:
+# [a, b, c, d]
+# a <= Afname < b
+# b <= Stabiel < c
+# c <= Toename < d
 dif_cats = [-1000, -1, 2, 1000]  # see https://docs.scipy.org/doc/numpy/reference/generated/numpy.histogram.html
-dif_labs = ['afname', 'stabiel', 'toename']
+dif_labs = ['afname', 'stabiel', 'toename']  # must be in INCREASING order

 # choose one of ['1994-2001', '2002-2009', '2010-2017'] for periode A and periode B, where stats will be made A-B
 periodes = ['1994-2001', '2002-2009', '2010-2017']
@@ -72,44 +76,31 @@ vogel_trend, vogel_score = clo.calc_clo_1543(s_periode_A=dat_piv.loc[:, idx['vog
 vlinder_trend, vlinder_score = clo.calc_clo_1543(s_periode_A=dat_piv.loc[:, idx['vlinder', periode_A]],
                                                 s_periode_B=dat_piv.loc[:, idx['vlinder', periode_B]],
                                                 bins=dif_cats, labels=dif_labs)
-'''
-Create dataframe where all trends and scores are in a single column, plus `soortgroep` column etc
-'''
-# all trends in a series
-trend_s = pd.concat([plant_trend, vogel_trend, vlinder_trend], axis=0, ignore_index=True)
-trend_s.name = 'clo1543_trend'

-# all scores in a series
-score_s = pd.concat([plant_score, vogel_score, vlinder_score], axis=0, ignore_index=True)
-score_s.name = 'clo1543_score'
+'''Create new df containing scores and trends for all soortgroepen. Then attach to hokken.'''
+foo_df = pd.concat([plant_trend, plant_score, vogel_trend, vogel_score, vlinder_trend, vlinder_score],
+                   axis=1)
+clo_df = pd.merge(left=hokken, right=foo_df, how='left', right_index=True, left_index=True)

-# series with species-group names
-species_s = pd.Series(['vaatplant']*dat_piv.shape[0] + ['vogel']*dat_piv.shape[0] + ['vlinder']*dat_piv.shape[0],
-                      name='soortgroep')
+'''Niet alle hokken zullen een PGO observatie hebben. Vul score kolommen in met `onbekend`.'''
+clo_df.filter(regex='*score*').fillna(value='onbekend', axis=1)

-# series hok_ids (note repetition)
-hok_id_s = pd.concat([plant_score.index.to_series(), vogel_score.index.to_series(), vlinder_score.index.to_series()],
-                     axis=0, ignore_index=True)
-hok_id_s.name = 'hok_id'
+'''
+Bereken totaal areaal per soortgroep voor CLO scores Toename, Stabiel, Afname, Onbekend.
+Bereken ook netto toename (Toename-Afname) en score als netto/totaal areaal
+'''
+plant_out = pd.pivot_table(clo_df, index='vaatplant_score', values='areaal_ha', aggfunc='sum')
+plant_out.loc[:, 'netto'] = plant_out.apply(lambda row: row.toename - row.afname, axis=1)
+plant_out.loc[:, 'score'] = plant_out.apply(lambda row: row.netto / hokken.areaal_ha.sum() * 100, axis=1)

-# concat the series into a dataframe and merge with hokken. Retain only hokken (how=right)
-left_df = pd.concat([trend_s, score_s, species_s, hok_id_s], axis=1)
-clo_dat = pd.merge(left=left_df, right=hokken.loc[:, ['hok_id', 'areaal_ha', 'twente']],  # hokken only!
-                   left_on='hok_id', right_on='hok_id', how='right')
+vogel_out = pd.pivot_table(clo_df, index='vogel_score', values='areaal_ha', aggfunc='sum')
+vogel_out.loc[:, 'netto'] = vogel_out.apply(lambda row: row.toename - row.afname, axis=1)
+vogel_out.loc[:, 'score'] = vogel_out.apply(lambda row: row.netto / hokken.areaal_ha.sum() * 100, axis=1)

-''' 
-Fill CLO scores with 'onbekend' for cellen in Twente
-'''
-clo_dat.fillna('onbekend', inplace=True, axis=1)
-clo_dat.loc[clo_dat.twente > 0, ['clo1543_trend', 'clo1543_score']] = 'onbekend'
+vlinder_out = pd.pivot_table(clo_df, index='vlinder_score', values='areaal_ha', aggfunc='sum')
+vlinder_out.loc[:, 'netto'] = vlinder_out.apply(lambda row: row.toename - row.afname, axis=1)
+vlinder_out.loc[:, 'score'] = vlinder_out.apply(lambda row: row.netto / hokken.areaal_ha.sum() * 100, axis=1)

-'''
-Pivot on CLO1543 scores and calculate total area per score for each soortgroep. 
-'''
-piv_out = pd.pivot_table(clo_dat, index='soortgroep', columns='clo1543_score', values='areaal_ha', aggfunc='sum')
-piv_out.loc[:, 'netto'] = piv_out.apply(lambda row: row.toename - row.afname, axis=1)
-piv_out.loc[:, 'score'] = piv_out.apply(lambda row: row.netto / hokken.areaal_ha.sum() * 100, axis=1)
-piv_out.to_clipboard(sep=';')

 '''
 Write report
@@ -118,18 +109,30 @@ timestamp = datetime.datetime.now().strftime("%y%m%d-%H%M")
 out_dir = r'c:\Users\roelo008\OneDrive - WageningenUR\a_projects\CLO1543'
 basename = 'clo1543_{0}_{1}'.format(id, timestamp)
 pgo_dat_summary = pd.pivot_table(pgo_dat, index='soortgroep', columns=['snl', 'periode'], values='n', aggfunc='count')
+cat_lims = [(dif_cats[i-1], dif_cats[i]) for i in range(1, len(dif_cats))]
+lo_lims = [l for (l,_) in cat_lims]
+up_lims = [u for (_,u) in cat_lims]
+# TODO: hier nog een mooie string van maken!
 # category_limits = ['{0} t/m {1}'.format(min(x), max(x)) for x in dif_cats]
 # categories = ', '.join('{0}: {1}'.format(k, v) for k,v in dict(zip(dif_labs, category_limits)).items())
-header = 'Extract from PGO species distribution data as follows:\n' \
-         'Soortgroepen: {0}\n' \
-         'Soortlijst: {1}\n' \
-         'Sub-soortlijst: {2}\n' \
-         'PGO data restricted to {3} 250m cells where: {4}\n' \
-         'Trends: {5}\n' \
-         'Trends berekend als # soorten in {6} minus {7} \n\n'.format(', '.join([soort for soort in soortgroepen]),
-                                                                      ', '.join(snl for snl in snl_soortlijst),
-                                                                      '-'.join(sub for sub in sub_soortlijst),
-                                                                      dat_piv.shape[0], spat_query, dif_cats,
+clo_scores =
+header = '#Model run dated: {0} by {1}\n#\n' \
+         '#Extract from PGO species distribution data with PGO Query:\n' \
+         '#  Soortgroepen: {2}\n' \
+         '#  SNL Soortlijst: {3}\n' \
+         '#  SNL Sub-soortlijst: {4}\n' \
+         '#  ==> {5} observations in {6} different 250m hokken (see also PGO DATA SUMMARY)\n' \
+         '#Selection from 250m hokken grid with Beheertypen Query:\n' \
+         '#  {7}\n' \
+         '#  ==> {8} 250m hokken with total {9} hectare.\n' \
+         '#Trend refers to species difference between {10}-{11}}\n' \
+         '#Scores are as follows:\n' \
+         ''
+    .format(timestamp, os.environ.get('USERNAME'), ', '.join([soort for soort in soortgroepen]),
+            ', '.join(snl for snl in snl_soortlijst), '-'.join(sub for sub in sub_soortlijst),
+            pgo_dat.shape[0], dat_piv.shape[0], spat_query, hokken.shape[0], hokken.areaal_ha.sum(),
+            periode_A, periode_A,
+            dif_cats,
                                                                      periode_A, periode_B)
 footer = '\nMade with Python 3.5 using pandas, geopandas, by Hans Roelofsen, WEnR team B&B, dated {0}'.format(timestamp)

@@ -138,7 +141,12 @@ with open(os.path.join(out_dir, basename + '.txt'), 'w') as f:
    f.write('\n###PGO DATA SUMMARY###\n')
    f.write(pgo_dat_summary.to_csv(sep='\t', line_terminator='\r'))
    f.write('\n##### HECTARE-TOENAME-STABIEL-AFNAME #####\n')
-    f.write(piv_out.to_csv(sep='\t', line_terminator='\r'))
+    f.write('\n\n')
+    f.write(plant_out.to_csv(sep='\t', line_terminator='\r'))
+    f.write('\n\n')
+    f.write(vogel_out.to_csv(sep='\t', line_terminator='\r'))
+    f.write('\n\n')
+    f.write(vlinder_out.to_csv(sep='\t', line_terminator='\r'))
    f.write(footer)



--- a/z_utils/clo.py
+++ b/z_utils/clo.py
@@ -233,9 +233,9 @@ def calc_clo_1543(s_periode_A, s_periode_B, bins, labels):

    try:
        sp_diff = s_periode_A.sub(s_periode_B)
-        sp_diff.name = 'clo1543_trend_{0}'.format(s_periode_A.name)
+        sp_diff.name = 'rend_{0}'.format(s_periode_A.name)
        sp_diff_score =sp_diff.apply(classifier, bins=bins, labels=labels)
-        sp_diff_score.name = 'clo1543_score_{0}'.format(s_periode_A.name)
+        sp_diff_score.name = 'score_{0}'.format(s_periode_A.name)

        return sp_diff, sp_diff_score