Commit f5988ea4 authored by Roelofsen, Hans
Browse files

fix 4 reading new req species list

parent bb987536
......@@ -20,8 +20,8 @@ cv_precp_dir = r'w:\PROJECTS\Doren19\a_brondata\covariables\EObs\2_compiled'
cv_precp_src = "EObs_v200e_rr_5yrmean"
cv_temp_dir = cv_precp_dir
cv_temp_src = "EObs_v200e_tg_5yrmean"
sp_req_src = r'c:\Users\roelo008\Wageningen University & Research\DOREN - General\soortenlijsten\soortenlijst te draaien 30-11-2020.xlsx'
sp_req_sheet, column = 'soortenlijst opgeschoond', 'Hans_EU_schoon'
sp_req_src = r'c:\Users\roelo008\Wageningen University & Research\DOREN - General\DOREN-2020-11-30.xlsx'
sp_req_sheet, column, skip = 'Soorten', 'wetenschappelijke_naam', 0
# sp_req_src = r'c:\Users\roelo008\Wageningen University & Research\DOREN - General\2020-09-17 uniek soorten per habitat (2).xlsx'
testing = False
......@@ -30,7 +30,7 @@ doren = dc.Doren(header_src=param_header_src, sp_src=param_sp_src)
doren.initiate(sample=testing)
doren.apply_requirements('req1', 'req2', 'req3', 'req4', 'req8', 'req9', 'req10',
aoi_src=None if testing else param_aoi_src, dem_src=param_dem_src)
doren.get_requested_species(xls=sp_req_src, sheet=sp_req_sheet, col=column, simplify_names=False)
doren.get_requested_species(xls=sp_req_src, sheet=sp_req_sheet, col=column, skip=skip, simplify_names=False)
doren.add_covar(covar_dir=cv_cntr_dir, covar_src=cv_cntr_src, covar_name='country', raster=False, column='SOV_A3')
doren.add_covar(covar_dir=cv_soil_dir, covar_src=cv_soil_src, covar_name='soil_type', nominal=True)
doren.add_yearly_covar(covar_dir=cv_precp_dir, covar_src_basename=cv_precp_src, covar_name='five_yearly_precip')
......@@ -50,10 +50,8 @@ doren.write_stuff('plot_covars_file')
doren.write_stuff('species_single_file')
'''
doren.write_stuff('species_list')
doren.write_stuff('typische_soorten')
doren.write_stuff('plot_covars_file')
doren.write_stuff('report')
# doren.write_stuff('headers_shp')
with open(r'c:\Users\roelo008\OneDrive - WageningenUR\a_projects\DOREN\b_compiled_data\a_pkl\doren_{}.pkl'.
format(doren.timestamp), 'wb') as handle:
......
......@@ -20,4 +20,4 @@ echo $SLURM_ARRAY_TASK_ID
# Run
cd /home/WUR/roelo008/projs/doren_2019
python run_species.py $SLURM_ARRAY_TASK_ID 50 doren_20201130.pkl
\ No newline at end of file
python run_species.py $SLURM_ARRAY_TASK_ID 50 doren_20201201.pkl
\ No newline at end of file
......@@ -18,7 +18,7 @@ n_batches = int(args.n_batches)
# Recover the Pickled doren object
pkl_src = os.path.join('./pkl_src/', args.pkl)
# pkl_src = r'c:\Users\roelo008\OneDrive - WageningenUR\a_projects\DOREN\b_compiled_data\a_pkl\doren_20201121.pkl'
# pkl_src = r'c:\Users\roelo008\OneDrive - WageningenUR\a_projects\DOREN\b_compiled_data\a_pkl\doren_20201130.pkl'
with open(pkl_src, 'rb') as handle:
doren = pickle.load(handle)
......@@ -28,6 +28,10 @@ doren.base_out_dir = r'./c_out'
# sp_req_src = r'c:\Users\roelo008\Wageningen University & Research\DOREN - General\2020-09-17 uniek soorten per habitat (2).xlsx'
# doren.get_requested_species(xls=sp_req_src, sheet='Verdringingssoorten', col='wetenschappelijke naam', simplify_names=False)
if batch_nr == 1:
# Write the covar file once
doren.write_stuff('plot_covars_file')
# Determine subset of requested species to process
sel = np.array_split(np.arange(0, len(doren.req_sp)), n_batches)[batch_nr].tolist()
print('Commencing processing of {0} species...'.format(len(sel)))
......@@ -42,9 +46,6 @@ for i, j in enumerate(sel, start=1):
doren.nearest_positive_queryd(col='structuurtype', val=structuurtype)
doren.get_bedekking_selected_sp()
doren.write_stuff('species_single_file')
if batch_nr == n_batches and i == len(sel):
# Write the covar file once
doren.write_stuff('plot_covars_file')
except OSError:
continue
......
......@@ -107,13 +107,16 @@ class Doren:
usecols=['PlotObservationID', 'TV2 relevé number', 'Country','Longitude', 'Latitude',
'Date of recording', 'Dataset', 'EUNIS_Old', 'EUNIS_New'])
self.report += 'Starting @ {0} with {1} EVA headers containing {2} unique ' \
'species.\n\n'.format(self.timestamp_full, eva.shape[0], len(self.species))
try:
eva.set_index('PlotObservationID', drop=False, verify_integrity=True, inplace=True)
except ValueError:
pre = eva.shape[0]
eva.drop_duplicates(subset='PlotObservationID', inplace=True)
post = eva.shape[0]
self.report += 'Dropping duplicate PlotObservationsIDs going from {0} to {1}\n'.format(pre, post)
self.report += 'Dropping duplicate PlotObservationsIDs going from {0} to {1}\n\n'.format(pre, post)
eva.set_index('PlotObservationID', drop=False, verify_integrity=True, inplace=True)
eva.rename(columns=dict(zip(do.eva_colnames_orig(), do.eva_colnames_new())), inplace=True)
......@@ -141,12 +144,17 @@ class Doren:
del eva
self.update_status()
self.report += 'Starting @ {0} with {1} EVA headers containing {2} unique ' \
'species.\n\n'.format(self.timestamp_full, self.status['n_plots'], self.status['n_species'])
# if duplicates:
# self.report += 'Removed {0} duplicates in plot_id, going from {1} to {2} headers'.format(diff, pre, post)
def get_requested_species(self, xls, sheet, col, simplify_names=False):
'''
Note: Integer kolommen kunnen geen NAs bevatten. Dus procedure zou moeten zijn:
1. inlezen
2. selecties en nieuwe kolommen
3. kolommen parsen naar gewenste datatype
'''
def get_requested_species(self, xls, sheet, col, skip, simplify_names=False):
"""
read list of species requested for processing
:param xls: file path to Excel sheet
......@@ -158,7 +166,7 @@ class Doren:
"""
# Read csv file with requested species and drop empty rows
reqs = pd.read_excel(xls, sheet_name=sheet)
reqs = pd.read_excel(xls, sheet_name=sheet, skiprows=skip)
reqs.dropna(subset=[col], inplace=True, axis=0)
# Optionally simplify species names
......@@ -167,7 +175,7 @@ class Doren:
reqs.loc[:, 'species_name_hdr'] = reqs.species_name_hdr.astype(str).apply(do.simplify_species)
# Indicate if species names are matched in EVA database
reqs.loc[:, 'species_in_EVA'] = reqs.loc[:, col].apply(lambda x: x in self.species)
# reqs.loc[:, 'species_in_EVA'] = reqs.loc[:, col].apply(lambda x: x in self.species)
# NOTE: self.species may change as a result of adding covariables!
# Attach as df
......@@ -176,7 +184,7 @@ class Doren:
# Update attributes
req_sp_set = set(self.request_df.loc[:, col])
self.req_sp = list(req_sp_set)
self.req_found = self.species.intersection(req_sp_set)
self.req_found = self.species.intersection(req_sp_set) # self.species is set van eva simplified matched_concept
self.req_not_found = req_sp_set.difference(self.species)
# List alternatives by looking for not-found-species in the 'turboveg2_concept' column instead of
......@@ -334,28 +342,28 @@ class Doren:
nox_v.rename(columns=dict(zip(yr_cols, ['year_{0}'.format(y) for y in yr_cols])), inplace=True)
# NH3 - Forest to all plots
mapper = self.eva.assign(year=('year_' + self.eva.date_of_recording.astype(str)), plot_obs_id=self.eva.index)
mapper = self.eva.assign(year=('year_' + self.eva.date_of_recording.astype(int).astype(str)), plot_obs_id=self.eva.index)
c1 = mapper['plot_obs_id'].isin(nh3_f.index)
c2 = mapper['year'].isin(nh3_f.columns)
mapper = mapper.loc[c1 & c2]
self.eva.loc[c1 & c2, 'nh3_mg_m2_f'] = nh3_f.lookup(mapper['plot_obs_id'], mapper['year'])
# NH3 - Open to all plots
mapper = self.eva.assign(year=('year_' + self.eva.date_of_recording.astype(str)), plot_obs_id=self.eva.index)
mapper = self.eva.assign(year=('year_' + self.eva.date_of_recording.astype(int).astype(str)), plot_obs_id=self.eva.index)
c1 = mapper['plot_obs_id'].isin(nh3_v.index)
c2 = mapper['year'].isin(nh3_v.columns)
mapper = mapper.loc[c1 & c2]
self.eva.loc[c1 & c2, 'nh3_mg_m2_v'] = nh3_v.lookup(mapper['plot_obs_id'], mapper['year'])
# NOx - Forest to all plots
mapper = self.eva.assign(year=('year_' + self.eva.date_of_recording.astype(str)), plot_obs_id=self.eva.index)
mapper = self.eva.assign(year=('year_' + self.eva.date_of_recording.astype(int).astype(str)), plot_obs_id=self.eva.index)
c1 = mapper['plot_obs_id'].isin(nox_f.index)
c2 = mapper['year'].isin(nox_f.columns)
mapper = mapper.loc[c1 & c2]
self.eva.loc[c1 & c2, 'nox_mg_m2_f'] = nox_f.lookup(mapper['plot_obs_id'], mapper['year'])
# NOx - Open to all plots
mapper = self.eva.assign(year=('year_' + self.eva.date_of_recording.astype(str)), plot_obs_id=self.eva.index)
mapper = self.eva.assign(year=('year_' + self.eva.date_of_recording.astype(int).astype(str)), plot_obs_id=self.eva.index)
c1 = mapper['plot_obs_id'].isin(nox_v.index)
c2 = mapper['year'].isin(nox_v.columns)
mapper = mapper.loc[c1 & c2]
......@@ -364,12 +372,12 @@ class Doren:
# Total NDep Forest plots
self.eva['totN_mg_m2_f'] = self.eva.loc[:, ["nh3_mg_m2_f", "nox_mg_m2_f"]].sum(axis=1)
self.eva['totN_kg_ha_f'] = self.eva.loc[:, 'totN_mg_m2_f'].divide(100)
self.eva['tot_N_kmol_ha_f'] = self.eva.loc[:, 'totN_kg_ha_f'].divide(14)
self.eva['totN_kmol_ha_f'] = self.eva.loc[:, 'totN_kg_ha_f'].divide(14)
# Total NDep Open plots
self.eva['totN_mg_m2_v'] = self.eva.loc[:, ["nh3_mg_m2_v", "nox_mg_m2_v"]].sum(axis=1)
self.eva['totN_kg_ha_v'] = self.eva.loc[:, 'totN_mg_m2_v'].divide(100)
self.eva['tot_N_kmol_ha_v'] = self.eva.loc[:, 'totN_kg_ha_v'].divide(14)
self.eva['totN_kmol_ha_v'] = self.eva.loc[:, 'totN_kg_ha_v'].divide(14)
# Total NDep differentiated according to plot Open/Forest classification
if hasattr(self.eva, 'hooglaag'):
......@@ -377,18 +385,21 @@ class Doren:
v = self.eva.loc[self.eva.hooglaag == 'laag'].index # Open vegetation
x = self.eva.index.difference(f.union(v)) # Anders/niet bekend
self.eva.loc[f, 'tot_N_kmol_ha'] = self.eva.loc[f, 'tot_N_kmol_ha_f']
self.eva.loc[v, 'tot_N_kmol_ha'] = self.eva.loc[v, 'tot_N_kmol_ha_v']
self.eva.loc[x, 'tot_N_kmol_ha'] = self.eva.loc[x, 'tot_N_kmol_ha_v'] # Gebruik Open Veg waneer onduidelijk
self.eva.loc[f, 'totN_kmol_ha'] = self.eva.loc[f, 'totN_kmol_ha_f']
self.eva.loc[v, 'totN_kmol_ha'] = self.eva.loc[v, 'totN_kmol_ha_v']
self.eva.loc[x, 'totN_kmol_ha'] = self.eva.loc[x, 'totN_kmol_ha_v'] # Gebruik Open Veg waneer onduidelijk
else:
print('Cannot assign NDep data differenntiated to structuurtype. ')
raise Exception('Cannot assign NDep data differenntiated to structuurtype. ')
# Drop NAs
self.eva.dropna(axis=0, subset=['tot_N_kmol_ha'], how='any', inplace=True)
self.eva.dropna(axis=0, subset=['totN_kmol_ha'], how='any', inplace=True)
self.update_status(covar=['tot_N_kmol_ha'])
self.report += 'Added NDep from POSCH: {0} rows remaining.\n'.format(self.eva.shape[0])
self.update_status(covar=['totN_kmol_ha'])
msg = '\nAdded NDep from POSCH: {0} rows remaining.\n'.format(self.eva.shape[0])
self.report += msg
if self.verbose:
print(msg)
def add_covar(self, covar_dir, covar_src, covar_name, nominal=False, keep_all=False, raster=True, **kwargs):
"""
......@@ -511,15 +522,28 @@ class Doren:
if len(missing) > 0:
msg = 'EUNIS codes w/o doorvertaling naar vegtype:\n'
msg2 = '\n'.join('{0}: {1}'.format(a, b) for a, b in enumerate(list(missing)))
self.report += msg + msg2
self.report += msg + msg2 + '\n'
# Map structuurtype and OpenBos columns
# Where to use old and new EUNIS?
use_new_indx = self.eva.loc[self.eva.eunis_src == 'eunis_new'].index
use_old_indx = self.eva.loc[self.eva.eunis_src == 'eunis_old'].index
# Map structuurtype and OpenBos columns
self.eva.loc[use_new_indx, 'structuurtype'] = self.eva.loc[use_new_indx, 'eunis_code'].map(eunis_new2structuur)
self.eva.loc[use_old_indx, 'structuurtype'] = self.eva.loc[use_old_indx, 'eunis_code'].map(eunis_old2structuur)
self.eva.loc[use_new_indx, 'hooglaag'] = self.eva.loc[use_new_indx, 'eunis_code'].map(eunis_new2bos)
self.eva.loc[use_old_indx, 'hooglaag'] = self.eva.loc[use_old_indx, 'eunis_code'].map(eunis_old2bos)
'''
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.map.html
pd.series.map(dict) geeft geen KeyError als de Key niet in de dict zit, maar geeft NAN Dit is overduidelijk
het geval vooor eunis_new2structuur['A25c']. Dit EUNIS type heeft WW **niet** doorvertaald in de EUNIS_new
Excel maar wordt **wel** genoemd als EUNIS_NEW. Dit resulteert in een NaN die daarna wordt opgevuld met "?"
Update: A25c volgens de oude typologie is niet equivalent aan de nieuwe typologie. Ie code is hetzelfde,
inhoud niet
'''
self.eva.structuurtype.fillna('?', inplace=True)
self.eva.hooglaag.fillna('?', inplace=True)
......@@ -707,7 +731,7 @@ class Doren:
report_name = '{0}_{1}_processing_report.txt'.format(self.basename, self.timestamp)
self.report += 'Written processing report to file: {0}\n\n'.format(os.path.join(out_dir, report_name))
header = 'Processing report for {0} created {1}'.format(self.basename, self.timestamp)
header = 'Processing report for {0} created {1}\n\n'.format(self.basename, self.timestamp)
footer = '\nMade with Python 3.5 using Pandas by Hans Roelofsen, WEnR team B&B.'
source = '\nSee git for source script: https://git.wur.nl/roelo008/doren_2019.'
with open(os.path.join(out_dir, report_name), 'w') as f:
......@@ -837,7 +861,7 @@ class Doren:
"""
# check if all required data is present
if 'tot_N_kmol_ha' not in self.eva.columns:
if 'totN_kmol_ha' not in self.eva.columns:
print('Cannot report plot covars file')
return None
......@@ -847,12 +871,12 @@ class Doren:
if not os.path.isdir(pg_dir):
os.mkdir(pg_dir)
out_cols = ['plot_obs_id', 'tot_N_kmol_ha', 'soil_type_label', 'country_label', 'five_yearly_precip',
'five_yearly_temp', 'eunis_code', 'structuurtype', 'hooglaag']
out_cols = ['plot_obs_id', 'totN_kmol_ha', 'soil_type_label', 'country_label', 'five_yearly_precip',
'five_yearly_temp', 'eunis_code', 'eunis_src', 'structuurtype', 'hooglaag']
for col in out_cols:
assert hasattr(self.eva, col), 'Cannot write due to missing column {}'.format(col)
out_df = self.eva.loc[:, out_cols]
out_df.round({'plot_obs_id': 0, 'tot_N_kmol_ha': 2, 'five_yearly_precip': 2, 'five_yearly_temp': 2, })\
out_df.round({'plot_obs_id': 0, 'totN_kmol_ha': 2, 'five_yearly_precip': 2, 'five_yearly_temp': 2, })\
.to_csv(os.path.join(out_dir, '{}.csv'.format(pg_out_name)), sep=',', index=False, header=True)
self.report += 'Written plot covar output file: {0}'.format(os.path.join(pg_dir, pg_out_name))
......@@ -1004,5 +1028,9 @@ class Doren:
self.report += 'Sommige EUNIS typen vertalen door naar > 1 structuurtype\n'
self.report += sel.to_csv(sep='\t')
# Check all NDep > 0
# Take the NDep columns from the column index itself: dir() also lists DataFrame
# methods/attributes and silently omits columns that are not valid identifiers.
ndeps = [col for col in self.eva.columns if str(col).startswith('totN')]
# Reduce element-wise over rows and then over columns. The builtin all() applied
# to a DataFrame iterates over its column labels (non-empty strings, always
# truthy), so it could never reach the warning branch below.
# NOTE: NaN compares False against 0 and is therefore reported as not positive.
if (self.eva.loc[:, ndeps] > 0).all().all():
    self.report += 'Alle NDep gegevens > 0\n'
else:
    self.report += '!!! Sommige NDep gegevens <= 0\n'
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment