Commit 1d3080b4 authored by Roelofsen, Hans's avatar Roelofsen, Hans
Browse files

Handle NoData values in the rasterstats point query. Add a self-test method for dataset integrity.

parent e5390cc5
import os
import pickle
from utils import doren_classes as dc
param_header_src = r'c:\Users\roelo008\OneDrive - WageningenUR\a_projects\DOREN\a_brondata\EVA\EVA_Doren_header.csv'
param_sp_src = r'c:\Users\roelo008\OneDrive - WageningenUR\a_projects\DOREN\a_brondata\EVA\EVA_Doren_species.csv'
param_aoi_src = r'c:\Users\roelo008\OneDrive - WageningenUR\a_projects\DOREN\geodata\AOI\ne_50m_cntrs_AOI_diss_fin.shp'
......@@ -25,10 +25,10 @@ doren.apply_requirements('req1', 'req2', 'req3', 'req4', 'req8', 'req9', 'req10'
doren.add_covar(covar_dir=cv_cntr_dir, covar_src=cv_cntr_src, covar_name='country', raster=False, column='SOVEREIGNT')
doren.add_posch(posch_src_dir=param_posch)
doren.add_covar(covar_dir=cv_soil_dir, covar_src=cv_soil_src, covar_name='soil_type', nominal=True)
doren.add_yearly_covar(covar_dir=cv_precp_dir, covar_src_basename=cv_precp_src, covar_name='5_yearly_precip')
doren.add_yearly_covar(covar_dir=cv_temp_dir, covar_src_basename=cv_temp_src, covar_name='5_yearly_temp')
doren.add_yearly_covar(covar_dir=cv_precp_dir, covar_src_basename=cv_precp_src, covar_name='five_yearly_precip')
doren.add_yearly_covar(covar_dir=cv_temp_dir, covar_src_basename=cv_temp_src, covar_name='five_yearly_temp')
doren.get_requested_species(xls=sp_req_src, sheet='PGO-1', col='newsoort', simplify_names=False)
doren.test()
doren.write_stuff('species_list')
doren.write_stuff('typische_soorten')
......@@ -39,4 +39,16 @@ with open(r'c:\Users\roelo008\OneDrive - WageningenUR\a_projects\DOREN\b_compile
format(doren.timestamp), 'wb') as handle:
pickle.dump(doren, handle, protocol=pickle.HIGHEST_PROTOCOL)
'''
pre = dc.Doren(header_src=param_header_src, sp_src=param_sp_src)
pre.initiate(sample=False, species_col='turboveg2_concept')
pre.select_plts_w_species('Sparganium natans')
print(pre.eva.shape)
print(len(pre.positive_plots))
post = dc.Doren(header_src=param_header_src, sp_src=param_sp_src)
post.initiate(sample=False, species_col='matched_concept')
post.select_plts_w_species('Sparganium natans')
print(post.eva.shape)
print(len(post.positive_plots))
'''
\ No newline at end of file
......@@ -20,4 +20,4 @@ echo $SLURM_ARRAY_TASK_ID
# Run
cd /home/WUR/roelo008/projs/doren_2019
python run_species.py $SLURM_ARRAY_TASK_ID 50 doren_20200928.pkl
\ No newline at end of file
python run_species.py $SLURM_ARRAY_TASK_ID 50 doren_20201001.pkl
\ No newline at end of file
......@@ -324,7 +324,8 @@ def get_raster_vals(coords, rast_src, nominal=False):
coords = coords.apply(shapely.geometry.Point)
# the numeric raster values
rast_vals = rast.point_query(coords, raster.read(1), interpolate=interpolation, affine=raster.affine)
rast_vals = rast.point_query(coords, raster.read(1), interpolate=interpolation, affine=raster.affine,
nodata=raster.nodata)
return pd.DataFrame(data={'vals': rast_vals}, index=coords.index)
......@@ -370,5 +371,17 @@ sp12 = 'Rosa spinosissima mod HANS'
sps = [sp1,sp2,sp3,sp4,sp5,sp6,sp7,sp8,sp9,sp10,sp11,sp12]
[re.split(pattern, x) for x in sps]
pattern = re.compile(r'[A-Z]{1}[a-z]{1,} [a-z]{1,}')
'''
import shapely
import geopandas as gp
cities = cities.loc[cities.name.str.isin(['Vatican City', 'Amsterdam', 'Athens', 'Bern', 'Paris', 'Berlin', 'Rome'])]
cities = cities.loc[cities.name.isin(['Vatican City', 'Amsterdam', 'Athens', 'Bern', 'Paris', 'Berlin', 'Rome'])]
gdf2 = gp.GeoDataFrame(data={'name': ['NorthSea', 'Coast'], 'geometry':[shapely.geometry.Point(3, 53), shapely.geometry.Point(1.653, 50.819)]},
index=[1,2])
cities = pd.concat([cities, gdf2])
rast_vals = rast.point_query(cities.geometry, raster.read(1), interpolate='bilinear', affine=raster.affine, nodata=raster.nodata)
print('\n'.join([str(x) for x in rast_vals]))
out = pd.DataFrame(data={'vals': rast_vals}, index=cities.index)
'''
\ No newline at end of file
......@@ -63,7 +63,7 @@ class Doren:
self.timestamp = datetime.datetime.now().strftime("%Y%m%d")
self.timestamp_full = datetime.datetime.now().strftime("%Y%m%d%H%M")
def initiate(self, sample=False):
def initiate(self, sample=False, species_col='matched_concept'):
"""
Read source data, sanitize column names and fix some datatypes
:return: self.eva and self.species populated with (geo)dataframes
......@@ -74,7 +74,7 @@ class Doren:
colnames_n = ['plot_obs_id', 'taxonomy', 'taxon_group', 'taxon_group_id', 'turboveg2_concept',
'matched_concept', 'match', 'layer', 'cover_perc', 'cover_code']
species.rename(columns=dict(zip(list(species), colnames_n)), inplace=True)
species['species_name_hdr'] = species.matched_concept.astype(str).apply(do.strip_leading_quote)
species['species_name_hdr'] = species.loc[:, species_col].astype(str).apply(do.strip_leading_quote)
species['species_name_hdr'] = species.species_name_hdr.apply(do.simplify_species)
self.spec = species
......@@ -162,8 +162,10 @@ class Doren:
container.append(dat)
self.req_not_found_df = pd.concat(container)
msg1 = '\nRead {0} species requested for processing, of which {1} match a species in EVA ("matched_concept")and {2} do ' \
'not (matching "turboveg2_concepts" given in brackets: \n'.format(len(self.req_sp), len(self.req_found), len(self.req_not_found))
# Summary line for the report. Placeholders {0}-{5} map one-to-one onto the six
# format() arguments: requested count, workbook path, sheet, column, number of
# requested species found and number not found.
msg1 = '\nRead {0} species requested for processing (source: {1}-sheet {2}-column {3}), ' \
'of which {4} match a species in EVA ("matched_concept") and {5} do not (but matching ' \
'"turboveg2_concepts" given in brackets): \n'\
.format(len(self.req_sp), os.path.abspath(xls), sheet, col, len(self.req_found), len(self.req_not_found))
msg2 = self.req_not_found_df.sort_index().to_csv(sep='\t')
self.report += msg1 + msg2 + '\n'
if self.verbose:
......@@ -471,7 +473,7 @@ class Doren:
column 'nearestID' in self.eva with ID of nearest positive plot
"""
ids, dist = do.ckdnearest(gdA=self.eva.loc[self.negative_plots, :] ,gdB=self.eva.loc[self.positive_plots])
ids, dist = do.ckdnearest(gdA=self.eva.loc[self.negative_plots, :], gdB=self.eva.loc[self.positive_plots])
self.eva.loc[self.negative_plots, 'dist2nearest'] = dist
self.eva.loc[self.positive_plots, 'dist2nearest'] = 0
......@@ -731,3 +733,46 @@ class Doren:
self.buffer_gdf = None
self.nearby_plots = None
self.buffer_size = None
def test(self):
"""
Test if all contents of DOREN are as expected
:return:
"""
self.report += 'Testing dataset integrity\n'
# Check for NAs
for check_col in self.status['covars'] + ['plot_id', 'country', 'year', 'veg_type', 'plot_coordinates_wgs84']:
nas = self.eva.loc[:, check_col].isna()
if any(nas):
msg = '{0} NA values found for {1} (index: {2})\n'.format(len(nas), check_col, self.eva.loc[nas].index)
else:
msg = '{0} -- no missing data\n'.format(check_col)
self.report += msg
# Check for invalid vals
earliest_yr, latest_yr = self.eva.year.min(), self.eva.year.max()
self.report += 'Plot years are between {0}-{1}\n'.format(earliest_yr, latest_yr)
for check_col in [x for x in self.status['covars'] if not x.endswith('precip')]:
sub_zeros = self.eva.loc[:, check_col] < 0
if any(sub_zeros):
msg = '{0} NA values found for {1} (index: {2})\n'.format(len(sub_zeros), check_col,
self.eva.loc[sub_zeros].index)
else:
msg = '{0} -- all values > 0\n'
self.report += msg
if self.eva.index.difference(set(self.spec.plot_obs_id)).empty:
msg = 'Full match between header- and species database plot IDs\n'
else:
eva_plots = self.eva.shape
spec_plots = len(set(self.spec.plot_obs_id))
msg = 'Difference found between EVA plot IDs {0} and plot IDs in species database {1}\n'.format(eva_plots,
spec_plots)
self.report += msg
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment