Commit 9312f63a authored by roelo008's avatar roelo008
Browse files

function to verify species name

update to re pattern stripping subspecies info
attaching requested species as df
parent 158c703e
......@@ -2,5 +2,6 @@
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
<mapping directory="$PROJECT_DIR$/doren_2019" vcs="Git" />
</component>
</project>
\ No newline at end of file
......@@ -23,6 +23,10 @@ doren.add_yearly_covar(covar_dir=r'c:\Users\roelo008\OneDrive - WageningenUR\a_p
covar_src_basename="EObs_v200e_tg_5yrmean", covar_name='5_yearly_temp')
doren.get_requested_species(src=r'c:\Users\roelo008\OneDrive - WageningenUR\a_projects\DOREN\z_scratch\soortenlijst.csv')
doren.write_stuff('species_list')
doren.write_stuff('typische_soorten')
doren.write_stuff('report')
with open(r'c:\Users\roelo008\OneDrive - WageningenUR\a_projects\DOREN\b_compiled_data\a_pkl\doren_{}.pkl'.
format(doren.timestamp), 'wb') as handle:
format(doren.timestamp), 'wb') as handle:
pickle.dump(doren, handle, protocol=pickle.HIGHEST_PROTOCOL)
......@@ -166,7 +166,7 @@ def simplify_species(species_name):
:param species_name:
:return: species_name minus subsp or other flags
"""
pattern = re.compile(r's.l.$| subsp.? | var.? | aggr.? | ""| ssp.? ?| s. ?| mod.? ?| \(')
# Flags that mark the start of subspecies/variety/aggregate info; everything from the first match on is dropped.
# Dots are escaped so they match a literal '.' only: unescaped, 's.l.$' also matched ordinary
# name endings such as "...sula", and ' s.? ' matched e.g. " sp ".
pattern = re.compile(r's\.l\.$| subsp\.? | var\.? | aggr\.? | ""| ssp\.? | s\.? | mod\.? | \(| aggr\.?$| sensu\.? ')
if re.search(pattern, species_name):
out = re.split(pattern, species_name)[0]
else:
......@@ -174,6 +174,19 @@ def simplify_species(species_name):
return out.strip()
def verify_species(species_name):
    """
    Verify that a species name has the binomial "<Genus> <species>" format.

    The name must be exactly one capitalised word, a single space and one
    all-lowercase word (ASCII letters only), e.g. "Rosa spinosissima".
    :param species_name: species name (str)
    :return: Boolean, True if the name complies with the expected format
    """
    # fullmatch instead of match with '^...$': '$' also matches just before a
    # trailing newline, which would let "Rosa canina\n" pass as valid.
    return re.fullmatch(r'[A-Z][a-z]+ [a-z]+', species_name) is not None
def strip_leading_quote(species_name):
""""
There are some species names starting with a quote. Remove
......@@ -306,3 +319,25 @@ def get_raster_vals(coords, rast_src, nominal=False):
rast_vals = rast.point_query(coords, rast_src, interpolate=interpolation)
return pd.DataFrame(data={'vals': rast_vals}, index=coords.index)
'''
pattern = re.compile(r' s\.l\.$| subsp\.? | var\.? | aggr\.? | ""| ssp\.? | s\.? | mod\.? | \(')
sp1 = 'Rosa spinosissima' # begint met s
sp2 = 'Rosa spinosissima s.l.'
sp3 = 'Rosa spinosissima subsp. HANS'
sp4 = 'Rosa spinosissima subsp HANS'
sp5 = 'Rosa spinosissima var. HANS'
sp6 = 'Rosa spinosissima var HANS'
sp7 = 'Rosa spinosissima aggr. HANS'
sp8 = 'Rosa spinosissima aggr HANS'
sp9 = 'Rosa spinosissima ssp. HANS'
sp10 = 'Rosa spinosissima ssp HANS'
sp11 = 'Rosa spinosissima s. HANS'
sp12 = 'Rosa spinosissima s HANS'
sp13 = 'Rosa spinosissima mod. HANS'
sp14 = 'Rosa spinosissima mod HANS'
sps = [sp1,sp2,sp3,sp4,sp5,sp6,sp7,sp8,sp9,sp10,sp11,sp12,sp13,sp14]
[re.split(pattern, x) for x in sps]
'''
pattern = re.compile(r'[A-Z]{1}[a-z]{1,} [a-z]{1,}')
\ No newline at end of file
......@@ -38,6 +38,7 @@ class Doren:
self.species = None # all unique species
self.species2nr = None # dictionary mapping species name to number
self.nr2species = None # reversed dictionary
self.weird_species = None # species non complying to expected format
self.positive_plots = None # plot IDs for plots containing a certain species
self.negative_plots = None # plot IDs for plots not containing a certain species
self.nearby_plots = None # plot IDs for plots within Xm buffer of self.positive_plots
......@@ -49,8 +50,11 @@ class Doren:
self.basename = 'DOREN' # basename for all output
self.verbose = verbose # boolean, report on progress?
self.report = '' # tracking everything in a report
self.request_df = None # df with all requested species
self.req_sp = None # species requested for processing from external source
self.req_found = None # subset of req_sp found in self.species
self.req_not_found = None # subset of req_sp not found in self.species
# base directory for all output
self.requested_species = None #
self.base_out_dir = r'c:\Users\roelo008\OneDrive - WageningenUR\a_projects\DOREN\z_scratch'
# source to shapefile with background images
self.background_shp = r'c:\Users\roelo008\OneDrive - WageningenUR\a_projects\DOREN\geodata\europe_3035.shp'
......@@ -73,11 +77,18 @@ class Doren:
species['species_name_hdr'] = species.species_name_hdr.apply(do.simplify_species)
self.spec = species
# Unique species
# Unique species as attribute (set) and also as a pd.Series as temporary object applying functions to
self.species = set(self.spec.species_name_hdr)
self.species2nr = {v: i for i, v in enumerate(pd.Series(list(self.species)).sort_values())}
sp_series = pd.Series(list(self.species)).sort_values()
self.species2nr = {v: i for i, v in enumerate(sp_series)}
self.nr2species = {v: k for k, v in self.species2nr.items()}
# Verify species list for species name formatting: "Genus species"
sp_check = sp_series.apply(do.verify_species)
if not all(sp_check):
# Report the number of NON-complying names: sp_check is True for valid names, so count its negation.
print(' found {0} species not complying to expected format'.format((~sp_check).sum()))
self.weird_species = sp_series.loc[~sp_check]
# Read EVA Header data
if sample:
# Always use first 4 rows (for header and comments) plus 500 random rows
......@@ -113,17 +124,28 @@ class Doren:
:return:
"""
# assumed to be CSV sheet for now
# Read csv file with requested species ('typische soorten')
reqs = pd.read_csv(src)
reqs.loc[:, 'foo'] = reqs.wteneschappelijke_soortnaam.astype(str).apply(do.strip_leading_quote)
reqs.loc[:, 'foo'] = reqs.foo.astype(str).apply(do.simplify_species)
requested_species = set(reqs.foo)
match = self.species.intersection(requested_species)
non_match = requested_species.difference(self.species)
self.requested_species = list(requested_species)
reqs.dropna(subset=['wteneschappelijke_soortnaam'], inplace=True)
# Simplify suggested species names
reqs.loc[:, 'species_name_hdr'] = reqs.wteneschappelijke_soortnaam.astype(str).apply(do.strip_leading_quote)
reqs.loc[:, 'species_name_hdr'] = reqs.species_name_hdr.astype(str).apply(do.simplify_species)
# Indicate if species names are matched in EVA database
reqs.loc[:, 'species_in_EVA'] = reqs.species_name_hdr.apply(lambda x: x in self.species)
# NOTE: self.species may change as a result of adding covariables!
# Attach as df
self.request_df = reqs
# Update attributes
self.req_sp = set(self.request_df.species_name_hdr)
self.req_found = self.species.intersection(self.req_sp)
self.req_not_found = self.req_sp.difference(self.species)
msg = 'Read {0} species requested for processing, of which {1} match a species in EVA and {2} do not.'.format(
len(self.requested_species), len(match), len(non_match))
len(self.req_sp), len(self.req_found), len(self.req_not_found))
self.report += msg
if self.verbose:
print(msg)
......@@ -401,7 +423,16 @@ class Doren:
if not os.path.isdir(out_dir):
os.mkdir(out_dir)
if what == 'species_list':
if what == 'typische_soorten':
'''Write requested species to file'''
csv_out_dir = os.path.join(out_dir, 'csv')
if not os.path.isdir(csv_out_dir):
os.mkdir(csv_out_dir)
req_out_name = '{0}_{1}_typische_soorten.csv'.format(self.basename, self.timestamp)
self.request_df.to_csv(os.path.join(csv_out_dir, req_out_name), sep=',', index=False)
elif what == 'species_list':
'''write species list and frequencies to file'''
csv_out_dir = os.path.join(out_dir, 'csv')
......@@ -568,6 +599,7 @@ class Doren:
def update_status(self, covar=None):
"""
Update status of the object
covar: name of covariable to be added to self.status['covars']
:return: updates self.status
"""
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment