diff --git a/src/curve_building/build_species_list_per_habitattype.py b/src/curve_building/build_species_list_per_habitattype.py index c241bd65ec048d2f06d76cf0ef25c474bca0a8a8..6b6efd4d706f5b58e4b9ce448b4b7577dcc23363 100644 --- a/src/curve_building/build_species_list_per_habitattype.py +++ b/src/curve_building/build_species_list_per_habitattype.py @@ -7,6 +7,7 @@ from src.preparation.settings import DOREN1_REFERENCE_DATA, PREPARED_DATA """ Dit script bouwt een Excel file waarin de GEVRAAGDE soorten worden vergeleken met de soorten uit EVA. +# TODO: verdere uitleg van dit script. """ @@ -16,7 +17,7 @@ HEADER_COUNT_THRESHOLD = ( 100 # minimum nr of headers in <timestamp>_eva_headers.csv containing the species ) -# Read brondata +# Read brondata. doren1_kwalificerend = pd.read_excel(DOREN1_REFERENCE_DATA, sheet_name="Sheet1_flat") habitattype_typisch = pd.read_excel( @@ -34,17 +35,17 @@ rode_lijst = pd.read_excel(DOREN1_REFERENCE_DATA, sheet_name="rode_lijst_flora") counts_file = os.path.join( os.path.dirname(PREPARED_DATA["prepared_headers"]), - f"{os.path.basename(PREPARED_DATA['prepared_headers'])[:18]}species_lists.csv", + f"{os.path.basename(PREPARED_DATA['prepared_headers'])[:25]}_species_list_summary.csv", ) counts_w_subspecies = pd.read_csv( counts_file, usecols=["matched_concept", "pre_filter", "post_filter"], ) # Harmoniseer diverse subsoort-aanduidingen naar "ssp." -counts_w_subspecies["matched_concept"] = counts_w_subspecies.matched_concept.str.replace( - "(ssp\.|subsp\.|s\.l\.|s\.)", "ssp.", regex=True +counts_w_subspecies["matched_concept_harmonized"] = counts_w_subspecies.matched_concept.str.replace( + "(ssp\.|subsp\.|var\.|s\.)", "ssp.", regex=True ) -counts_w_subspecies.set_index("matched_concept", drop=True, inplace=True) +counts_w_subspecies.set_index("matched_concept_harmonized", drop=True, inplace=True) counts_wo_subspecies = pd.read_csv( counts_file, @@ -52,7 +53,7 @@ counts_wo_subspecies = pd.read_csv( index_col="simplified", ).dropna(subset=["post_filter_simplified"], how="all") -# Concatenate kwalifcierende en typische informatie +# Concatenate kwalifcierende en typische-soorten dat = pd.concat( [ doren1_kwalificerend.loc[:, ["wetenschappelijke_naam", "Habitattype", "Wat"]], @@ -60,13 +61,12 @@ dat = pd.concat( ], ignore_index=True, ) -# s.l. sensu-lato in brede zin. Verwijder deze aanduiding, zodat de originele soortnaam wordt gebruikt -dat.wetenschappelijke_naam = dat.wetenschappelijke_naam.str.replace(' s.l.', '') -# Add alternatieve naam. Alternatieve namen zijn ALLEEN bepaald voor gevraagde soorten die 0x voorkomen in de EVA database. -dat["wetenschappelijke_naam_synoniem"] = dat.wetenschappelijke_naam.map( +# Add synoniemen, deze zijn door WW en FvdZ bepaald voor ALLEEN gevraagde soorten die 0x voorkomen in de EVA database +# Verwijder hier ook de s.l. sensu lato toevoeging. +dat["wetenschappelijke_naam_synoniem"] = dat.wetenschappelijke_naam.str.replace(' s.l.', '').map( synoniemen -).fillna(dat.wetenschappelijke_naam) +).fillna(dat.wetenschappelijke_naam.str.replace(' s.l.', '')) # Add counts of each species in the EVA database. Distinguish count pre- and post filtering of EVA database # Subspecies are searched for in EVA database as is. Example: @@ -122,6 +122,17 @@ queries = { for k, v in queries.items(): dat[k] = dat.index.isin(dat.query(v).index) +# Dataframe with unique species that comply to >=1 variant(en) +uniek_gevraagd = dat.loc[(dat.variant01) + | (dat.variant02) + | (dat.variant03) + | (dat.variant04),'wetenschappelijke_naam'].drop_duplicates(ignore_index=True) +uniek_matched = uniek_gevraagd.map(dat.set_index('wetenschappelijke_naam').wetenschappelijke_naam_synoniem.to_dict()) +uniek_as_in_eva = uniek_matched.map(counts_w_subspecies.matched_concept.to_dict()) +uniek_df = pd.DataFrame({'gevraagd': uniek_gevraagd, + 'matchend': uniek_matched, + 'as_eva': uniek_as_in_eva}).sort_values(by='as_eva') + # Write contents to Excel file with (pd.ExcelWriter( os.path.join( @@ -179,18 +190,4 @@ with (pd.ExcelWriter( ) # Alle unieke soortnamen die in tenminste een variant voorkomen. Gebruik synoniemen, want die zitten altijd in EVA. - u = ( - pd.Series( - pd.unique( - dat.loc[ - (dat.variant01) - | (dat.variant02) - | (dat.variant03) - | (dat.variant04), - "wetenschappelijke_naam_synoniem", - ] - ) - ) - .sort_values() - .to_excel(dest, index=False, sheet_name="soortenlijst_voor_EVA_queries") - ) + uniek_df.to_excel(dest, index=False, sheet_name="soortenlijst_voor_EVA_queries") diff --git a/src/curve_building/response_curve_data_delivery.py b/src/curve_building/response_curve_data_delivery.py index 49f23b6180afef12c499027b3bbd3b0bd41d64dc..a88021c0a1a01a7bd110d8ee51a95af81a9cb9be 100644 --- a/src/curve_building/response_curve_data_delivery.py +++ b/src/curve_building/response_curve_data_delivery.py @@ -70,7 +70,9 @@ def species_data_file( print(f"building species response file for {identifier}.") # Identify plots with the species - plots.identify_headers_with_species(species_identifier=identifier) + # identifier = 'Agrostis canina' + plots.identify_headers_with_species(species_identifier=identifier, + include_plots_with_subspecies=True) # Get bedekking as series. Query the Species inventory dataframe for: # First the selected plot-observation IDs @@ -134,8 +136,12 @@ if __name__ == "__main__": plots = DorenPlots(source_data=SOURCE_DATA, prepared_data=PREPARED_DATA) for species in args.ids: - df = species_data_file(plots=plots, identifier=species) - out_file_name = f'{species.replace(" ", "_")}_plot_response_file.csv' + try: + df = species_data_file(plots=plots, identifier=species) + except ValueError as e: + print(e) + continue + out_file_name = f'{slugify(species)}_plot_response_file.csv' with open(os.path.join(args.out_dir, out_file_name), "w") as f: # with open(os.path.join(r'c:/apps/z_temp', out_file_name), 'w') as f: f.write( diff --git a/src/interactive_analysis_species/header_queries.py b/src/interactive_analysis_species/header_queries.py index 2b933bee133e40f52c1711b33a35dd0a9682a4a8..205d3b80766bdb020a4d92def071abf39bf0e519 100644 --- a/src/interactive_analysis_species/header_queries.py +++ b/src/interactive_analysis_species/header_queries.py @@ -73,6 +73,7 @@ class DorenPlots: def identify_headers_with_species( self, species_identifier: str, + include_plots_with_subspecies: bool=False ): """ Mark headers in dataframe which contain a species. Adds or overwrites column 'has_species' @@ -82,10 +83,11 @@ class DorenPlots: :return: dataframe with headers """ + query = {True: f'matched_concept == "{species_identifier}" or matched_concept_simplified == "{species_identifier}"', + False: f'matched_concept == "{species_identifier}"'}[include_plots_with_subspecies] + plots = set( - self.species_df.query( - f"matched_concept == '{species_identifier}'" - ).plot_obs_id + self.species_df.query(query).plot_obs_id ) self.header_gdf = self.header_gdf.assign( has_species=self.header_gdf.index.isin(plots) diff --git a/src/preparation/build_header_database.py b/src/preparation/build_header_database.py index 7f5b316cb6293406e398635b4ea81489ef36beea..1b060a364e72a456a5bff6429cc8030b7ee0400a 100644 --- a/src/preparation/build_header_database.py +++ b/src/preparation/build_header_database.py @@ -102,13 +102,18 @@ def buid_header_database(sample: bool, basename: str, ndep_averaging_period: int ) # Write species list to file - species_list_summary = prep.species_list_(df=df) + # species_list_summary = prep.species_list_(df=df) + # species_list_summary.to_csv( + # os.path.join(out_dir, f'{ts}_{basename}_species_list_summary.csv'), + # index_label='matched_concept', sep=',' + # ) + + # Copy and write SOURCE_DATA["eva_species"] to output dir, restricted to selected plots only. + species_list = prep.eva_species_list_selected(df=df) species_list_summary.to_csv( - os.path.join(out_dir, f'{ts}_{basename}_species_list_summary.csv'), - index_label='matched_concept', sep=',' - ) - - # OPTIONAL TO DO: copy SOURCE_DATA["eva_specieS"] and write back to file with only selected headers. + os.path.join(out_dir, f'{ts}_{basename}_species_list.csv'), + index=False + ) print( f'Done @ {datetime.datetime.now().strftime("%d %b %Y %H:%M:%S")}. See results: {BASE_OUT_DIRECTORY}\\{basename}!' diff --git a/src/preparation/header_management_functions.py b/src/preparation/header_management_functions.py index a58741ddd8fb693682558156f819ac76088883e0..60532b1c05883643d48ea54180dac484bcc4e7f0 100644 --- a/src/preparation/header_management_functions.py +++ b/src/preparation/header_management_functions.py @@ -56,44 +56,48 @@ def get_headers( df = ( pd.read_csv( headers_src, - sep=",", + sep="\t", header=1, comment="#", usecols=[ - "plot_id", - # "tv2_releve_nr", + "PlotObservationID", "longitude", "latitude", "year", "dataset", - "eunis_old", - "eunis_new", + "expert_system", + # "eunis_old", + # "eunis_new", ], + # Note: assumes columns in source data are in this order! + # Headers are: PlotObservationID, PlotID, TV2 relevé number, Country, Cover abundance scale, Date of recording, Relevé area (m²), Aspect (°), Slope (°), Altitude (m), Expert System, Longitude, Latitude, Location uncertainty (m), Dataset, Access regime names=[ + "PlotObservationID", "plot_id", "tv2_releve_nr", "country", "cover_abundance_scale", "year", "releve_area_m2", - "altitude_m", "aspect_deg", "slope_deg", - "eunis_old", - "eunis_new", + "altitude_m", + "expert_system", "longitude", "latitude", "location_uncertainty_m", "dataset", + "acces_regime" ], - na_values={"eunis_old": "?", "eunis_new": ["?", "I"]}, + # na_values={"eunis_old": "?", "eunis_new": ["?", "I"]}, + na_values={'expert_system': '~'}, ) - .drop_duplicates("plot_id") - .set_index("plot_id", drop=True) - .dropna(axis=0, how="any", subset=["latitude", "longitude", "year"]) - .dropna(axis=0, how="all", subset=["eunis_old", "eunis_new"]) + .drop_duplicates("PlotObservationID") + .set_index("PlotObservationID", drop=True) + .dropna(axis=0, how="any", subset=["latitude", "longitude", "year", 'expert_system']) + # .dropna(axis=0, how="all", subset=["eunis_old", "eunis_new"]) .astype({"year": int}) - ) + ).rename(columns={'expert_system': 'eunis_code'}) update = f"Reading headers from file {headers_src}, {df.shape[0]:,} remaining after dropping NAs in latitude, longitude and date fields and (eunis_old AND eunis_new)." print(update) return (df, f"{msg}\n{update}", df.shape[0]) @@ -1019,7 +1023,7 @@ def simplify_species(species_name: str) -> str: :param species_name: :return: species_name minus subsp or other flags """ - pattern = re.compile(r'subsp.? | var.? | aggr.? | ""| ssp.? | s.? | mod.? | \(| aggr\.?$| sensu\.? ') + pattern = re.compile(r'subsp.? | var.? | aggr.? | ssp.? | s.? | mod.? | \(| aggr\.?$| sensu\.? ') if re.search(pattern, species_name): out = re.split(pattern, species_name)[0] else: @@ -1027,6 +1031,28 @@ def simplify_species(species_name: str) -> str: return out.strip() +def eva_species_list_selected(df: pd.DataFrame) -> pd.DataFrame: + """ + Read the EVA species list and return with selected plots only + Parameters + ---------- + df: header dataframe + + Returns + ------- + + """ + + species = pd.read_csv( + SOURCE_DATA["eva_species"], + sep="\t", + usecols=["PlotObservationID", "Matched concept", "Cover %", "Cover code"], + header=0, + ) + + return species.loc[species.PlotObservationID.isin(df.index), :] + + def species_list_(df: pd.DataFrame) -> pd.DataFrame: """ Build dataframe with species lists. Columns are: @@ -1073,8 +1099,17 @@ def species_list_(df: pd.DataFrame) -> pd.DataFrame: species['simplified_concept'] = species.matched_concept.map(simplifier) df01['simplified'] = df01.index.map(simplifier) - simplified_concept_count_pre = pd.value_counts(species.simplified_concept) - simplified_concept_count_post = pd.value_counts(species.loc[species.selected == 1, 'simplified_concept']) + """ + By simplifying species names, a plot can contain the same species twice. + Eg + plot_obs_id matched_concept simplified_concept + 1034043 Agrostis canina Agrostis canina + 1034043 Agrostis canina aggr. Agrostis canina + 1034043 Anthoxanthum odoratum Anthoxanthum odoratum + Therefore drop duplicates + """ + simplified_concept_count_pre = pd.value_counts(species.drop_duplicates(subset=['plot_obs_id', 'simplified_concept']).simplified_concept) + simplified_concept_count_post = pd.value_counts(species.drop_duplicates(subset=['plot_obs_id', 'simplified_concept']).loc[species.selected == 1, 'simplified_concept']) df02 = pd.DataFrame({'pre_filter': simplified_concept_count_pre, 'post_filter': simplified_concept_count_post}).fillna(0) @@ -1091,10 +1126,10 @@ def species_list_(df: pd.DataFrame) -> pd.DataFrame: # lst2 = set([simplify_species(s) for s in lst]) # pd.Series(list(lst2)).sort_values().to_clipboard(index=False) # -# pivot = pd.read_excel(r'c:\Users\roelo008\Wageningen University & Research\DOREN 2022 - fase01 - verkenning\WP06_soorenselectie_habitat\referentie_doren1\relaties_tussen_de_hoeveelheid_stikstofdepositie_-hydrotheek_(stowa)_547784.xlsx', -# sheet_name='Sheet1_copy') -# pivot2 = pivot.drop(columns=['soortnr', 'Nederlandse naam', 'EU-naam', 'count']) -# flat = pd.melt(pivot2.drop_duplicates(subset=['wetenschappelijke naam']), id_vars='wetenschappelijke naam').dropna(subset='value') -# flat['Wat'] = flat.value.map({1: 'kwalificerend', 2: 'kwalificerend en verdringend', 3: 'verdringend'}) -# flat['Habitattype'] = 'H' + flat.variable.str.replace('-', '_').fillna(flat.variable.astype(str)) -# flat.to_clipboard(index=False, sep=',') +pivot = pd.read_excel(r'c:\Users\roelo008\Wageningen University & Research\DOREN 2022 - fase01 - verkenning\WP06_soorenselectie_habitat\referentie_doren1\relaties_tussen_de_hoeveelheid_stikstofdepositie_-hydrotheek_(stowa)_547784.xlsx', + sheet_name='Sheet1_copy') +pivot2 = pivot.drop(columns=['soortnr', 'Nederlandse naam', 'EU-naam', 'count']) +flat = pd.melt(pivot2.drop_duplicates(subset=['wetenschappelijke naam']), id_vars='wetenschappelijke naam').dropna(subset='value') +flat['Wat'] = flat.value.map({1: 'kwalificerend', 2: 'kwalificerend en verdringend', 3: 'verdringend'}) +flat['Habitattype'] = 'H' + flat.variable.str.replace('-', '_').fillna(flat.variable.astype(str)) +flat.to_clipboard(index=False, sep=',') diff --git a/src/preparation/settings.py b/src/preparation/settings.py index c6a0911c9f34eb11db714b1c0ee5fbeb8b69548c..3690fe1fe57161b55e0d30bd852b182d3d590cdd 100644 --- a/src/preparation/settings.py +++ b/src/preparation/settings.py @@ -4,8 +4,10 @@ SOURCE_DATA = { "doren_master_excel": r"c:\Users\{0}\Wageningen University & Research\DOREN 2022 - fase01 - verkenning\WP01_inzicht\DOREN_2022_masterfile.xlsx".format( os.environ.get("username") ), - "eva_headers": r"w:\PROJECTS\Doren19\a_brondata\EVA\delivery_20201118\EVA_Doren_header.csv", - "eva_species": r"w:\PROJECTS\Doren19\a_brondata\EVA\delivery_201909\EVA_Doren_species.csv", + # "eva_headers": r"w:\PROJECTS\Doren19\a_brondata\EVA\delivery_20201118\EVA_Doren_header.csv", + # "eva_species": r"w:\PROJECTS\Doren19\a_brondata\EVA\delivery_201909\EVA_Doren_species.csv", + "eva_headers": r'w:\projects\DOREN22\a_sourcedata\EVA\delivery_Hennekens_20231016\eva_2023_10_16_header.csv', + "eva_species": r"w:\projects\DOREN22\a_sourcedata\EVA\delivery_Hennekens_20231016\eva_2023_10_16_species.csv", "aoi": r"W:\PROJECTS\Doren19\a_brondata\AOI\ne_50m_cntrs_AOI_diss_fin.shp", "elevation": r"W:\PROJECTS\Doren19\a_brondata\covariables\DEM\DTM_3035.tif", "soil_map": r"w:\PROJECTS\Doren19\a_brondata\covariables\soil\b_processed\WRBLEV1_laea.tif", @@ -133,7 +135,7 @@ FILTERS = { } PREPARED_DATA = { - "prepared_headers": r"w:\projects\DOREN22\b_prepareddata\eva_headers\20230222-1254_eva_headers.csv", + "prepared_headers": r"w:\projects\DOREN22\b_prepareddata\eva_headers\eva_headers_20231011-1420\20231011-1420_eva_headers.csv", } BASE_OUT_DIRECTORY = r"w:\PROJECTS\DOREN22\\b_prepareddata\\eva_headers"