other small updates

c1872321 · Roelofsen, Hans · 5f806341 · c1872321 · c1872321 · c1872321
Commit c1872321 authored 1 year ago by Roelofsen, Hans
--- a/src/interactive_analysis_species/header_queries.py
+++ b/src/interactive_analysis_species/header_queries.py
 import os
 import json
+import re
+
 import pandas as pd
 import numpy as np
 import geopandas as gp
@@ -47,12 +49,14 @@ def ckdnearest(gdA: gp.GeoDataFrame, gdB: gp.GeoDataFrame):

 class DorenPlots:
    def __init__(self, source_data: dict, prepared_data: dict):
+
        self.species_df = pd.read_csv(
-            source_data["eva_species"],
-            sep="\t",
+            prepared_data["eva_species"],
+            sep=",",
            usecols=[0, 5, 8],
            header=0,
            names=["plot_obs_id", "matched_concept", "cover_percentage"],
+            comment='#',
        )

        # Add simplified species name
@@ -70,9 +74,22 @@ class DorenPlots:
            self.preparation_menu = json.load(f)
        self.preparation_menu['prepared_headers'] = prepared_data['prepared_headers']

+    def get_plots_inventory_for_species(self, species_identifier: str) -> pd.DataFrame:
+
+        pattern = re.compile("s.l.|sl.|var|var.|agg.|agg|aggr.|aggr|sensu")
+        sel = self.species_df.query(
+            f"matched_concept == '{species_identifier}' or matched_concept_simplified == '{re.split(pattern, species_identifier)[0].strip()}'"
+        )
+        if not sel.empty:
+            print(f'  found {sel.shape[0]} ({len(set(sel.plot_obs_id))}) plots for {species_identifier}')
+            return sel
+        else:
+            raise ValueError(f"No plots found for {species_identifier}")
+
    def identify_headers_with_species(
        self,
        species_identifier: str,
+            shout: bool=False
    ):
        """
        Mark headers in dataframe which contain a species. Adds or overwrites column 'has_species'
@@ -82,18 +99,30 @@ class DorenPlots:
        :return: dataframe with headers
        """

-        plots = set(
+        # Match requested species name directly on matched concept column
+        plots1 = set(
            self.species_df.query(
                f"matched_concept == '{species_identifier}'"
            ).plot_obs_id
        )
+
+        # Simplify requested species name and search on simplified matched_concept column
+        pattern = re.compile(" s\.l\.| sl\. | var | var\. |agg\. | agg | aggr\. | aggr | sensu ")
+        plots2 = set(self.species_df.query(
+            f"matched_concept_simplified == '{re.split(pattern, species_identifier)[0].strip()}'"
+        ).plot_obs_id)
+
        self.header_gdf = self.header_gdf.assign(
-            has_species=self.header_gdf.index.isin(plots)
+            has_species=self.header_gdf.index.isin(plots1.union(plots2))
        )

        if self.header_gdf.has_species.sum() == 0:
            raise ValueError(f"No plots found for {species_identifier}")

+        else:
+            if shout:
+                print(f'# Found {len(plots1)} plots directly and {len(plots2)} indirectly for {species_identifier}')
+
    def identify_headers_of_structuurtype(self, strucuurtype: str):
        """ "
        Mark headers in dataframe which belong to the requested structuurtype

--- a/src/preparation/header_management_functions.py
+++ b/src/preparation/header_management_functions.py
--- a/src/preparation/settings.py
+++ b/src/preparation/settings.py
@@ -134,6 +134,7 @@ FILTERS = {

 PREPARED_DATA = {
    "prepared_headers": r"w:\projects\DOREN22\b_prepareddata\eva_headers\20230222-1254_eva_headers.csv",
+    "eva_species": r'W:\PROJECTS\DOREN22\b_prepareddata\eva_headers\20230222-1254_eva_species.csv'
 }

 BASE_OUT_DIRECTORY = r"w:\PROJECTS\DOREN22\\b_prepareddata\\eva_headers"