From f7d41077b1f4c1813c8f7d4311e5337b83100212 Mon Sep 17 00:00:00 2001 From: Allard de Wit <allard.dewit@wur.nl> Date: Thu, 15 Apr 2021 23:28:09 +0200 Subject: [PATCH] Several changes: - Added documentation on grompy cmd in readme.md - Added a pyproject.toml for installing with flit - Added a license --- .gitignore | 6 +- LICENSE | 21 +++++ README.md | 150 ++++++++++++++++++++++++++++++++- grompy/__init__.py | 8 +- grompy/{__main__.py => cmd.py} | 0 pyproject.toml | 21 +++++ 6 files changed, 202 insertions(+), 4 deletions(-) create mode 100644 LICENSE rename grompy/{__main__.py => cmd.py} (100%) create mode 100644 pyproject.toml diff --git a/.gitignore b/.gitignore index fda2af7..25a5b15 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,6 @@ runfiles/* -.idea/*.xml +.idea/ __pycache__/ -notebooks/.ipynb_checkpoints/* \ No newline at end of file +notebooks/.ipynb_checkpoints/* +/test.py +dist/ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..c43b5b2 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2021 Allard de Wit + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/README.md b/README.md index b0c2470..b472466 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,151 @@ # grompy -Tools for working with parcel-based satellite timeseries from groenmonitor.nl \ No newline at end of file +Tools for working with parcel-based satellite time-series observations from groenmonitor.nl + +## Introduction + +Grompy is a tool to process and access parcel-based satellite observations from GroenMonitor.nl. +It was developed because accessing the satellite time-series for a parcel was cumbersome using +the existing data structure. Moreover, querying groenmonitor data from AgroDataCube is slow and inflexible +for researchers who just want to play around with the data. Instead, grompy allows fast and easy access to +all parcel observations and can provide simultaneous access to parcel info, optical and radar observations. + +The grompy package consists of two components: + + 1. A commandline tool (e.g. `grompy`) to define/check groenmonitor CSV files and finally load them + into (SQLite) database tables. + 2. The python package `grompy` which provides `grompy.DataAccessProvider` which can be used to efficiently + access the time-series data stored in the database. + +## Command line tool + +### initializing + +The `grompy` command can be used to load parcel information and groenmonitor CSV files with parcel observations +into a database structure. +For this purpose a file `grompy.yaml` is required which provides the information required to process all inputs. +This includes the paths to the different CSV files, the path to the shapefile with parcel information and +the URI for the database where the data have to be written. 
The `grompy.yaml` file is the entry point for all +other grompy operations as well as the `DataAccessProvider`. + +The `grompy.yaml` file can be generated with the command `grompy init <data path>` and for doing so, grompy assumes +a certain folder structure which looks like this: +``` +<data path> /BRP/gewaspercelen_<year>.shp + /Optisch/ - CSV files with sentinel2 data + /Radar/ - CSV with radar data +``` + +In practice it is most convenient to keep the `grompy.yaml` file together with the data. So change directory +to the data folder and execute: +```commandline +cd <data path> +grompy init . +``` + +The init command creates the `grompy.yaml` and sets the path to the inputs/outputs based on the input +for `<data path>`. In this case, the current directory `.`. The `grompy.yaml` now looks like this: +```yaml +grompy: + version: 1.0 +parcel_info: + dsn: sqlite:////home/wit015/Data/groenmonitor/parcel_info.db3 + counts_file: /home/wit015/Data/groenmonitor/Optisch/perceelscount.csv + shape_file: /home/wit015/Data/groenmonitor/BRP/gewaspercelen_2019.shp + table_name: parcel_info +datasets: + sentinel2_reflectance_values: + dsn: sqlite:////home/wit015/Data/groenmonitor/sentinel2_reflectance_values.db3 + bands: + NDVI: /home/wit015/Data/groenmonitor/Optisch/zonal_stats_mean_2019_ADC.csv + B02: /home/wit015/Data/groenmonitor/Optisch/zonal_stats_mean_B02_2019_ADC.csv + B03: /home/wit015/Data/groenmonitor/Optisch/zonal_stats_mean_B03_2019_ADC.csv + B04: /home/wit015/Data/groenmonitor/Optisch/zonal_stats_mean_B04_2019_ADC.csv + B05: /home/wit015/Data/groenmonitor/Optisch/zonal_stats_mean_B05_2019_ADC.csv + B06: /home/wit015/Data/groenmonitor/Optisch/zonal_stats_mean_B06_2019_ADC.csv + B07: /home/wit015/Data/groenmonitor/Optisch/zonal_stats_mean_B07_2019_ADC.csv + B08: /home/wit015/Data/groenmonitor/Optisch/zonal_stats_mean_B08_2019_ADC.csv + B11: /home/wit015/Data/groenmonitor/Optisch/zonal_stats_mean_B11_2019_ADC.csv + B12: 
/home/wit015/Data/groenmonitor/Optisch/zonal_stats_mean_B12_2019_ADC.csv + B8A: /home/wit015/Data/groenmonitor/Optisch/zonal_stats_mean_B8A_2019_ADC.csv + sentinel2_reflectance_std: + dsn: sqlite:////home/wit015/Data/groenmonitor/sentinel2_reflectance_std.db3 + bands: + NDVI: /home/wit015/Data/groenmonitor/Optisch/zonal_stats_std_2019_ADC.csv + B02: /home/wit015/Data/groenmonitor/Optisch/zonal_stats_std_B02_2019_ADC.csv +... +``` +The `grompy.yaml` file specifies several sections: + + - A first section describing the grompy version + - the 'parcel_info' section providing the path to the shapefile with parcel information as well + as a 'counts_file' (see below) which provides for each parcel the satellite pixel count. Finally, the data + source name (dsn) for the database to write to, and the name of the output table. + - the 'datasets' section which provides the database dsn and the paths to the different CSV files + belonging to the dataset. The name of the dataset will be used for the output table in the database + while the names of the CSV files will be used as the table column names. Note that the number of datasets + can be variable as well as the number of CSV files in a dataset. e.g. you can just add datasets or paths + to CSV files within a dataset. This aspect provides a lot of flexibility. + +After the `grompy.yaml` has been created you can go to the next step. + +**note**: the counts file is a CSV file with two columns: the `fieldID` column and the `count` column which +represents the number of useable pixels in the parcel (excluding border pixels). 
The counts file can be most +easily generated by taking one of the input CSV files (which also include the pixel count, but this is ignored +during loading) and generating the counts file with `awk`: +```commandline +cat <CSV file> | awk 'BEGIN{FS=","; OFS=","}{print $1, $2}' > <counts_file.csv> +``` + +### checking + +The `grompy.yaml` is a relatively complex input structure and manually checking all paths is rather cumbersome. +Therefore, grompy can check if the YAML file is OK by executing: + +```commandline +grompy check +``` +It assumes that the `grompy.yaml` resides in the current directory. Grompy will now read the YAML and carry out +several checks, including: + - If files exist. + - If connections to database can be opened. + - If the CSV files of the different datasets all have the same number of lines. + +Grompy will display a lot of output on the screen. If everything is fine, the last line will show: +```commandline +OK! All inputs seem fine. +``` +If not, open the YAML file with a text editor and manually correct any problems that were found. Next, rerun +`grompy check` to see if all errors are gone. Now we are ready for the final step. + +**note:** You cannot skip the `grompy check` step because it modifies the YAML file and adds some additional +information to it. Running `grompy load` on an unchecked `grompy.yaml` will result in grompy asking you to +run `grompy check` first. + +### loading + +The final step is to load the parcel information and satellite observations into the database tables. This can be +done with the `grompy load` command. Also here grompy assumes that the `grompy.yaml` file resides in the current +directory. Grompy will now show the following output: +```commandline +Start loading parcel information. This will take some time... 
+Starting loading of: sentinel1_backscatter +Starting loading of: sentinel1_coherence +Starting loading of: sentinel2_reflectance_std +Starting loading of: sentinel2_reflectance_values + |--------------------------------------------------| 0.01% +``` + +In the first stage, grompy will load the parcel information. It uses geopandas to load the shapefile and +it will take some time to complete this operation. Next, it will start loading the datasets. Loading of +data into the database can easily take several hours depending on the speed of the underlying hardware. +Moreover, loading of datasets is done in parallel. Grompy will start as many parallel processes as there are +datasets defined in the `grompy.yaml` file. Therefore, grompy should only be applied on machines with +sufficient cores and writing should be done to different database files (in case of SQLite) or a database +server with sufficient capacity to handle multiple streams of data. Note that grompy can write all information +into one SQLite database, but write locks on the database will cause delays in processing so this is not +recommended. + + +## Accessing data processed by grompy + diff --git a/grompy/__init__.py b/grompy/__init__.py index 0aa1114..0f270ca 100644 --- a/grompy/__init__.py +++ b/grompy/__init__.py @@ -1,4 +1,10 @@ # -*- coding: utf-8 -*- # Copyright (c) 2021 Wageningen Environmental Research # Allard de Wit (allard.dewit@wur.nl), April 2021 -from .dap import DataAccessProvider \ No newline at end of file +"""Grompy is a tool to process and access parcel-based satellite observations from GroenMonitor.nl. 
+""" +from .dap import DataAccessProvider +from .cmd import cli + + +__version__ = "1.0.0" \ No newline at end of file diff --git a/grompy/__main__.py b/grompy/cmd.py similarity index 100% rename from grompy/__main__.py rename to grompy/cmd.py diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..1448587 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,21 @@ +[build-system] +requires = ["flit_core >=2,<4"] +build-backend = "flit_core.buildapi" + +[tool.flit.metadata] +module = "grompy" +author = "Allard de Wit" +author-email = "allard.dewit@wur.nl" +home-page = "http://www.earthinformatics.eu" +classifiers = [ "License :: OSI Approved :: MIT License",] +description-file = "README.md" +requires = [ + "click>=7.1", + "geopandas>= 0.8", + "pyyaml>= 5.4", + "sqlalchemy>= 1.4", +] +requires-python=">= 3.6" + +[tool.flit.scripts] +grompy = "grompy:cli" \ No newline at end of file -- GitLab