From f7d41077b1f4c1813c8f7d4311e5337b83100212 Mon Sep 17 00:00:00 2001
From: Allard de Wit <allard.dewit@wur.nl>
Date: Thu, 15 Apr 2021 23:28:09 +0200
Subject: [PATCH] Several changes: - Added documentation on grompy cmd in
 readme.md - Added a pyproject.toml for installing with flit - Added a license

---
 .gitignore                     |   6 +-
 LICENSE                        |  21 +++++
 README.md                      | 150 ++++++++++++++++++++++++++++++++-
 grompy/__init__.py             |   8 +-
 grompy/{__main__.py => cmd.py} |   0
 pyproject.toml                 |  21 +++++
 6 files changed, 202 insertions(+), 4 deletions(-)
 create mode 100644 LICENSE
 rename grompy/{__main__.py => cmd.py} (100%)
 create mode 100644 pyproject.toml

diff --git a/.gitignore b/.gitignore
index fda2af7..25a5b15 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,6 @@
 runfiles/*
-.idea/*.xml
+.idea/
 __pycache__/
-notebooks/.ipynb_checkpoints/*
\ No newline at end of file
+notebooks/.ipynb_checkpoints/*
+/test.py
+dist/
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..c43b5b2
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2021 Allard de Wit
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/README.md b/README.md
index b0c2470..b472466 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,151 @@
 # grompy
 
-Tools for working with parcel-based satellite timeseries from groenmonitor.nl
\ No newline at end of file
+Tools for working with parcel-based satellite time-series observations from groenmonitor.nl
+
+## Introduction
+
+Grompy is a tool to process and access parcel-based satellite observations from GroenMonitor.nl. 
+It was developed because accessing the satellite time-series for a parcel was cumbersome using 
+the existing data structure. Moreover, querying groenmonitor data from AgroDataCube is slow and inflexible
+for researchers who just want to play around with the data. Instead, grompy allows fast and easy access to 
+all parcel observations and can provide simultaneous access to parcel info, optical and radar observations. 
+
+The grompy package consists of two components:
+
+ 1. A commandline tool (i.e. `grompy`) to define/check groenmonitor CSV files and finally load them
+    into (SQLite) database tables.
+ 2. The python package `grompy` which provides `grompy.DataAccessProvider` which can be used to efficiently
+    access the time-series data stored in the database.
+    
+## Command line tool
+
+### initializing
+
+The `grompy` command can be used to load parcel information and groenmonitor CSV files with parcel observations
+into a database structure.
+For this purpose a file `grompy.yaml` is required which provides the information required to process all inputs.
+This includes the paths to the different CSV files, the path to the shapefile with parcel information and
+the URI for the database where the data have to be written. The `grompy.yaml` file is the entry point for all
+other grompy operations as well as the `DataAccessProvider`.
+
+The `grompy.yaml` file can be generated with the command `grompy init <data path>` and for doing so, grompy assumes 
+a certain folder structure which looks like this:
+```
+<data path> /BRP/gewaspercelen_<year>.shp
+            /Optisch/ - CSV files with sentinel2 data
+            /Radar/ - CSV with radar data 
+```
+
+In practice it is most convenient to keep the `grompy.yaml` file together with the data. So change directory
+to the data folder and execute:
+```commandline
+cd <data path>
+grompy init .
+```
+
+The init command creates the `grompy.yaml` and sets the path to the inputs/outputs based on the input
+for `<data path>`. In this case, the current directory `.`. The `grompy.yaml` now looks like this:
+```yaml
+grompy:
+  version: 1.0
+parcel_info:
+  dsn: sqlite:////home/wit015/Data/groenmonitor/parcel_info.db3
+  counts_file: /home/wit015/Data/groenmonitor/Optisch/perceelscount.csv
+  shape_file: /home/wit015/Data/groenmonitor/BRP/gewaspercelen_2019.shp
+  table_name: parcel_info
+datasets:
+  sentinel2_reflectance_values:
+    dsn: sqlite:////home/wit015/Data/groenmonitor/sentinel2_reflectance_values.db3
+    bands:
+      NDVI: /home/wit015/Data/groenmonitor/Optisch/zonal_stats_mean_2019_ADC.csv
+      B02: /home/wit015/Data/groenmonitor/Optisch/zonal_stats_mean_B02_2019_ADC.csv
+      B03: /home/wit015/Data/groenmonitor/Optisch/zonal_stats_mean_B03_2019_ADC.csv
+      B04: /home/wit015/Data/groenmonitor/Optisch/zonal_stats_mean_B04_2019_ADC.csv
+      B05: /home/wit015/Data/groenmonitor/Optisch/zonal_stats_mean_B05_2019_ADC.csv
+      B06: /home/wit015/Data/groenmonitor/Optisch/zonal_stats_mean_B06_2019_ADC.csv
+      B07: /home/wit015/Data/groenmonitor/Optisch/zonal_stats_mean_B07_2019_ADC.csv
+      B08: /home/wit015/Data/groenmonitor/Optisch/zonal_stats_mean_B08_2019_ADC.csv
+      B11: /home/wit015/Data/groenmonitor/Optisch/zonal_stats_mean_B11_2019_ADC.csv
+      B12: /home/wit015/Data/groenmonitor/Optisch/zonal_stats_mean_B12_2019_ADC.csv
+      B8A: /home/wit015/Data/groenmonitor/Optisch/zonal_stats_mean_B8A_2019_ADC.csv
+  sentinel2_reflectance_std:
+    dsn: sqlite:////home/wit015/Data/groenmonitor/sentinel2_reflectance_std.db3
+    bands:
+      NDVI: /home/wit015/Data/groenmonitor/Optisch/zonal_stats_std_2019_ADC.csv
+      B02: /home/wit015/Data/groenmonitor/Optisch/zonal_stats_std_B02_2019_ADC.csv
+...
+```
+The `grompy.yaml` file specifies several sections:
+
+ - A first section describing the grompy version
+ - the 'parcel_info' section providing the path to the shapefile with parcel information as well
+   as a 'counts_file' (see below) which provides for each parcel the satellite pixel count. Finally, the data
+   source name (dsn) for the database to write to, and the name of the output table.
+ - the 'datasets' section which provides the database dsn and the paths to the different CSV files
+   belonging to the dataset. The name of the dataset will be used for the output table in the database
+   while the names of the CSV files will be used as the table column names. Note that the number of datasets
+   can be variable as well as the number of CSV files in a dataset. e.g. you can just add datasets or paths
+   to CSV files within a dataset. This aspect provides a lot of flexibility.
+   
+After the `grompy.yaml` has been created you can go to the next step.
+
+**note**: the counts file is a CSV file with two columns: the `fieldID` column and the `count` column which 
+represents the number of useable pixels in the parcel (excluding border pixels). The counts file can be most
+easily generated by taking one of the input CSV files (which also include the pixel count, but this is ignored
+during loading) and generating the counts file with `awk`:
+```commandline
+awk 'BEGIN{FS=OFS=","}{print $1, $2}' <CSV file> > <counts_file.csv>
+```
+
+### checking
+
+The `grompy.yaml` is a relatively complex input structure and manually checking all paths is rather cumbersome.
+Therefore, grompy can check if the YAML file is OK by executing:
+
+```commandline
+grompy check
+```
+It assumes that the `grompy.yaml` resides in the current directory. Grompy will now read the YAML and carry out 
+several checks, including:
+ - If files exist.
+ - If connections to database can be opened.
+ - If the CSV files of the different datasets all have the same number of lines.
+
+Grompy will display a lot of output on the screen. If everything is fine, the last line will show:
+```commandline
+OK! All inputs seem fine.
+```
+If not, open the YAML file with a text editor and manually correct any problems that are found. Next, rerun
+`grompy check` to see if all errors are gone. Now we are ready for the final step.
+
+**note:** You cannot skip the `grompy check` step because it modifies the YAML file and adds some additional
+information to it. Running `grompy load` on an unchecked `grompy.yaml` will result in grompy asking you to 
+run `grompy check` first.
+
+### loading
+
+The final step is to load the parcel information and satellite observations into the database tables. This can be
+done with the `grompy load` command. Also here grompy assumes that the `grompy.yaml` file resides in the current
+directory. Grompy will now show the following output:
+```commandline
+Start loading parcel information. This will take some time...
+Starting loading of: sentinel1_backscatter
+Starting loading of: sentinel1_coherence
+Starting loading of: sentinel2_reflectance_std
+Starting loading of: sentinel2_reflectance_values
+ |--------------------------------------------------| 0.01% 
+```
+
+In the first stage, grompy will load the parcel information. It uses geopandas to load the shapefile and 
+it will take some time to complete this operation. Next, it will start loading the datasets. Loading of 
+data into the database can easily take several hours depending on the speed of the underlying hardware.
+Moreover, loading of datasets is done in parallel. Grompy will start as many parallel processes as there are 
+datasets defined in the `grompy.yaml` file. Therefore, grompy should only be applied on machines with
+sufficient cores and writing should be done to different database files (in case of SQLite) or a database
+server with sufficient capacity to handle multiple streams of data. Note that grompy can write all information
+into one SQLite database, but write locks on the database will cause delays in processing so this is not
+recommended.
+
+
+## Accessing data processed by grompy
+
diff --git a/grompy/__init__.py b/grompy/__init__.py
index 0aa1114..0f270ca 100644
--- a/grompy/__init__.py
+++ b/grompy/__init__.py
@@ -1,4 +1,10 @@
 # -*- coding: utf-8 -*-
 # Copyright (c) 2021 Wageningen Environmental Research
 # Allard de Wit (allard.dewit@wur.nl), April 2021
-from .dap import DataAccessProvider
\ No newline at end of file
+"""Grompy is a tool to process and access parcel-based satellite observations from GroenMonitor.nl.
+"""
+from .dap import DataAccessProvider
+from .cmd import cli
+
+
+__version__ = "1.0.0"
\ No newline at end of file
diff --git a/grompy/__main__.py b/grompy/cmd.py
similarity index 100%
rename from grompy/__main__.py
rename to grompy/cmd.py
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..1448587
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,21 @@
+[build-system]
+requires = ["flit_core >=2,<4"]
+build-backend = "flit_core.buildapi"
+
+[tool.flit.metadata]
+module = "grompy"
+author = "Allard de Wit"
+author-email = "allard.dewit@wur.nl"
+home-page = "http://www.earthinformatics.eu"
+classifiers = [ "License :: OSI Approved :: MIT License",]
+description-file = "README.md"
+requires = [
+    "click>=7.1",
+    "geopandas>= 0.8",
+    "pyyaml>= 5.4",
+    "sqlalchemy>= 1.4",
+]
+requires-python=">= 3.6"
+
+[tool.flit.scripts]
+grompy = "grompy:cli"
\ No newline at end of file
-- 
GitLab