diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..e310e24afae950d5dc18253d63b561c2f7d6b96d --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +runfiles/* \ No newline at end of file diff --git a/grompy/__init__.py b/grompy/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f1617b917a74ae83dcc355d5341ccebd38e3d2c7 --- /dev/null +++ b/grompy/__init__.py @@ -0,0 +1 @@ +from .grompy import DataAccesProvider \ No newline at end of file diff --git a/grompy/dap.py b/grompy/dap.py new file mode 100644 index 0000000000000000000000000000000000000000..387c288fa336b43238cbcef913b92d3aa82b82da --- /dev/null +++ b/grompy/dap.py @@ -0,0 +1,46 @@ +import sqlalchemy as sa +import pandas as pd + + +class DataAccessProvider: + + def __init__(self, dsn=None, area_gt=None, pixcount_gt=None, provincie=None): + + self.engine = sa.create_engine(dsn) + meta = sa.MetaData(self.engine) + self.tbl_perc_info = sa.Table('perceels_info', meta, autoload=True) + self.s2_observations = sa.Table("s2_observations", meta, autoload=True) + + self.perc_stmt = sa.select([self.tbl_perc_info]) + if area_gt is not None: + self.perc_stmt.append_whereclause(self.tbl_perc_info.c.area == area_gt) + if pixcount_gt is not None: + self.perc_stmt.append_whereclause(self.tbl_perc_info.c.pixcount > pixcount_gt) + if provincie is not None: + self.perc_stmt.append_whereclause(self.tbl_perc_info.c.provincie == provincie) + + s = sa.select([sa.func.count()]).select_from(self.perc_stmt) + self.count = s.execute().fetchone()[0] + + def __iter__(self): + + r = self.perc_stmt.execute() + rows = r.fetchmany(100) + while rows: + for row in rows: + s = sa.select([self.s2_observations], + sa.and_(self.s2_observations.c.fieldID==row.fieldID), + order_by={self.s2_observations.c.day}) + df = pd.read_sql(s, self.engine) + df = df.drop(columns="fieldID") + df.index = pd.to_datetime(df.day) + yield row, df + rows = r.fetchmany() + + + + + + def __len__(self): + return self.count + diff --git a/grompy/sentinel2_observations_template.db3 b/grompy/sentinel2_observations_template.db3 new file mode 100644 index 0000000000000000000000000000000000000000..32e176f8f38b8c6db554196645325293f28a2ab8 Binary files /dev/null and b/grompy/sentinel2_observations_template.db3 differ diff --git a/perceels_info.sql b/perceels_info.sql new file mode 100644 index 0000000000000000000000000000000000000000..4fe367d0f46699a66b2c29877e11ea33bac0130c --- /dev/null +++ b/perceels_info.sql @@ -0,0 +1,21 @@ +CREATE TABLE perceels_info ( + fieldID BIGINT, + year INTEGER, + pixcount INTEGER, + area_ha FLOAT, + cat_gewasc TEXT, + gws_gewasc INTEGER, + gws_gewas TEXT, + provincie TEXT, + gemeente TEXT, + regio TEXT, + pc4 TEXT, + woonplaats TEXT, + waterschap TEXT, + primary key (fieldID) +); + +create index ix_pixcount on perceels_info(pixcount); +create index ix_gewas_area on perceels_info(gws_gewasc, area_ha); +create index ix_gewas_pixcount on perceels_info(gws_gewasc, pixcount); +create index ix_gewas_provincie on perceels_info(gws_gewasc, provincie); diff --git a/process_optisch.py b/process_optisch.py new file mode 100644 index 0000000000000000000000000000000000000000..383554e908a8a12ca1aba424ad32b0508248869b --- /dev/null +++ b/process_optisch.py @@ -0,0 +1,146 @@ +from csv import DictReader +from pathlib import Path +from shutil import copyfile +import time + +import pandas as pd +import sqlalchemy as sa + +cwd = Path(__file__).parent +csv_dir = cwd / "Optisch" +template_db = cwd / "sentinel2_observations_template.db3" +db_s2_observations = cwd / "sentinel2_observations_2019.db3" +csv_s2_observations = cwd / "sentinel2_observations_2019.csv" + +dummy_date = "19000101" + +mean_files = { + "NDVI": csv_dir / "zonal_stats_mean_2019_ADC.csv", + "B02": csv_dir / "zonal_stats_mean_B02_2019_ADC.csv", + "B03": csv_dir / "zonal_stats_mean_B03_2019_ADC.csv", + "B04": csv_dir / "zonal_stats_mean_B04_2019_ADC.csv", + "B05": csv_dir / "zonal_stats_mean_B05_2019_ADC.csv", + "B06": csv_dir / "zonal_stats_mean_B06_2019_ADC.csv", + "B07": csv_dir / "zonal_stats_mean_B07_2019_ADC.csv", + "B08": csv_dir / "zonal_stats_mean_B08_2019_ADC.csv", + "B11": csv_dir / "zonal_stats_mean_B11_2019_ADC.csv", + "B12": csv_dir / "zonal_stats_mean_B12_2019_ADC.csv", + "B8A": csv_dir / "zonal_stats_mean_B8A_2019_ADC.csv", + } + + +# Print iterations progress +def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"): + """ + Call in a loop to create terminal progress bar + @params: + iteration - Required : current iteration (Int) + total - Required : total iterations (Int) + prefix - Optional : prefix string (Str) + suffix - Optional : suffix string (Str) + decimals - Optional : positive number of decimals in percent complete (Int) + length - Optional : character length of bar (Int) + fill - Optional : bar fill character (Str) + printEnd - Optional : end character (e.g. "\r", "\r\n") (Str) + """ + percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total))) + filledLength = int(length * iteration // total) + bar = fill * filledLength + '-' * (length - filledLength) + print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end = printEnd) + # Print New Line on Complete + if iteration == total: + print() + + +def take_first(iter): + for i in iter: + return i + + +def count_lines(files): + """Checks the number of lines in the input CSV files. + + They should all be the same else throw an error. + """ + print("Checking file row counts...") + counts = {} + for band, fname in files.items(): + with open(fname) as my_file: + c = sum(1 for _ in my_file) + counts[fname] = c + print(f" - {fname}: {c}") + + if len(set(counts.values())) > 1: + msg = "CSV files do not have the same number of rows!" + raise RuntimeError(msg) + + return take_first(counts.values()) + +def process_rows(rows): + df = pd.DataFrame() + fieldIDs = [] + for column_name, row in rows.items(): + fieldIDs.append(int(row.pop("field_ID"))) + count = row.pop("count") + recs = [] + for sdate, value in row.items(): + value = float(value) + if value == 0.: + continue + recs.append({"day": sdate, "value": float(value), "band": column_name}) + if not recs: # only zero (null) values for the column + # We add one dummy record to make sure we can create the dataframe properly + recs.append({"day": dummy_date, "value": None, "band": column_name}) + df_tmp = pd.DataFrame(recs) + try: + df_tmp["day"] = pd.to_datetime(df_tmp.day) + except: + pass + df = pd.concat([df, df_tmp]) + df = df.pivot(index="day", columns="band", values="value") + df.reset_index(inplace=True) + + if len(set(fieldIDs)) > 1: + msg = f"FieldIDs are not the same for this row: {fieldIDs}" + raise RuntimeError(msg) + df["fieldID"] = fieldIDs[0] + + ix = (df.day == pd.to_datetime(dummy_date)) + if any(ix): + df = df[~ix] + + return df + + +def write_to_SQLite(mean_csv_readers, nlines): + copyfile(template_db, db_s2_observations) + engine = sa.create_engine(f"sqlite:///{db_s2_observations}") + + printProgressBar(0, nlines-1, prefix='Progress:', suffix='Complete', length=50) + this_line = 0 + while True: + try: + rows = {column_name:next(reader) for column_name, reader in mean_csv_readers.items()} + df = process_rows(rows) + df.to_sql("s2_observations", engine, if_exists="append", index=False) + this_line += 1 + if this_line % 1000 == 0: + printProgressBar(this_line, nlines, prefix='Progress:', suffix='Complete', length=50) + except StopIteration: + break + + + +def main(): + + nlines = count_lines(mean_files) + mean_csv_readers = {} + for column_name, csv_fname in mean_files.items(): + mean_csv_readers[column_name] = DictReader(open(csv_fname)) + t1 = time.time() + write_to_SQLite(mean_csv_readers, nlines) + print(f"Processing {nlines} lines to SQLite took {time.time()-t1} seconds.") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/process_percelen.py b/process_percelen.py new file mode 100644 index 0000000000000000000000000000000000000000..97f3aac236dbfbaece2ef1e300e4170bdda75bfa --- /dev/null +++ b/process_percelen.py @@ -0,0 +1,44 @@ +from pathlib import Path + +import geopandas as gpd +import pandas as pd +import sqlalchemy as sa +import numpy as np + + +def main(): + fname_percelen = Path.cwd() / "BRP" / "gewaspercelen_2019.shp" + # fname_percelen = Path.cwd() / "BRP" / "BRP_10rows.shp" + df = gpd.read_file(fname_percelen) + df["area_ha"] = df.geometry.area/1e4 + df = df.set_index("fieldid") + + fname_counts = Path.cwd() / "Optisch" / "perceelscount.csv" + df_counts = pd.read_csv(fname_counts) + df_counts.set_index("field_ID", inplace=True) + + df["pixcount"] = df_counts.pixcount + + df_out = pd.DataFrame({"fieldID": df.index, + "year": df.year, + "pixcount": df.pixcount, + "area_ha": df.area_ha, + "cat_gewasc": df.cat_gewasc.apply(str), + "gws_gewasc": df.gws_gewasc.astype(np.int32), + "gws_gewas": df.gws_gewas.apply(str), + "provincie": df.provincie.apply(str), + "gemeente": df.gemeente.apply(str), + "regio": df.regio.apply(str), + "pc4": df.PC4.apply(str), + "woonplaats": df.woonplaats.apply(str), + "waterschap": df.waterschap.apply(str), + }) + + fname_percelen_db = Path.cwd() / "sentinel2_observations_2019.db3" + dsn = f"sqlite:///{fname_percelen_db}" + engine = sa.create_engine(dsn) + df_out.to_sql("perceels_info", engine, if_exists="append", index=False) + + +if __name__ == "__main__": + main()