From a6be5c167f0dd02abd8885b1c40e5c90b35f4c7d Mon Sep 17 00:00:00 2001
From: Adriaens <ines.adriaens@wur.nl>
Date: Wed, 7 Dec 2022 14:02:03 +0100
Subject: [PATCH] finish dataset preparation HD sheep

---
 .../harmen/preprocess_sheepdata.py            | 62 ++++++++++++++++---
 1 file changed, 53 insertions(+), 9 deletions(-)

diff --git a/datapreparation/harmen/preprocess_sheepdata.py b/datapreparation/harmen/preprocess_sheepdata.py
index 9ec4541..c114153 100644
--- a/datapreparation/harmen/preprocess_sheepdata.py
+++ b/datapreparation/harmen/preprocess_sheepdata.py
@@ -35,31 +35,75 @@ for f in os.listdir(path):
     new = new.drop(new.index.values[-1],axis=0)
     new["t"] = new.index.values
     new["at"] = pd.to_datetime(new["datetime"],format = "%d-%m-%Y %H:%M:%S.%f")
+    """
+    28/10/2022 : start recording 16:30
+    04/11/2022 : remove data between 13:00 and 14:00 - keep only data from 1 accelerometer
+    11/11/2022 : remove data between 14:00 and 16:00 - keep only data from 1 accelerometer
+    """
+    if "wk1_20221028" in f:
+        new = new.loc[new["at"].dt.hour >= 17,:].reset_index(drop=1)
+    elif "wk1_20221104" in f:
+        new = new.loc[new["at"].dt.hour < 13,:].reset_index(drop=1)
+        # fill the removed hour (13:00-14:00) with NaN rows: 60*25*60 samples at 40 ms spacing
+        idx = np.linspace(new.index.values[-1]+1,new.index.values[-1]+60*25*60,new.index.values[-1]+60*25*60-new.index.values[-1]).astype(int)
+        refdate = pd.to_datetime("2022/11/04 13:00:00.000",format = "%Y/%m/%d %H:%M:%S.%f")
+        df = pd.DataFrame([],index=idx,columns=new.columns)
+        df["at"] = refdate + np.linspace(0,len(idx)-1,len(idx))* pd.Timedelta(40,"milli")
+        df["t"] = idx
+        df["datetime"] = df["at"].dt.strftime("%d-%m-%Y %H:%M:%S.%f")
+        # add "gap" to new
+        new = pd.concat([new,df],axis = 0)
+    elif "wk2_20221104" in f:
+        new = new.loc[new["at"].dt.hour >= 14,:].reset_index(drop=1)
+    elif "wk2_20221111" in f:
+        new = new.loc[new["at"].dt.hour < 14,:].reset_index(drop=1)
+        # fill the removed two hours (14:00-16:00) with NaN rows: 60*25*60*2 samples at 40 ms spacing
+        idx = np.linspace(new.index.values[-1]+1,new.index.values[-1]+60*25*60*2,new.index.values[-1]+60*25*60*2-new.index.values[-1]).astype(int)
+        refdate = pd.to_datetime("2022/11/11 14:00:00.000",format = "%Y/%m/%d %H:%M:%S.%f")
+        df = pd.DataFrame([],index=idx,columns=new.columns)
+        df["at"] = refdate + np.linspace(0,len(idx)-1,len(idx))* pd.Timedelta(40,"milli")
+        df["t"] = idx
+        df["datetime"] = df["at"].dt.strftime("%d-%m-%Y %H:%M:%S.%f")
+        # add "gap" to new
+        new = pd.concat([new,df],axis = 0)
+    elif "wk3_20221111" in f:
+        new = new.loc[new["at"].dt.hour >= 16,:].reset_index(drop=1)
+    elif "wk3_20221124" in f:
+        new = new.loc[new["at"].dt.hour < 10,:].reset_index(drop=1)
+    # preprocess
     new["win"] = np.floor(new["t"]/25)
-    test2 = new["win"].drop_duplicates()
-    test3 = new.iloc[test2.index.values,:]
-    test3.index = test3.win
-    accum = new[["win","acc_x","acc_y","acc_z"]].groupby(by="win").mean().reset_index()
-    test = accum.join(test3[["at","win"]], on= "win", rsuffix = "_m")
-    test = test.drop(columns = "win_m")
+    # prepare frame with datetimes aggregated to one second
+    test2 = new["win"].drop_duplicates() # keep first
+    test3 = new.iloc[test2.index.values,:]  # select data of first
+    test3.index = test3.win    # change index to win
+    accum = new[["win","acc_x","acc_y","acc_z"]].groupby(by="win").mean().reset_index()  # calculate mean activity
+    test = accum.join(test3[["at","win"]], on= "win", rsuffix = "_m")  # join activity with date in at
+    test = test.drop(columns = "win_m") 
     data = pd.concat([data,test])
 
-
+# sort data based on date + add id
 data = data.sort_values(by = "at").reset_index(drop=1)
 data["id"] = 1
+
+# smooth with rolling median
 data["acc_xm"] = data["acc_x"].rolling(60).median() 
 data["acc_ym"] = data["acc_y"].rolling(60).median() 
 data["acc_zm"] = data["acc_z"].rolling(60).median() 
 
-
+# plot and save
 data["day"] = data["at"].dt.day
+data["month"] = data["at"].dt.month
 days = data["day"].drop_duplicates()
 for day in days:
     print(day)
-    fn = "sheep1" + "_day" + str(day) + ".png"
+    month = data.loc[data["day"]==day,"at"].dt.month.drop_duplicates().reset_index(drop=1)
+    fn = "sheep1" + "_2022" + str(month[0]) + str(day) + ".png"
     fig,ax = plt.subplots(nrows=1,ncols=1,figsize = (20,10))
     ax.plot(data.loc[data["day"]==day,"at"],data.loc[data["day"]==day,["acc_x","acc_y","acc_z"]])
     ax.plot(data.loc[data["day"]==day,"at"],data.loc[data["day"]==day,["acc_xm","acc_ym","acc_zm"]],
             color = "k", linewidth = 0.5)
+    ax.set_title("sheep 1 - " + str(day) + "/" + str(month[0]) + '/2022')
+    ax.set_xlabel("time")
+    ax.set_ylabel("acceleration in m/s²")
     plt.savefig(svpath + "\\" + fn)
     plt.close()
-- 
GitLab