Commit 61a02b7c authored by Ines Adriaens

Merge branch 'data' into 'main'

Data

See merge request !3
parents a58cfd59 3e5c661b
**date**: November 21, 2022
**author**: Ines Adriaens
## Overview of tools for changepoint analysis
**Ruptures**
------------
[documentation](https://centre-borelli.github.io/ruptures-docs/)
[user guide](https://centre-borelli.github.io/ruptures-docs/user-guide/)
[code](https://github.com/deepcharles/ruptures)
[manuscript](http://www.laurentoudre.fr/publis/TOG-SP-19.pdf)
- exact and approximate changepoint detection for segmentation of non-stationary signals;
- PELT available (Pruned Exact Linear Time)
- different cost functions (mean, median, mean/cov, linear, etc) available
- documentation / explanations are somewhat limited
- works with multivariate data, but this is not well documented
- still actively maintained
**sdt-python**
--------------
[documentation](https://schuetzgroup.github.io/sdt-python/index.html)
[code](https://github.com/schuetzgroup/sdt-python)
[zenodo, doi](https://zenodo.org/record/7078957#.Y3uN1XbMKUk)
- developed for processing of fluorescence microscopic images
- more restricted in scope (tailored to fluorescence-microscopy workflows)
- PELT available, also Bayesian offline and online methods
- last update 2020
**Changepy**
------------
[code](https://github.com/ruipgil/changepy)
[R package](https://github.com/rkillick/changepoint/)
- only PELT implemented
- last update in 2017
- Python gateway to the R changepoint package (rkillick)
- limited number of cost functions available, incl. mean, var, meanvar, Poisson and exponential
--------------------------------------------------------------
### Choice
**ruptures**
\ No newline at end of file
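A minimal usage sketch with ruptures and PELT; the signal, cost model and penalty below are illustrative, not project settings:

```python
import numpy as np
import ruptures as rpt

# simulated piecewise-constant signal with two changepoints (illustrative only)
rng = np.random.default_rng(0)
signal = np.concatenate([rng.normal(0, 1, 200),
                         rng.normal(3, 1, 200),
                         rng.normal(0, 1, 200)])

# PELT search with an L2 (mean-shift) cost; min_size keeps changepoints apart
algo = rpt.Pelt(model="l2", min_size=5).fit(signal)
breakpoints = algo.predict(pen=10)  # penalty per added changepoint
print(breakpoints)                  # e.g. [200, 400, 600]
```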
......@@ -56,16 +56,16 @@ path_sql = os.path.join(
# set path to results folder
path_res = os.path.join("C:","/Users","adria036","OneDrive - Wageningen University & Research",
"iAdriaens_doc","Projects","iAdriaens","bait","scripts",
"bait","data","nlas_ines/"
"iAdriaens_doc","Projects","iAdriaens","bait","data",
"nlas_ines/"
)
# set file name for cowid information
fn = "\Copy_of_Sewio_tags.xlsx"
# set start and end dates
startdate = date(2022,7,16)
enddate = date(2022,7,16)
startdate = date(2022,7,1)
enddate = date(2022,7,10)
# set parameters for the imputation and interpolation
win_med = 31
......@@ -120,36 +120,44 @@ for dd in range(t_interval.days+1):
df = pd.concat([cowids,nsec],axis = 1).drop_duplicates() # gives the indices of unique first
del df["relsec"], df["cowid"]
# gap in acceleration data
data["gap"] = data["at"].diff()
test = data["gap"]*1000
data["gap"] = test.dt.seconds/1000
del test
data.loc[data["gap"] < 0,"gap"] = np.nan # set to nan when different cowid
# innerjoin for selection step - unique seconds in the dataset
data = pd.concat([data,df],axis = 1, join = 'inner')
data = data.sort_values(by = ["cowid","at"]).reset_index(drop=1)
data_pos = pd.concat([data,df],axis = 1, join = 'inner')
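# keep the original row index as observation ID (OID) to merge results back onto the acceleration data later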
data_pos["OID"] = data_pos.index.values
data_pos = data_pos.sort_values(by = ["cowid","at"]).reset_index(drop=1)
# unfiltered data - one per second or less, all cows included
# calculate gaps based on numeric time relative to start of dataset
data["gap"] = data["relsec"].diff()
data.loc[data["gap"] < 0,"gap"] = np.nan # set to nan when different cowid
data_pos["gap"] = data_pos["relsec"].diff()
data_pos.loc[data_pos["gap"] < 0,"gap"] = np.nan # set to nan when different cowid
#--------------------------edit barn edges---------------------------------
# clip coordinates outside the barn edges: barn 72 keeps x in [0, 10.6], barn 70 keeps x in [21, 32]
data.loc[(data["X"] < 15),"X"] = filter_barn_edges( \
copy.copy(data.loc[(data["X"] < 15),"X"]),0,10.6) #barn 72
data.loc[(data["X"] >= 15),"X"] = filter_barn_edges( \
copy.copy(data.loc[(data["X"] >= 15),"X"]),21,32) # barn 70
data_pos.loc[(data_pos["X"] < 15),"X"] = filter_barn_edges( \
copy.copy(data_pos.loc[(data_pos["X"] < 15),"X"]),0,10.6) #barn 72
data_pos.loc[(data_pos["X"] >= 15),"X"] = filter_barn_edges( \
copy.copy(data_pos.loc[(data_pos["X"] >= 15),"X"]),21,32) # barn 70
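# NOTE: hedged sketch only - filter_barn_edges is defined elsewhere in this repo;
# judging from the comments above it clips coordinates to the given edges, roughly:
#   series.clip(lower=edge_low, upper=edge_high)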
# set data outside edges to edges y < -18 or y > -2.5
data["y"] = filter_barn_edges(copy.copy(data["y"]),-18,-2.5)
data_pos["y"] = filter_barn_edges(copy.copy(data["y"]),-18,-2.5)
#data["X"].hist()
#data["y"].hist()
# delete rows without cowid
data = data.loc[data["cowid"]>0,:]
data_pos = data_pos.loc[data_pos["cowid"]>0,:]
#-----------------------filter & interpolate per cow-----------------------
# filter the (X,y) time series with a median filter and interpolate
# cows in the barn
cows = data["cowid"].drop_duplicates().sort_values().reset_index(drop=1)
cows = data_pos["cowid"].drop_duplicates().sort_values().reset_index(drop=1)
cows = pd.DataFrame(cows)
# loop over all data per cowid and select (x,y,t) data to filter
......@@ -164,9 +172,10 @@ for dd in range(t_interval.days+1):
"area" : []})
for i in range(0,len(cows)):
# select data
cow_t = data.loc[(data["cowid"] == cows["cowid"][i]),"relsec"]
cow_x = data.loc[(data["cowid"] == cows["cowid"][i]),"X"]
cow_y = data.loc[(data["cowid"] == cows["cowid"][i]),"y"]
cow_OID = data_pos.loc[(data_pos["cowid"] == cows["cowid"][i]),["OID","relsec"]]
cow_t = data_pos.loc[(data_pos["cowid"] == cows["cowid"][i]),"relsec"]
cow_x = data_pos.loc[(data_pos["cowid"] == cows["cowid"][i]),"X"]
cow_y = data_pos.loc[(data_pos["cowid"] == cows["cowid"][i]),"y"]
# filter data
if len(cow_t) > 10:
......@@ -181,8 +190,7 @@ for dd in range(t_interval.days+1):
cow_t = pd.concat([pd.Series(cow_t.iloc[0]//86400*86400),cow_t], axis=0)
cow_x = pd.concat([pd.Series(np.nan),cow_x], axis=0)
cow_y = pd.concat([pd.Series(np.nan),cow_y], axis=0)
# TODO: add accelerations
# filter
df,x,y = filter_interp(cow_t,cow_x,cow_y,win_med,gap_thres)
......@@ -199,6 +207,9 @@ for dd in range(t_interval.days+1):
df["cowid"] = cows["cowid"][i]
df["date"] = str(startdate+dd)
df = df[["cowid","date","t","gap","X","y","xnew","ynew","area"]]
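# cast t to int so it matches relsec when merging the observation IDs back in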
df["t"] = df["t"].astype(int)
df = pd.merge(df,cow_OID,how="outer",left_on = "t",right_on = "relsec")
df.loc[df["xnew"].isna(),"area"] = np.nan
else:
df = pd.DataFrame({"cowid":cows["cowid"][i],
"date":str(startdate+dd),
......@@ -208,15 +219,24 @@ for dd in range(t_interval.days+1):
"y" : pd.Series(np.arange(0,86400,1)*np.nan),
"xnew" : pd.Series(np.arange(0,86400,1)*np.nan),
"ynew" : pd.Series(np.arange(0,86400,1)*np.nan),
"area" : pd.Series(np.arange(0,86400,1)*np.nan)})
"area" : pd.Series(np.arange(0,86400,1)*np.nan),
"relsec" : pd.Series(np.arange(0,86400,1)*np.nan),
"OID" : pd.Series(np.arange(0,86400,1)*np.nan)})
# concat dataframes all cows together
result = pd.concat([result,df])
# concatenate result with the acceleration data on "OID"
data["OID"] = data.index.values
result = pd.merge(data,result[["t","xnew","ynew","area","relsec","OID"]],
how="outer",
left_on = ["OID","relsec"],
right_on = ["OID","relsec"])
# save results
datestr = str(startdate+dd)
datestr = datestr.replace("-","")
fns = datestr + '_bec3.txt'
fns = datestr + '_bait.txt'
result.to_csv(path_res + fns, index = False)
# clear workspace and memory
......
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 15 09:06:43 2022
@author: adria036
"""
import os
os.chdir(r"C:\Users\adria036\OneDrive - Wageningen University & Research\iAdriaens_doc\Projects\iAdriaens\bait\scripts\bait\datapreparation\ines")
#%% import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# define functions
def accel_act(df,window):
"""
combine the accelerations in the x, y and z dimensions into a general activity
parameter.
Step 1 : rescale values in each direction
Step 2 : set level of activity based on amplitude in a preset window
Step 3 : combine the activity levels of the 3 directions
df: pandas DataFrame with columns 'acc_x', 'acc_y', 'acc_z', 'id', 'at'
window: window size in seconds over which activity is accumulated
"""
svpath = os.path.join("C:","/Users","adria036",
"OneDrive - Wageningen University & Research","iAdriaens_doc",
"Projects","iAdriaens","bait","results")
if "cowid" in df.columns:
df = df.rename(columns = {"cowid":"id"})
namechange = 1
else:
namechange = 0
# calculate activity
ids = df["id"].drop_duplicates().reset_index(drop=1)
output = pd.DataFrame([])
for subj in ids:
print("Calculating activity for subj = " + str(int(subj)))
sset = df.loc[(df["id"]==subj) & \
(~df["acc_x"].isna()) & \
(~df["acc_y"].isna()) & \
(~df["acc_z"].isna()),:].copy()
# standardise and scale measurements
sset["acc_xs"] = (sset["acc_x"]-sset["acc_x"].min())/(sset["acc_x"].max()-sset["acc_x"].min())
sset["acc_ys"] = (sset["acc_y"]-sset["acc_y"].min())/(sset["acc_y"].max()-sset["acc_y"].min())
sset["acc_zs"] = (sset["acc_z"]-sset["acc_z"].min())/(sset["acc_z"].max()-sset["acc_z"].min())
# set datetime to days, seconds, microseconds
sset["d"] = sset["at"].dt.day-sset["at"].dt.day.min()
sset["s"] = sset["at"].dt.hour*3600 \
+ sset["at"].dt.minute*60 \
+ sset["at"].dt.second
# sset["ms"] = sset["at"].dt.microsecond/100
# accumulated activity calculated in window: set window index
sset["win"] = np.floor(sset["s"]/window)
# accumulate activity in set windowsize - operation = std
act = sset[["d","win","acc_xs","acc_ys","acc_zs"]].groupby(by=["d","win"]).std()
act["acc_xs"] = act["acc_xs"].fillna(act["acc_xs"].mean()) # if only 1 measurement std= nan
act["acc_ys"] = act["acc_ys"].fillna(act["acc_ys"].mean()) # replace by mean std
act["acc_zs"] = act["acc_zs"].fillna(act["acc_zs"].mean())
act["activity"] = (act["acc_xs"]+act["acc_ys"]+act["acc_zs"])/3
act = act.reset_index()
sset = pd.merge(sset,act[["d","win","activity"]],on = ["d","win"])
# plot activity, standardized activity and time series of result
# fig,ax = plt.subplots(nrows=1,ncols=3,figsize = (20,10))
# ax[0].hist(sset["acc_x"],density=True,color = "g")
# ax[1].hist(sset["acc_y"],density=True,color = "g")
# ax[2].hist(sset["acc_z"],density=True,color = "g")
# fig,ax = plt.subplots(nrows=1,ncols=3,figsize = (20,10))
# ax[0].hist(sset["acc_xs"],density=True,color = "g")
# ax[1].hist(sset["acc_ys"],density=True,color = "g")
# ax[2].hist(sset["acc_zs"],density=True,color = "g")
fig,ax = plt.subplots(nrows=1,ncols=1,figsize = (20,10))
T = 0
for d in act["d"].drop_duplicates():
ax.plot(act.loc[act["d"]==d,"win"],act.loc[act["d"]==d,"activity"]+T)
T+=0.5
ax.set_title("activity for subject " + str(int(subj)))
ax.set_xlabel("time of the day, step = " + str(window) + "s")
ax.set_ylabel("activity per day")
plt.savefig(svpath + "\\activity_" + str(int(subj))+ ".tif")
# combine data from all cows
output = pd.concat([output,sset],axis = 0)
# change name back to initial name
if namechange == 1:
output = output.rename(columns = {"id":"cowid"})
return output
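# Minimal usage sketch of accel_act on synthetic data (illustrative only, not part
# of the original pipeline); assumes the column layout documented in the docstring.
# rng = np.random.default_rng(1)
# demo = pd.DataFrame({"cowid" : 1,
#                      "at" : pd.date_range("2022-07-01", periods=600, freq="s"),
#                      "acc_x" : rng.normal(size=600),
#                      "acc_y" : rng.normal(size=600),
#                      "acc_z" : rng.normal(size=600)})
# demo_act = accel_act(demo, window=60)   # activity accumulated per 60 s window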
#%% prepare dataset 1: accelerations (x,y,z) DC cows
path = os.path.join("C:","/Users","adria036",
"OneDrive - Wageningen University & Research","iAdriaens_doc",
"Projects","iAdriaens","bait","data","nlas_ines")
svpath = os.path.join("C:","/Users","adria036",
"OneDrive - Wageningen University & Research","iAdriaens_doc",
"Projects","iAdriaens","bait","results")
fn = "\\acceleration_dairy_campus.tif"
# read data
data = pd.DataFrame([])
for f in os.listdir(path) :
if ("bait" in f):
print(f)
new = pd.read_csv(path+"\\"+f,usecols=[0,4,7,8,9,10,11,14,16,18,19,20])
new["at"] = pd.to_datetime(new["at"],format = "%Y-%m-%d %H:%M:%S.%f%z")
# select animals list of cows with single sensor in barn 72 (no replacements)
cows = pd.DataFrame([3152,1681,821,3106,419,3089,7387,1091,605],columns = ["cowid"])
new = pd.merge(new,cows,how="inner")
# add to previous dataset
data = pd.concat([data,new],axis=0)
del new, path, cows, f
# visualisation of data: acceleration in x,y,z
plt.rc('font', size=14)
fig,ax = plt.subplots(nrows=1,ncols=1,figsize = (20,18))
T = 0
yt = [T]
for ID in data["cowid"].drop_duplicates():
ax.plot(data.loc[data["cowid"]==ID,"t"],data.loc[data["cowid"]==ID,"acc_x"]+T,linewidth=0.25,color="steelblue")
ax.plot(data.loc[data["cowid"]==ID,"t"],data.loc[data["cowid"]==ID,"acc_y"]+T,linewidth=0.25,color="teal")
ax.plot(data.loc[data["cowid"]==ID,"t"],data.loc[data["cowid"]==ID,"acc_z"]+T,linewidth=0.25,color="limegreen")
ax.plot(data.loc[data["cowid"]==ID,"t"],T*np.ones(len(data.loc[data["cowid"]==ID,"t"])),linewidth=0.25,color="k")
T+=data[["acc_x","acc_y","acc_z"]].max().max()+1
yt.append(round(T))
ax.set_title("activity [m/s²]")
ax.set_xlabel("t [s]")
ax.set_ylabel("acceleration per cowid")
ax.legend(["acc_x","acc_y","acc_z"])
IDS = [str(ID) for ID in data["cowid"].drop_duplicates().astype(int).to_list()]
ax.set_yticks(yt[:-1],labels=IDS)
plt.savefig(svpath + fn)
del svpath, fn, fig, ax
# calculate general activity level
df = data[["cowid","acc_x","acc_y","acc_z","at"]]
window = 60 # in seconds
df2 = accel_act(df,window)
# add activity to data
DCdata = pd.merge(data,df2[["cowid","at","win","activity"]],on = ["cowid","at"],how="outer")
del window, df, df2, data
# save to csv
path = os.path.join("C:","/Users","adria036",
"OneDrive - Wageningen University & Research","iAdriaens_doc",
"Projects","iAdriaens","bait","data","preprocessed")
DCdata.to_csv(path+"\\DCdata.txt")
del path
# reformat as preferred for segmentation module (strip)
#%% prepare dataset 2: accelerations (x,y,z)
path = os.path.join("C:","/Users","adria036",
"OneDrive - Wageningen University & Research","iAdriaens_doc",
"Projects","iAdriaens","bait","data","nlas_harmen")
svpath = os.path.join("C:","/Users","adria036",
"OneDrive - Wageningen University & Research","iAdriaens_doc",
"Projects","iAdriaens","bait","results")
fn = "\\acceleration_harmen.tif"
refdate = pd.to_datetime("today")
# read data
data = pd.DataFrame([])
ID = 1
for f in os.listdir(path):
print(f)
# read data and reformat
new = pd.read_csv(path+"\\"+f,header = None)
new.columns = ["acc_x","acc_y","acc_z"]
new["id"] = ID
ID += 1
new["t"] = range(0,len(new))
ts = np.arange(0,len(new))* pd.Timedelta(40,"milli")
new["at"] = refdate + ts
new = new[["id","at","t","acc_x","acc_y","acc_z"]]
# add to previous dataset
data = pd.concat([data,new],axis=0)
del f, new, ID, ts, refdate, path
# visualise data - acceleration in x,y,z
fig,ax = plt.subplots(nrows=1,ncols=1,figsize = (20,10))
T = 0
for ID in data["id"].drop_duplicates():
ax.plot(data.loc[data["id"]==ID,"t"],data.loc[data["id"]==ID,"acc_x"]+T,linewidth=0.25,color="steelblue")
ax.plot(data.loc[data["id"]==ID,"t"],data.loc[data["id"]==ID,"acc_y"]+T,linewidth=0.25,color="teal")
ax.plot(data.loc[data["id"]==ID,"t"],data.loc[data["id"]==ID,"acc_z"]+T,linewidth=0.25,color="limegreen")
T+=data[["acc_x","acc_y","acc_z"]].max().max()+1
ax.set_title("activity")
ax.set_xlabel("t")
ax.set_ylabel("acceleration")
ax.legend(["acc_x","acc_y","acc_z"])
plt.savefig(svpath + fn)
del T, fig,ax
# calculate general activity level
df = data[["id","acc_x","acc_y","acc_z","at"]]
window = 60 # in seconds
df2 = accel_act(df,window)
# add activity to data
HDdata = pd.merge(data,df2[["id","at","win","activity"]],on = ["id","at"],how="outer")
# clear workspace
del window, df, df2, data
del svpath, fn
# save to csv
path = os.path.join("C:","/Users","adria036",
"OneDrive - Wageningen University & Research","iAdriaens_doc",
"Projects","iAdriaens","bait","data","preprocessed")
HDdata.to_csv(path+"\\HDdata.txt")
del path
#%% reformat as preferred for segmentation module (strip)
\ No newline at end of file
......@@ -446,8 +446,9 @@ def read_ids(path, fn):
# fn = "\Sewio_tags.xlsx"
# read tagIDs and tags2cow from excel file
tag2cow = pd.read_excel(path+fn, sheet_name = "Tag2Cow")
tagIDs = pd.read_excel(path+fn, sheet_name = "Tag_IDs")
tag2cow = pd.read_excel(path+fn, sheet_name = "Tag2Cow", skiprows = [0,1])
tagIDs = pd.read_excel(path+fn, sheet_name = "Tag_IDs", skiprows = [0,1,2,3])
tagIDs = tagIDs.rename(columns = {'Sewio_alias' : 'Sewio_tagnr'})
# combine tag with tag id and inspect
tags = tag2cow.merge(tagIDs)
tags.head(20)
......
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 15 09:06:43 2022
@author: adria036
"""
import os
os.chdir(r"C:\Users\adria036\OneDrive - Wageningen University & Research\iAdriaens_doc\Projects\iAdriaens\bait\scripts\bait\datapreparation\ines")
#%% import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import date,timedelta
# define functions
def accel_act(df,window):
"""
combine the accelerations in the x, y and z dimensions into a general activity
parameter.
Step 1 : rescale values in each direction
Step 2 : set level of activity based on amplitude in a preset window
Step 3 : combine the activity levels of the 3 directions
df: pandas DataFrame with columns 'acc_x', 'acc_y', 'acc_z', 'id', 'at'
window: window size in seconds over which activity is accumulated
"""
if "cowid" in df.columns:
df = df.rename(columns = {"cowid":"id"})
namechange = 1
else:
namechange = 0
# calculate activity
ids = df["id"].drop_duplicates().reset_index(drop=1)
output = pd.DataFrame([])
for subj in ids:
print("Calculating activity for subj = " + str(int(subj)))
sset = df.loc[(df["id"]==subj) & \
(~df["acc_x"].isna()) & \
(~df["acc_y"].isna()) & \
(~df["acc_z"].isna()),:].copy()
# standardise and scale measurements
sset["acc_xs"] = (sset["acc_x"]-sset["acc_x"].min())/(sset["acc_x"].max()-sset["acc_x"].min())
sset["acc_ys"] = (sset["acc_y"]-sset["acc_y"].min())/(sset["acc_y"].max()-sset["acc_y"].min())
sset["acc_zs"] = (sset["acc_z"]-sset["acc_z"].min())/(sset["acc_z"].max()-sset["acc_z"].min())
# set datetime to days, seconds, microseconds
sset["d"] = sset["at"].dt.day-sset["at"].dt.day.min()
sset["s"] = sset["at"].dt.hour*3600 \
+ sset["at"].dt.minute*60 \
+ sset["at"].dt.second
# sset["ms"] = sset["at"].dt.microsecond/100
# accumulated activity calculated in window: set window index
sset["win"] = np.floor(sset["s"]/window)
# accumulate activity in set windowsize - operation = std
act = sset[["d","win","acc_xs","acc_ys","acc_zs"]].groupby(by=["d","win"]).std()
act["acc_xs"] = act["acc_xs"].fillna(act["acc_xs"].mean()) # if only 1 measurement std= nan
act["acc_ys"] = act["acc_ys"].fillna(act["acc_ys"].mean()) # replace by mean std
act["acc_zs"] = act["acc_zs"].fillna(act["acc_zs"].mean())
act["activity"] = (act["acc_xs"]+act["acc_ys"]+act["acc_zs"])/3
act = act.reset_index()
sset = pd.merge(sset,act[["d","win","activity"]],on = ["d","win"])
# plot activity, standardized activity and time series of result
fig,ax = plt.subplots(nrows=1,ncols=3,figsize = (20,10))
ax[0].hist(sset["acc_x"],density=True,color = "g")
ax[1].hist(sset["acc_y"],density=True,color = "g")
ax[2].hist(sset["acc_z"],density=True,color = "g")
fig,ax = plt.subplots(nrows=1,ncols=3,figsize = (20,10))
ax[0].hist(sset["acc_xs"],density=True,color = "g")
ax[1].hist(sset["acc_ys"],density=True,color = "g")
ax[2].hist(sset["acc_zs"],density=True,color = "g")
fig,ax = plt.subplots(nrows=1,ncols=1,figsize = (20,10))
T = 0
for d in act["d"].drop_duplicates():
ax.plot(act.loc[act["d"]==d,"win"],act.loc[act["d"]==d,"activity"]+T)
T+=0.5
ax.set_title("activity for subject " + str(subj))
ax.set_xlabel("time of the day, window = " + str(window) + "s")
ax.set_ylabel("activity")
# combine data from all cows
output = pd.concat([output,sset],axis = 0)
# change name back to initial name
if namechange == 1:
output = output.rename(columns = {"id":"cowid"})
return output
#%% prepare dataset 1: accelerations (x,y,z) DC cows
path = os.path.join("C:","/Users","adria036",
"OneDrive - Wageningen University & Research","iAdriaens_doc",
"Projects","iAdriaens","bait","data","nlas_ines")
# read data
data = pd.DataFrame([])
for f in os.listdir(path) :
if ("bait" in f):
print(f)
new = pd.read_csv(path+"\\"+f,usecols=[0,4,7,8,9,10,11,14,16,18,19,20])
new["at"] = pd.to_datetime(new["at"],format = "%Y-%m-%d %H:%M:%S.%f%z")
# select animals list of cows with single sensor in barn 72 (no replacements)
cows = pd.DataFrame([3152,1681,821,3106,419,3089,7387,1091,605],columns = ["cowid"])
new = pd.merge(new,cows,how="inner")
# add to previous dataset
data = pd.concat([data,new],axis=0)
del new, path, cows, f
# visualisation of data: acceleration in x,y,z
fig,ax = plt.subplots(nrows=1,ncols=1,figsize = (20,18))
T = 0
for ID in data["cowid"].drop_duplicates():
ax.plot(data.loc[data["cowid"]==ID,"t"],data.loc[data["cowid"]==ID,"acc_x"]+T,linewidth=0.25,color="steelblue")
ax.plot(data.loc[data["cowid"]==ID,"t"],data.loc[data["cowid"]==ID,"acc_y"]+T,linewidth=0.25,color="teal")
ax.plot(data.loc[data["cowid"]==ID,"t"],data.loc[data["cowid"]==ID,"acc_z"]+T,linewidth=0.25,color="limegreen")
T+=data[["acc_x","acc_y","acc_z"]].max().max()+1
ax.set_title("activity")
ax.set_xlabel("t")
ax.set_ylabel("acceleration")
ax.legend(["acc_x","acc_y","acc_z"])
# calculate general activity level
df = data[["cowid","acc_x","acc_y","acc_z","at"]]
window = 60 # in seconds
df2 = accel_act(df,window)
# add activity to data
data = pd.merge(data,df2[["cowid","at","activity"]],on = ["cowid","at"],how="outer")
del window, df, df2
# reformat as preferred for segmentation module (strip)
#%% prepare dataset 2: accelerations (x,y,z)
path = os.path.join("C:","/Users","adria036",
"OneDrive - Wageningen University & Research","iAdriaens_doc",
"Projects","iAdriaens","bait","data","nlas_harmen")
refdate = pd.to_datetime("today")
# read data
data = pd.DataFrame([])
ID = 1
for f in os.listdir(path):
print(f)
# read data and reformat
new = pd.read_csv(path+"\\"+f,header = None)
new.columns = ["acc_x","acc_y","acc_z"]
new["id"] = ID
ID += 1
new["t"] = range(0,len(new))
ts = np.arange(0,len(new))* pd.Timedelta(40,"milli")
new["at"] = refdate + ts
new = new[["id","at","t","acc_x","acc_y","acc_z"]]
# add to previous dataset
data = pd.concat([data,new],axis=0)
del f, new, path, ID, ts, refdate
# visualise data - acceleration in x,y,z
fig,ax = plt.subplots(nrows=1,ncols=1,figsize = (20,10))
T = 0
for ID in data["id"].drop_duplicates():
ax.plot(data.loc[data["id"]==ID,"t"],data.loc[data["id"]==ID,"acc_x"]+T,linewidth=0.25,color="steelblue")
ax.plot(data.loc[data["id"]==ID,"t"],data.loc[data["id"]==ID,"acc_y"]+T,linewidth=0.25,color="teal")
ax.plot(data.loc[data["id"]==ID,"t"],data.loc[data["id"]==ID,"acc_z"]+T,linewidth=0.25,color="limegreen")
T+=data[["acc_x","acc_y","acc_z"]].max().max()+1
ax.set_title("activity")
ax.set_xlabel("t")
ax.set_ylabel("acceleration")
ax.legend(["acc_x","acc_y","acc_z"])
del T, fig,ax
# calculate general activity level
df = data[["id","acc_x","acc_y","acc_z","at"]]
window = 60 # in seconds
df2 = accel_act(df,window)
# add activity to data
data = pd.merge(data,df2[["id","at","activity"]],on = ["id","at"],how="outer")
# clear workspace
del window, df, df2
#%% reformat as preferred for segmentation module (strip)
\ No newline at end of file
# packages to be added
ruptures=1.1.7
pandas=1.4.3
matplotlib=3.5.2
numpy=1.22.3
openpyxl=3.0.10
python-snappy=0.6.0
spyder=5.1.5
\ No newline at end of file
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 22 13:56:40 2022
@author: adria036
-------------------------------------------------------------------------------
Segmentation module of BAIT
- uses the ruptures package;
- aims at segmenting one or multiple time series of (sensor) data into
parts with different statistical characteristics. The features of each
segment can then be used to classify a period of time into different
behaviours;
- depending on the behaviour of interest, a cost function for the segmentation
is chosen; based on this choice, segmentation of the TS is done based on, for
example, the mean, standard deviation, linear trend, etc. of each segment;
- the input can be a single TS of measurements, or the cost can be computed
over several TS together (e.g. x, y, z), potentially with different
weights;
- we recommend PELT (Pruned Exact Linear Time) as the search algorithm
for the changepoints. PELT has an expected linear computational cost, i.e.
linear in the number of measurements in the time series;
- because of the cost of the search algorithm, longer series take more
time to segment;
- typically the exact number of changes remains unknown, but it can be
estimated roughly from knowledge of the specific target behaviours. It is
recommended to estimate the number of changes rather generously, as this
allows the classifier to decide for each segment whether or not the behaviour
has changed;
- alternatively, a penalty for adding a changepoint can be applied to
determine the number of changes in the TS. However, when the variance differs
across behaviours, it is typically hard to rely on such a penalty, as
minimising the cost function will favour changepoints in the region with
the largest variability;
- applying a minimal distance threshold between two changepoints can also
be useful;
- standardisation and smoothing are always worth considering to condition
the search and avoid bias / local convergence caused by heteroscedasticity
in the data
-------------------------------------------------------------------------------
General (mathematical) remarks on changepoint analysis
- a common approach is maximum likelihood estimation for change detection, in
which the signal is modelled by variables with a piecewise constant distribution.
In this setting, changepoint detection amounts to maximum likelihood estimation,
with the sum of the segment costs equal to the negative log-likelihood; for a
Gaussian signal with fixed variance this reduces (up to constants) to the sum of
squared deviations from the segment mean, i.e. the L2 cost. The distributions
need to be known (prior knowledge) and the model is either a change in mean or
a change in mean and scale (not suitable for truly heteroscedastic data, where
mean and std would have to be multiplied by the same factor?)
-------------------------------------------------------------------------------
RUPTURES - COST FUNCTIONS
RUPTURES - BASE FUNCTIONS
* error(start,end) -- returns the cost on the segment (start,end)
* fit(*args,**kwargs) -- sets the parameters of the cost function (e.g. the signal to segment)
"""
import os
os.chdir(r"C:\Users\adria036\OneDrive - Wageningen University & Research\iAdriaens_doc\Projects\iAdriaens\bait\scripts\bait\datapreparation\ines")
#%% filepaths, constants and load data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ruptures as rpt
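# Hedged illustration of the base cost API mentioned in the docstring (fit() and
# error(start, end)) on a synthetic step signal; values are illustrative only.
# _sig = np.r_[np.zeros(50), np.ones(50)]
# _cost = rpt.costs.CostL2().fit(_sig)   # fit() stores the signal
# _cost.error(0, 50)    # ~0: constant segment
# _cost.error(0, 100)   # 25: sum of squared deviations from the segment mean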
# path
path = os.path.join("C:","/Users","adria036",
"OneDrive - Wageningen University & Research","iAdriaens_doc",
"Projects","iAdriaens","bait","data","preprocessed")
# read dataset
data = pd.read_csv(path+"\\DCdata.txt", index_col=0)
data["at"] = pd.to_datetime(data["at"],format = "%Y-%m-%d %H:%M:%S.%f%z")
del path
#%% segmentation tests
# define parameters for segmentation
model = "l1"
min_size = 5 # minimum distance between changepoints
# data =
#
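# --- hedged sketch of a possible continuation (illustrative only; assumes an
# "activity" column in the preprocessed data, penalty value is illustrative) ---
# signal = data["activity"].dropna().to_numpy()
# algo = rpt.Pelt(model=model, min_size=min_size).fit(signal)
# bkps = algo.predict(pen=5)        # penalty per added changepoint
# rpt.display(signal, bkps)
# plt.show()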