diff --git a/datapreparation/harmen/preprocess_sheep1data_back.py b/datapreparation/harmen/preprocess_sheep1data_back.py
index 82f3dafb58e28095647d495689e3529e106aade1..2041d5d9e4afb2114c29ff64f36ed85c37bbca7f 100644
--- a/datapreparation/harmen/preprocess_sheep1data_back.py
+++ b/datapreparation/harmen/preprocess_sheep1data_back.py
@@ -90,10 +90,6 @@ data["acc_xm"] = data["acc_x"].rolling(10).median()
 data["acc_ym"] = data["acc_y"].rolling(10).median() 
 data["acc_zm"] = data["acc_z"].rolling(10).median() 
 
-data["acc_xm2"] = data["acc_x"].rolling(60).median() 
-data["acc_ym2"] = data["acc_y"].rolling(60).median() 
-data["acc_zm2"] = data["acc_z"].rolling(60).median() 
-
 # plot
 fig,ax = plt.subplots(nrows=3,ncols=1,figsize = (20,10),sharex=True)
 data2 = data.loc[data["at"].dt.day==29,:].sort_values(by="at").reset_index(drop=1)
diff --git a/tool/classification.py b/tool/classification.py
index 88dd78e701736920dceebf636201b4a0e68475a3..cf2b14ee8fdceaf0a5946ab5c17c7eede6638eaf 100644
--- a/tool/classification.py
+++ b/tool/classification.py
@@ -51,6 +51,7 @@ Calculated:
 """
 
 usr_feat = ["level", "std", "iqr", "freq"]
+data_standardize = True
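+# toggle for the min-max scaling block below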
 
 #%% load data
 
@@ -70,10 +71,11 @@ variables = ["acc_xm","acc_ym","acc_zm"]
 dim = len(variables)
 
 # standardise the data
-for var in variables:
-    data[var] = (data[var] - data[var].min()) / (data[var].max()-data[var].min())
+if data_standardize:
+    for var in variables:
+        data[var] = (data[var] - data[var].min()) / (data[var].max()-data[var].min())
 
-del var
+    del var
 
 
 #%% define features
diff --git a/tool/classification_supervised.py b/tool/classification_supervised.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd59f89458c726dc865cc727a370087dcff021a0
--- /dev/null
+++ b/tool/classification_supervised.py
@@ -0,0 +1,371 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Jan 11 10:01:13 2023
+
+@author: adria036
+------------------------------------------------------------------------------
+
+Classification based on supervised learning of segment features.
+Per segment, the following features can be calculated:
+    - mean/level of each dimension
+    - range of each dimension
+    - IQR (Q75 - Q25) of each dimension
+    - std of each dimension
+    - length of the segment
+    - skewness
+    - gap size
+    - % outliers
+    - mean/level of each dimension of the previous segment
+    - range of each dimension of the previous segment
+    - IQR (Q75 - Q25) of each dimension of the previous segment
+    - std of each dimension of the previous segment
+    - length of the previous segment
+    - % outliers of the previous segment
+
+"""
+
+
+import pandas as pd
+import numpy as np
+import os
+import matplotlib.pyplot as plt
+from scipy.integrate import trapz
+from scipy.fft import rfft, rfftfreq
+from sklearn.model_selection import train_test_split
+import seaborn as sns
+from random import sample
+#%matplotlib qt
+
+
+#%% for user interface: decide which features are expected to differ across the categories of interest
+"""
+Calculated: 
+    - level
+    - std
+    - iqr
+    - integral of frequency-domain components (freq > 0)
+    - skewness
+    - kurtosis
+"""
+
+# set hyperparameters and settings
+settings = {"user_feat" : ["level", "std", "iqr", "freq"],
+            "data_standardize" : True,
+            "crossval" : 5,
+            "data_split_by" : "time",       # "time" or "random"
+            "data_split_days" : 5,          # number of days to train on if "time"
+            "data_partitioning" : 90,       # % randomly selected data if "random"
+            "ground_truth" : "area",        # colname of ground truth e.g. "Behaviour" or "area"
+            "id" : "cowid",                 # id col name
+            "variables" : ["acc_x","acc_y","acc_z"] # variable names
+            }  
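+# settings["user_feat"] toggles which per-segment features are computed below;
+# "data_split_by" chooses a time-based or random (stratified) train/test split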
+
+# TODO: drop logistic regression from the model set when there are more than two classes
+# TODO: try ensembles
+# TODO: add a feature selection step inside the cross-validation
+#       (see the commented cross-validation sketch after the accuracy summary below)
+
+
+#%% load data
+
+path = os.path.join("C:","/Users","adria036",
+                    "OneDrive - Wageningen University & Research","iAdriaens_doc",
+                    "Projects","iAdriaens","bait","results","preprocessed","cow")
+
+data = pd.read_csv(path + "//behaviour_annotated_cows.txt", index_col=0)
+data["at"] =  pd.to_datetime(data["at"],format = "%Y-%m-%d %H:%M:%S")
+del path
+
+# areas 7 and 5 have too little ground truth per cow: merge them into class 0 (walk)
+data.loc[((data["area"] == 7)  | (data["area"] == 5)),"area"] = 0
+
+
+# set dimensionality of the data
+dim = len(settings["variables"])
+variables = settings["variables"]
+
+# min-max scale the data to [0,1]
+if settings["data_standardize"]:
+    for var in variables:
+        data[var] = (data[var] - data[var].min()) / (data[var].max()-data[var].min())
+
+    del var
+
+# set output path
+path_out =  os.path.join("C:","/Users","adria036",
+                    "OneDrive - Wageningen University & Research","iAdriaens_doc",
+                    "Projects","iAdriaens","bait","results","classification","cow")
+
+
+#%% calculate features
+
+# prepare dataframe
+segments = pd.DataFrame([],index = data["segment"].drop_duplicates().values.astype(int))
+segs = data["segment"].drop_duplicates().values
+
+# calculate extra vars for feature design (time gap between samples, in seconds)
+data["timediff"] = data["at"].diff()
+data["timediff"] = data["timediff"].dt.total_seconds()
+
+# calculate features
+if "dur" in settings["user_feat"]:
+    segments["duration"] = data[["segment",variables[0]]].groupby(by="segment").count()
+    
+# segments["gap"] = data[["segment","timediff"]].groupby(by="segment").max()
+for i in range(0,dim):
+    if "level" in settings["user_feat"]:
+        segments["level_"+str(i)] = data[["segment",variables[i]]].groupby(by="segment").mean().values
+    if "std" in settings["user_feat"]:
+        segments["std_"+str(i)] = data[["segment",variables[i]]].groupby(by="segment").std().values
+    if "range" in settings["user_feat"]:
+        segments["range_"+str(i)] = data[["segment",variables[i]]].groupby(by="segment").max().values - \
+                                data[["segment",variables[i]]].groupby(by="segment").min().values
+    if "iqr" in settings["user_feat"]:
+        segments["iqr_"+str(i)] = data[["segment",variables[i]]].groupby(by="segment").describe().values[:,6] - \
+                              data[["segment",variables[i]]].groupby(by="segment").describe().values[:,4]
+    if "skew" in settings["user_feat"]:
+        segments["skew_"+str(i)] = data[["segment",variables[i]]].groupby(by="segment").skew()
+    if "kurt" in settings["user_feat"]:
+        segments["kurt_"+str(i)] = data[["segment",variables[i]]].groupby(by="segment").apply(pd.DataFrame.kurt).values[:,1]
+    
+    if "freq" in settings["user_feat"]:
+        segments["freq_"+str(i)] = np.nan                                                
+        for seg in segs:
+            yf = np.abs(rfft(data.loc[data["segment"]==seg,variables[i]].values))
+            xf = rfftfreq(len(data.loc[data["segment"]==seg]),1)
+                
+            # calculate integral of frequencies > 0
+            segments["freq_"+str(i)].iloc[int(seg)-1] = round(trapz(yf[1:],xf[1:]),3)
+        del xf, yf, seg
+del segs, i                        
+
+# if feature scales are too unbalanced, the learner picks up feature magnitude
+# instead of class differences, so min-max scale each feature
+for i in range(0,segments.shape[1]):
+    segments.iloc[:,i] = (segments.iloc[:,i] - segments.iloc[:,i].min()) / \
+        (segments.iloc[:,i].max() - segments.iloc[:,i].min())
+
+#------------------------------------------------------------------------------
+# explore (relation between) features
+matrix = segments.corr()            # correlation matrix
+summary = segments.describe()       # summary
+
+fig, ax = plt.subplots(nrows=1,ncols=2, figsize=(14,6))
+bp = sns.boxplot(segments,ax= ax[0], palette ="viridis",fliersize = 2)
+if segments.shape[1] < 10:
+    ax[1] = sns.heatmap(matrix, cmap = "RdBu", annot = True, annot_kws = {"fontsize": 6})
+else:
+    ax[1] = sns.heatmap(matrix, cmap = "RdBu", annot = False)
+ax[0].set_ylabel("Standardised feature values")
+ax[0].set_xlabel("Feature")
+ax[0].set_title("Feature distribution")
+ax[1].set_title("Feature correlations")
+ax[0].set_xticklabels(ax[0].get_xticklabels(),rotation=45)
+ax[1].set_xticklabels(ax[1].get_xticklabels(),rotation=45)
+
+
+del i, fig, ax
+del bp
+
+
+flist = []
+for idx in range(0,segments.shape[1]):
+    ad = [segments.columns[idx] for m in settings["user_feat"] if m in segments.columns[idx]]
+    if len(ad) > 0: 
+        flist.append(ad[0])
+
+#------------------------------------------------------------------------------
+# add ground truth to "segments" data frame (the plots below need
+# "truth", "day" and "id")
+gs_index = data["segment"].drop_duplicates().index.values
+segments["truth"] = data[settings["ground_truth"]].iloc[gs_index].values
+segments["day"] = data["day"].iloc[gs_index].values
+segments["id"] = data[settings["id"]].iloc[gs_index].values
+del gs_index
+
+#------------------------------------------------------------------------------
+# make box plots of features per dimension and per class in "truth"
+fig,axes = plt.subplots(ncols = dim, nrows = len(settings["user_feat"]), 
+                   figsize = (5*dim,4*len(settings["user_feat"])), sharex = True)
+beh = ["walk","cubicle","feed"]
+for i in range(len(settings["user_feat"])):
+    for j in range(dim):
+        colname = settings["user_feat"][i] + '_' + str(j)
+        print(colname)
+        sns.boxplot(x = "truth", y = colname, data = segments, ax = axes[i][j])
+        if i == len(settings["user_feat"])-1:
+            axes[i][j].set_xticklabels(beh,rotation=0)
+        if i == 0:
+            axes[i][j].set_title(settings["variables"][j])
+
+#------------------------------------------------------------------------------
+# bar plots per behaviour per cow per day (number of segments)
+days = segments["day"].drop_duplicates().reset_index(drop=1)
+ids = segments["id"].drop_duplicates().reset_index(drop=1)
+
+# loop over cows and plot per-day behaviour distributions
+for j in range(len(ids)):
+    fig, axes = plt.subplots(ncols = 1,
+                             nrows = 2, 
+                       figsize = (20,15))
+                       
+    subset = segments.loc[segments["id"] == ids[j],["day","truth","id"]]
+    subset1 = subset.groupby(by = ["day","truth"]).count().reset_index()
+    subset2 = subset[["day","id"]].groupby(by = ["day"]).count().reset_index()
+    dfcol = pd.merge(subset1,subset2,how = "outer", on = "day")
+    dfcol["perc"] = dfcol["id_x"]/dfcol["id_y"]*100
+    sns.barplot(data = dfcol, x = "day" , y = "perc", palette = "rocket", hue = "truth", ax = axes[0])
+    h, l = axes[0].get_legend_handles_labels()
+    axes[0].legend(h, beh, title="Behaviour")
+    axes[0].set_title("cow " + str(round(ids[j])))
+    axes[0].set_ylabel("percentage of segments in gold standard")
+    sns.barplot(data = dfcol, x = "day" , y = "id_x", palette = "rocket", hue = "truth", ax = axes[1])
+    h, l = axes[1].get_legend_handles_labels()
+    axes[1].legend(h, beh, title="Behaviour")
+    axes[1].set_title("cow " + str(round(ids[j])))
+    axes[1].set_ylabel("number of segments in gold standard")
+    plt.savefig(path_out + "//overview_behaviour_cow_" + str(round(ids[j])))
+    plt.close()
+
+del ids, days
+
+#%% data split
+
+
+if "time" in settings["data_split_by"]:
+    
+    # for each cow, select the first settings["data_split_days"] days as the training
+    df = {"train" : segments.loc[segments["day"] <= settings["data_split_days"],:],
+          "test" : segments.loc[segments["day"] > settings["data_split_days"],:]}  
+    
+else:    
+    # use sklearn model selection train_test_split
+    X_train, X_test = train_test_split(segments,
+         test_size = 1-(settings["data_partitioning"]/100), 
+         stratify = segments[["id","truth"]] 
+         )   # stratify for behaviours
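+    # note: stratifying on the two columns ["id","truth"] assumes sklearn
+    # treats each distinct (id, truth) row as one stratum; if this raises on
+    # your sklearn version, combine them into a single label first, e.g.
+    # segments["id"].astype(str) + "_" + segments["truth"].astype(str)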
+    X_train = X_train.sort_index(axis=0)    
+    X_test = X_test.sort_index(axis=0)
+    
+    # put in dictionary    
+    df = {"train" : X_train,
+          "test" : X_test}
+    
+    del X_train, X_test
+
+#------------------------------------------------------------------------------
+# TODO: explore test and train datasets created -- overall
+# TODO: explore test and train datasets created -- per individual id
+# !!! set feature list
+
+
+#%% supervised learning for all ids in the dataset
+
+from sklearn.svm import SVC
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.naive_bayes import GaussianNB
+
+model_pipeline = []
+model_pipeline.append(SVC())
+model_pipeline.append(KNeighborsClassifier())
+model_pipeline.append(DecisionTreeClassifier())
+model_pipeline.append(RandomForestClassifier())
+model_pipeline.append(GaussianNB())
+
+#Model running and evaluation
+from sklearn import metrics
+from sklearn.metrics import classification_report
+from sklearn.metrics import confusion_matrix
+
+model_list = ["SVM", "KNN","Decision Tree", "Random Forest", "Naive Bayes"]
+acc_list = []
+auc_list = []
+cm_list = []
+
+
+
+for model in model_pipeline:
+    print(model)
+    model.fit(df["train"][flist], df["train"]["truth"])
+    y_pred = model.predict(df["test"][flist])
+    acc_list.append(metrics.accuracy_score(df["test"]["truth"], y_pred))
+    #fpr, tpr, _thresholds = metrics.roc_curve(np.array(y_test), y_pred)
+    #auc_list.append(round(metrics.auc(fpr, tpr), 2))
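+    # roc_curve above only handles binary labels; multiclass AUC would need
+    # class probabilities instead (untested sketch; SVC() needs probability=True):
+    # proba = model.predict_proba(df["test"][flist])
+    # auc_list.append(metrics.roc_auc_score(df["test"]["truth"], proba, multi_class="ovr"))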
+    cm_list.append(confusion_matrix(df["test"]["truth"], y_pred))        
+
+#visualise confusion matrix
+fig = plt.figure(figsize = (15,10))
+for i in range(len(cm_list)):
+    print(i)
+    cm = cm_list[i]
+    model = model_list[i]
+    sub = fig.add_subplot(2,3,i+1).set_title(model)
+    cm_plot = sns.heatmap(cm, annot = True, cmap = 'Blues_r')
+    cm_plot.set_xlabel('Predicted Values')
+    cm_plot.set_ylabel('Actual Values')
+
+# Summarize accuracy
+result_df = pd.DataFrame({'Model': model_list, 'Accuracy': acc_list})
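+
+# minimal sketch of the cross-validation step from the TODO list above,
+# assuming settings["crossval"] folds on the training set (not yet wired in):
+# from sklearn.model_selection import cross_val_score
+# for model in model_pipeline:
+#     scores = cross_val_score(model, df["train"][flist], df["train"]["truth"],
+#                              cv=settings["crossval"])
+#     print(type(model).__name__, round(scores.mean(), 3))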
+
+
+
+#%% supervised learning per individual id in the dataset
+
+from sklearn.svm import SVC
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.naive_bayes import GaussianNB
+
+model_pipeline = []
+model_pipeline.append(SVC())
+model_pipeline.append(KNeighborsClassifier())
+model_pipeline.append(DecisionTreeClassifier())
+model_pipeline.append(RandomForestClassifier())
+model_pipeline.append(GaussianNB())
+
+#Model running and evaluation
+from sklearn import metrics
+from sklearn.metrics import classification_report
+from sklearn.metrics import confusion_matrix
+
+model_list = ["SVM", "KNN","Decision Tree", "Random Forest", "Naive Bayes"]
+acc_list = []
+auc_list = []
+cm_list = []
+
+# set feature list
+flist = []
+for idx in range(0,segments.shape[1]):
+    ad = [segments.columns[idx] for m in settings["user_feat"] if m in segments.columns[idx]]
+    if len(ad) > 0: 
+        flist.append(ad[0])
+
+# TODO: subset df["train"] and df["test"] per individual id before fitting
+for model in model_pipeline:
+    model.fit(df["train"][flist], df["train"]["truth"])
+    y_pred = model.predict(df["test"][flist])
+    acc_list.append(metrics.accuracy_score(df["test"]["truth"], y_pred))
+    #fpr, tpr, _thresholds = metrics.roc_curve(np.array(y_test), y_pred)
+    #auc_list.append(round(metrics.auc(fpr, tpr), 2))
+    cm_list.append(confusion_matrix(df["test"]["truth"], y_pred))
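+
+# minimal sketch of the intended per-id evaluation (assumption: reuse the same
+# train/test split and score each cow separately):
+# for cowid in segments["id"].drop_duplicates():
+#     tr = df["train"].loc[df["train"]["id"] == cowid, :]
+#     te = df["test"].loc[df["test"]["id"] == cowid, :]
+#     for model in model_pipeline:
+#         model.fit(tr[flist], tr["truth"])
+#         print(cowid, type(model).__name__,
+#               metrics.accuracy_score(te["truth"], model.predict(te[flist])))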
+
+#visualise confusion matrix
+fig = plt.figure(figsize = (15,10))
+for i in range(len(cm_list)):
+    cm = cm_list[i]
+    model = model_list[i]
+    sub = fig.add_subplot(2,3,i+1).set_title(model)
+    cm_plot = sns.heatmap(cm, annot = True, cmap = 'Blues_r')
+    cm_plot.set_xlabel('Predicted Values')
+    cm_plot.set_ylabel('Actual Values')
+
+# Summarize accuracy
+result_df = pd.DataFrame({'Model': model_list, 'Accuracy': acc_list})
diff --git a/tool/overview_behaviour_cow_1091.png b/tool/overview_behaviour_cow_1091.png
new file mode 100644
index 0000000000000000000000000000000000000000..024587fccdac51559caba24829966328de7c3fc0
Binary files /dev/null and b/tool/overview_behaviour_cow_1091.png differ
diff --git a/tool/overview_behaviour_cow_1681.png b/tool/overview_behaviour_cow_1681.png
new file mode 100644
index 0000000000000000000000000000000000000000..ce0a9c783ea857036ee714f650ae7c3a40ebdce2
Binary files /dev/null and b/tool/overview_behaviour_cow_1681.png differ
diff --git a/tool/overview_behaviour_cow_3089.png b/tool/overview_behaviour_cow_3089.png
new file mode 100644
index 0000000000000000000000000000000000000000..ba6f5ab45f03346df7a8d4a9c9b8d12fd06e1ac8
Binary files /dev/null and b/tool/overview_behaviour_cow_3089.png differ
diff --git a/tool/overview_behaviour_cow_3106.png b/tool/overview_behaviour_cow_3106.png
new file mode 100644
index 0000000000000000000000000000000000000000..2059e80fc42505ec0d71c3120a4f6e22cb1ce34e
Binary files /dev/null and b/tool/overview_behaviour_cow_3106.png differ
diff --git a/tool/overview_behaviour_cow_3152.png b/tool/overview_behaviour_cow_3152.png
new file mode 100644
index 0000000000000000000000000000000000000000..12b358e73fb9e54e947a8b8d37d4d6e710db94cd
Binary files /dev/null and b/tool/overview_behaviour_cow_3152.png differ
diff --git a/tool/overview_behaviour_cow_419.png b/tool/overview_behaviour_cow_419.png
new file mode 100644
index 0000000000000000000000000000000000000000..594bd9f58fe1db6bd3bc5b51da0b1a0c6abee537
Binary files /dev/null and b/tool/overview_behaviour_cow_419.png differ
diff --git a/tool/overview_behaviour_cow_605.png b/tool/overview_behaviour_cow_605.png
new file mode 100644
index 0000000000000000000000000000000000000000..082cd00bb22fb0e7b9f5c11ef653844088dcd97e
Binary files /dev/null and b/tool/overview_behaviour_cow_605.png differ
diff --git a/tool/overview_behaviour_cow_7387.png b/tool/overview_behaviour_cow_7387.png
new file mode 100644
index 0000000000000000000000000000000000000000..47576006fe551e94015debc7800e72a30400387b
Binary files /dev/null and b/tool/overview_behaviour_cow_7387.png differ
diff --git a/tool/overview_behaviour_cow_821.png b/tool/overview_behaviour_cow_821.png
new file mode 100644
index 0000000000000000000000000000000000000000..042b599a2ac748239cc9dd2fd0169fe8e51a56eb
Binary files /dev/null and b/tool/overview_behaviour_cow_821.png differ
diff --git a/tool/segmentation.py b/tool/segmentation.py
index 53dd3273c2816254554d3d4ea60913ec87b3516a..43fdd2e7fb0e3a70b8c3af86e8a1a777918fad9b 100644
--- a/tool/segmentation.py
+++ b/tool/segmentation.py
@@ -103,150 +103,169 @@ import numpy as np
 import matplotlib.pyplot as plt
 import ruptures as rpt
 from scipy.signal import medfilt
-
-# path
-path = os.path.join("C:","/Users","adria036",
-                    "OneDrive - Wageningen University & Research","iAdriaens_doc",
-                    "Projects","iAdriaens","bait","data","preprocessed")
+from datetime import timedelta
+
+# path to data
+# path = os.path.join("C:","/Users","adria036",
+#                     "OneDrive - Wageningen University & Research","iAdriaens_doc",
+#                     "Projects","iAdriaens","bait","data","preprocessed")
+# path to data SP
+path = os.path.join("W:","ASG","WLR_Dataopslag","Genomica","Sustainable_breeding",
+                    "4400003586_BAIT","Dataset_Ines_Marja")
+
+fns = [f for f in os.listdir(path) \
+        if os.path.isfile(os.path.join(path,f)) \
+            and ("newseldata_cow_" in f) \
+            and ("_2022" not in f)]
+fns = pd.DataFrame(fns,columns = ["fn"])
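+# parse the cow id from filenames of the form "newseldata_cow_<id>.txt"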
+fns["cowid"] = fns["fn"].iloc[:].str[15:-4].astype(int)
 
 # read dataset accelerations
 # data = pd.read_csv(path+"\\DCdata2.txt", index_col=0)
 # data["at"] = pd.to_datetime(data["at"],format = "%Y-%m-%d %H:%M:%S.%f%z")
 
-
 # read dataset 
 # data = pd.read_csv(path+"\\DCdata.csv", index_col=0)
 # data["at"] = pd.to_datetime(data["at"],format = "%Y-%m-%d %H:%M:%S.%f%z")
 # # #data = data[["cowid","at","t","gap","activity"]]
 
-# dataset sheep
-data = pd.read_csv(path+"\\HDdata_sheep1_back.csv", index_col=0)
-data["at"] = pd.to_datetime(data["at"],format = "%Y-%m-%d %H:%M:%S")
-
-# plot
-# fig,ax = plt.subplots(nrows=3,ncols=1,figsize = (20,10),sharex=True)
-# data2 = data.loc[data["at"].dt.day==6,:].sort_values(by="at").reset_index(drop=1)
-# ax[0].plot(data2["at"],data2["acc_x"],color = "teal")
-# ax[1].plot(data2["at"],data2["acc_y"],color = "teal")
-# ax[2].plot(data2["at"],data2["acc_z"],color = "teal")
-# ax[0].plot(data2["at"],data2["acc_xm"],color = "k")
-# ax[1].plot(data2["at"],data2["acc_ym"],color = "k")
-# ax[2].plot(data2["at"],data2["acc_zm"],color = "k")
-
-del path
-
-#%% segmentation tests
-# savepath
-path = os.path.join("C:","/Users","adria036",
-                    "OneDrive - Wageningen University & Research","iAdriaens_doc",
-                    "Projects","iAdriaens","bait","results","segmentation")
-"""
-# define parameters for segmentation
-
-# set minimum time instead of minimum size and set signal
-"""
-model = "normal"
-min_size = 2*60  # minimum distance between changepoints
-dim = 3
-# variable = ["acc_x","acc_y","acc_z"]
-variable = ["acc_xm","acc_ym","acc_zm"]
-idname = "id"
-tname = "at"
-medfilter = 0
-# select data and perform segmentation
-
-allids = data[idname].drop_duplicates()
-nseg = 1
-brkpnts = pd.DataFrame([])
-output = pd.DataFrame([])
-for ids in allids:
-    days = data[tname].dt.day.drop_duplicates()
-    for day in days:
-        # select data
-        signal = data.loc[(data[idname]==ids) & \
-                  (data[tname].dt.day == day) \
-                  ,variable]
-        signal = signal.dropna()
-        indx = signal.index.values
-        signal = signal.to_numpy()
-        
-        # filter data to remove noise / errors
-        if (medfilter == 1): 
-            for d in range(0,signal.shape[1]):
-                mfilt = medfilt(signal[:,d],21)
-                signal[:,d] = mfilt
-        
-        # set penalty values // rule of thumb = log(n)*dim*std(signal)
-        #pen = np.log(len(signal)) * dim * np.std(signal, axis=0).mean()**2
-        pen = np.log(len(signal)) * dim * np.std(signal)**2
-        if pen < 700:
-             pen = 950
-        #pen = 950
-        print("id = " + str(ids))
-        print("pen = " + str(pen))
-        print("day = " + str(day))
-        # if segment std ~= signal std-  this might not work
-        
+# # dataset sheep
+# data = pd.read_csv(path+"\\HDdata_sheep1_back.csv", index_col=0)
+# data["at"] = pd.to_datetime(data["at"],format = "%Y-%m-%d %H:%M:%S")
+
+# read data  SP
+for f in fns["fn"]:
+    print(f)
+# fn = "newseldata_cow_280.txt"
+    data = pd.read_csv(os.path.join(path,f), 
+                       usecols = ["cowid","barn","date","t","acc_xs","acc_ys","acc_zs"] )
+    data["date"]= pd.to_datetime(data["date"],format = "%Y-%m-%d")
+    data["datetime"] = data["date"] + \
+        (pd.to_datetime(data["t"],unit = 's') - pd.to_datetime("1970-01-01",format = "%Y-%m-%d"))
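+    # "t" is presumably seconds since midnight: converting it via the epoch
+    # yields a timedelta that is added to the calendar date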
+    dcol = "date"
+    data["day"] = (data["date"] - data["date"].min()).dt.days
+
+    # plot %matplotlib qt  ---- this is an example
+    # fig,ax = plt.subplots(nrows=3,ncols=1,figsize = (20,10),sharex=True)
+    # data2 = data.loc[data[dcol].dt.day==27,:].sort_values(by="t").reset_index(drop=1)
+    # ax[0].plot(data2["t"],data2["acc_xs"],color = "lightgrey")
+    # ax[1].plot(data2["t"],data2["acc_ys"],color = "teal")
+    # ax[2].plot(data2["t"],data2["acc_zs"],color = "teal")
+    #ax[0].plot(data2["t"],data2["acc_xs"].rolling(10).median(),color = "k")
+    #ax[1].plot(data2["t"],data2["acc_ys"].rolling(10).median(),color = "k")
+    #ax[2].plot(data2["t"],data2["acc_zs"].rolling(10).median(),color = "k")
+
+    # del path
+
+    #%% segmentation tests
+    # savepath
+    # path = os.path.join("C:","/Users","adria036",
+    #                     "OneDrive - Wageningen University & Research","iAdriaens_doc",
+    #                     "Projects","iAdriaens","bait","results","segmentation")
+    svpath = os.path.join("W:","ASG","WLR_Dataopslag","Genomica","Sustainable_breeding",
+                        "4400003586_BAIT","Dataset_Ines_Marja","segmentation")
+    """
+    # define parameters for segmentation
+    
+    # set minimum time instead of minimum size and set signal
+    """
+    model = "normal"
+    min_size = 2*60  # minimum distance between changepoints, in seconds
+    dim = 3
+    # variable = ["acc_x","acc_y","acc_z"]
+    variable = ["acc_xs","acc_ys","acc_zs"]
+    idname = "cowid"
+    tname = "datetime"
+    medfilter = 1
+    # select data and perform segmentation
     
-        # fit and define changepoint model
-        # algo = rpt.Pelt(model=model,min_size=min_size).fit(signal)
-        
-        c = rpt.costs.CostNormal().fit(signal)
-        algo = rpt.Pelt(custom_cost=rpt.costs.CostNormal(),min_size=min_size).fit(signal)
-        #algo = rpt.Pelt(model="rbf",min_size=min_size).fit(signal)
-        cpts = algo.predict(pen = pen)
-       
-        # save breakpoints in new variable
-        df = pd.DataFrame(np.linspace(nseg,nseg+len(cpts)-1,len(cpts)).astype(int),columns=["segment"])
-        df["start"] = [0]+cpts[:-1]
-        df["end"] = cpts[:-1]+[len(signal)]
-        df["end"] = df["end"]-1
-        nseg = nseg + len(cpts)
-        
-        # initiate figure
-        fig, axes = rpt.display(signal, cpts,figsize = (18,9))
-        
-        # add breakpoints / segments to signal
-        signal = pd.DataFrame(signal,columns = variable)
-        signal["segment"] = np.nan
-        signal.iloc[df["start"].values,signal.columns.get_indexer(["segment"])] = df["segment"].values
-        signal.iloc[df["end"].values,signal.columns.get_indexer(["segment"])]  = df["segment"].values
-        signal = signal.fillna(method="pad")
-        
-        signal.index = indx
-        signal["at"] = data.loc[(data[idname]==ids) & \
-                  (data[tname].dt.day == day) & \
-                  (~data[tname].isna()),"at"]
-        signal["date2"] = signal["at"].dt.strftime("%d/%m %H:%M")
-        signal["id"] = ids
-        
-        # plot
-        axes[0].set_title("acceleration in x direction")
-        axes[1].set_title("acceleration in y direction")
-        axes[2].set_title("acceleration in z direction")
-        
-        axes[0].set_ylabel("acceleration [m/s²]")
-        axes[1].set_ylabel("acceleration [m/s²]")
-        axes[2].set_ylabel("acceleration [m/s²]")
-        axes[2].set_xlabel("date")
-        try:
-            labels = signal.index.values[0:-1:10000] #-signal.index.values[0]
-            labels = labels.tolist()
-            labelnames = signal['date2'].iloc[labels]    
-            plt.xticks(labels,labels=labelnames)
-        except:
-            #donothing
-            print("failed to set date labels")
-        plt.tight_layout()
-        plt.savefig(path+"\\add_segm_sheep1back"+str(round(ids))+"_day"+str(round(day))+".tif")
-        plt.close()
+    allids = data[idname].drop_duplicates()
+    nseg = 1
+    brkpnts = pd.DataFrame([])
+    output = pd.DataFrame([])
+    for ids in allids:
+        days = data["day"].drop_duplicates()
+        for day in days:
+            # select data
+            signal = data.loc[(data[idname]==ids) & \
+                      (data["day"] == day) \
+                      ,variable]
+            signal = signal.dropna()
+            indx = signal.index.values
+            signal = signal.to_numpy()
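+            # only segment days with enough data (presumably > 3600 samples,
+            # i.e. more than 1 h at 1 Hz sampling)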
+            if len(signal) > 3600:
             
-        
-        # add signal to output and add df to brkpnts
-        output = pd.concat([output,signal])
-        brkpnts = pd.concat([brkpnts,df])
-        
-output.to_csv(path + "\\HD_sheep1back_segmented_2min.csv")
-brkpnts.to_csv(path+"\\HD_sheep1back_breakpoints_2min.csv")
-
-#%% Create outputpath = 
+                # filter data to remove noise / errors
+                if (medfilter == 1): 
+                    for d in range(0,signal.shape[1]):
+                        mfilt = medfilt(signal[:,d],5)
+                        signal[:,d] = mfilt
+            
+                # set penalty values // rule of thumb = log(n)*dim*std(signal)**2
+                #pen = np.log(len(signal)) * dim * np.std(signal, axis=0).mean()**2
+                pen = np.log(len(signal)) * dim * np.std(signal)**2
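+                # e.g. n = 86400 samples, dim = 3, std = 1 -> pen = log(86400)*3 ~ 34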
+                # if pen < 700:
+                #      pen = 950
+                pen = 350   # for the SP data, this seems a generally good threshold
+                
+                print("id = " + str(ids))
+                print("pen = " + str(pen))
+                print("day = " + str(day))
+                # note: if the segment std is close to the overall signal std,
+                # this penalty may not separate segments well
+                
+            
+                # fit and define changepoint model
+                # algo = rpt.Pelt(model=model,min_size=min_size).fit(signal)
+                
+                c = rpt.costs.CostNormal().fit(signal)
+                algo = rpt.Pelt(custom_cost=rpt.costs.CostNormal(),min_size=min_size).fit(signal)
+                #algo = rpt.Pelt(model="rbf",min_size=min_size).fit(signal)
+                cpts = algo.predict(pen = pen)
+               
+                # save breakpoints in new variable
+                df = pd.DataFrame(np.linspace(nseg,nseg+len(cpts)-1,len(cpts)).astype(int),columns=["segment"])
+                df["start"] = [0]+cpts[:-1]
+                df["end"] = cpts[:-1]+[len(signal)]
+                df["end"] = df["end"]-1
+                nseg = nseg + len(cpts)
+                
+                # initiate figure
+                fig, axes = rpt.display(signal, cpts,figsize = (18,9))
+                
+                # add breakpoints / segments to signal
+                signal = pd.DataFrame(signal,columns = variable)
+                signal["segment"] = np.nan
+                signal.iloc[df["start"].values,signal.columns.get_indexer(["segment"])] = df["segment"].values
+                signal.iloc[df["end"].values,signal.columns.get_indexer(["segment"])]  = df["segment"].values
+                signal = signal.ffill()
+                
+                signal.index = indx
+                signal["at"] = data.loc[(data[idname]==ids) & \
+                          (data["day"] == day) & \
+                          (~data[tname].isna()),dcol]
+                signal["date2"] = signal["at"].dt.strftime("%d/%m %H:%M")
+                signal["id"] = ids
+                
+                # plot
+                axes[0].set_title("cow = " + str(int(ids)) + ", day = " +str(day) +   "acceleration in x direction")
+                axes[1].set_title("acceleration in y direction")
+                axes[2].set_title("acceleration in z direction")
+                
+                axes[0].set_ylabel("acceleration [m/s²]")
+                axes[1].set_ylabel("acceleration [m/s²]")
+                axes[2].set_ylabel("acceleration [m/s²]")
+                axes[2].set_xlabel("measurement no.")
+                
+                plt.tight_layout()
+                plt.savefig(svpath+"\\add_segm_cow_"+str(round(ids))+"_day"+str(round(day))+".tif")
+                plt.close()
+                    
+                
+                # add signal to output and add df to brkpnts
+                output = pd.concat([output,signal])
+                brkpnts = pd.concat([brkpnts,df])
+            
+    output.to_csv(svpath + "\\SP_cow_" + str(ids) + "_segmented_2min.txt")
+    brkpnts.to_csv(svpath+"\\SP_cow_" + str(ids) + "_breakpoints_2min.txt")
+
+