diff --git a/datapreparation/harmen/preprocess_sheep1data_back.py b/datapreparation/harmen/preprocess_sheep1data_back.py
index 82f3dafb58e28095647d495689e3529e106aade1..2041d5d9e4afb2114c29ff64f36ed85c37bbca7f 100644
--- a/datapreparation/harmen/preprocess_sheep1data_back.py
+++ b/datapreparation/harmen/preprocess_sheep1data_back.py
@@ -90,10 +90,6 @@ data["acc_xm"] = data["acc_x"].rolling(10).median()
 data["acc_ym"] = data["acc_y"].rolling(10).median()
 data["acc_zm"] = data["acc_z"].rolling(10).median()
-data["acc_xm2"] = data["acc_x"].rolling(60).median()
-data["acc_ym2"] = data["acc_y"].rolling(60).median()
-data["acc_zm2"] = data["acc_z"].rolling(60).median()
-
 # plot
 fig,ax = plt.subplots(nrows=3,ncols=1,figsize = (20,10),sharex=True)
 data2 = data.loc[data["at"].dt.day==29,:].sort_values(by="at").reset_index(drop=1)
diff --git a/tool/classification.py b/tool/classification.py
index 88dd78e701736920dceebf636201b4a0e68475a3..cf2b14ee8fdceaf0a5946ab5c17c7eede6638eaf 100644
--- a/tool/classification.py
+++ b/tool/classification.py
@@ -51,6 +51,7 @@ Calculated:
 """
 
 usr_feat = ["level", "std", "iqr", "freq"]
+data_standardize = True
 
 #%% load data
 
@@ -70,10 +71,11 @@ variables = ["acc_xm","acc_ym","acc_zm"]
 dim = len(variables)
 
 # standardise the data
-for var in variables:
-    data[var] = (data[var] - data[var].min()) / (data[var].max()-data[var].min())
+if data_standardize:
+    for var in variables:
+        data[var] = (data[var] - data[var].min()) / (data[var].max()-data[var].min())
 
-del var
+    del var
 
 #%% define features
diff --git a/tool/classification_supervised.py b/tool/classification_supervised.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd59f89458c726dc865cc727a370087dcff021a0
--- /dev/null
+++ b/tool/classification_supervised.py
@@ -0,0 +1,371 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Jan 11 10:01:13 2023
+
+@author: adria036
+------------------------------------------------------------------------------
+
+Classification based on supervised learning of segment features.
+Per segment, the following features can be calculated:
+    - mean/level of each dimension
+    - range of each dimension
+    - Q25-Q75 (IQR) of each dimension
+    - std of each dimension
+    - length of the segment
+    - skewness
+    - gap size
+    - % outliers
+    - the same features (level, range, IQR, std, length, % outliers)
+      calculated for the previous segment
+"""
+
+import pandas as pd
+import numpy as np
+import os
+import matplotlib.pyplot as plt
+from scipy.integrate import trapz
+from scipy.fft import rfft, rfftfreq
+from sklearn.model_selection import train_test_split
+import seaborn as sns
+from random import sample
+#%matplotlib qt
+
+#%% for user interface: decide which features are expected to differ across the categories of interest
+"""
+Calculated:
+    - level
+    - std
+    - iqr
+    - integral of the frequency-domain components (freq > 0)
+    - skewness
+    - kurtosis
+"""
+
+# set hyperparameters and settings
+settings = {"user_feat" : ["level", "std", "iqr", "freq"],
+            "data_standardize" : True,
+            "crossval" : 5,
+            "data_split_by" : "time",   # "time" or "random"
+            "data_split_days" : 5,      # number of days to train on if "time"
+            "data_partitioning" : 90,   # % randomly selected data if "random"
+            "ground_truth" : "area",    # colname of ground truth, e.g. "Behaviour" or "area"
+            "id" : "cowid",             # id col name
+            "variables" : ["acc_x","acc_y","acc_z"]  # variable names
+            }
+
+# TODO: if there are more than two classes, do not use logistic regression
+# TODO: try ensembles
+# TODO: feature selection step in cross-validation
+
+#%% load data
+
+path = os.path.join("C:","/Users","adria036",
+                    "OneDrive - Wageningen University & Research","iAdriaens_doc",
+                    "Projects","iAdriaens","bait","results","preprocessed","cow")
+
+data = pd.read_csv(path + "//behaviour_annotated_cows.txt", index_col=0)
+data["at"] = pd.to_datetime(data["at"],format = "%Y-%m-%d %H:%M:%S")
+del path
+
+# classes/areas 7 and 5 have too few ground-truth observations per cow: merge them into class 0 (walk)
+data.loc[((data["area"] == 7) | (data["area"] == 5)),"area"] = 0
+
+# set dimensionality of the data
+dim = len(settings["variables"])
+variables = settings["variables"]
+
+# standardise the data (min-max scaling per variable)
+if settings["data_standardize"]:
+    for var in variables:
+        data[var] = (data[var] - data[var].min()) / (data[var].max()-data[var].min())
+
+    del var
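+# note: the loop above is equivalent to min-max scaling with sklearn; a sketch
+# of the alternative, kept commented out so the data are not scaled twice:
+# from sklearn.preprocessing import MinMaxScaler
+# data[variables] = MinMaxScaler().fit_transform(data[variables])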
"Behaviour" or "area" + "id" : "cowid", # id col name + "variables" : ["acc_x","acc_y","acc_z"] # variable names + } + +# TODO: in models if num classes more than two no logistic regression +# TODO: try ensembles +# TODO: feature selection step in crossvalidation + + +#%% load data + +path = os.path.join("C:","/Users","adria036", + "OneDrive - Wageningen University & Research","iAdriaens_doc", + "Projects","iAdriaens","bait","results","preprocessed","cow") + +data = pd.read_csv(path + "//behaviour_annotated_cows.txt", index_col=0) +data["at"] = pd.to_datetime(data["at"],format = "%Y-%m-%d %H:%M:%S") +del path + +# with too few ground truth for class/area 7 and 5 PER COW: set to 0 (walk) +data.loc[((data["area"] == 7) | (data["area"] == 5)),"area"] = 0 + + +# set dimensionality of the data +dim = len(settings["variables"]) +variables = settings["variables"] + +# standardise the data +if settings["data_standardize"] == True: + for var in variables: + data[var] = (data[var] - data[var].min()) / (data[var].max()-data[var].min()) + + del var + +# set output path +path_out = os.path.join("C:","/Users","adria036", + "OneDrive - Wageningen University & Research","iAdriaens_doc", + "Projects","iAdriaens","bait","results","classification","cow") + + +#%% calculate features + +# prepare dataframe +segments = pd.DataFrame([],index = data["segment"].drop_duplicates().values.astype(int)) +segs = data["segment"].drop_duplicates().values + +# calculate extra vars for feature design +data["timediff"] = data["at"].diff() +data["timediff"] = data["timediff"].dt.seconds + +# calculate features +if "dur" in settings["user_feat"]: + segments["duration"] = data[["segment",variables[0]]].groupby(by="segment").count() + +# segments["gap"] = data[["segment","timediff"]].groupby(by="segment").max() +for i in range(0,dim): + if "level" in settings["user_feat"]: + segments["level_"+str(i)] = data[["segment",variables[i]]].groupby(by="segment").mean().values + if "std" in settings["user_feat"]: + segments["std_"+str(i)] = data[["segment",variables[i]]].groupby(by="segment").std().values + if "range" in settings["user_feat"]: + segments["range_"+str(i)] = data[["segment",variables[i]]].groupby(by="segment").max().values - \ + data[["segment",variables[i]]].groupby(by="segment").min().values + if "iqr" in settings["user_feat"]: + segments["iqr_"+str(i)] = data[["segment",variables[i]]].groupby(by="segment").describe().values[:,6] - \ + data[["segment",variables[i]]].groupby(by="segment").describe().values[:,4] + if "skew" in settings["user_feat"]: + segments["skew_"+str(i)] = data[["segment",variables[i]]].groupby(by="segment").skew() + if "kurt" in settings["user_feat"]: + segments["kurt_"+str(i)] = data[["segment",variables[i]]].groupby(by="segment").apply(pd.DataFrame.kurt).values[:,1] + + if "freq" in settings["user_feat"]: + segments["freq_"+str(i)] = np.nan + for seg in segs: + yf = np.abs(rfft(data.loc[data["segment"]==seg,variables[i]].values)) + xf = rfftfreq(len(data.loc[data["segment"]==seg]),1) + + # calculate integral of frequencies > 0 + segments["freq_"+str(i)].iloc[int(seg)-1] = round(trapz(yf[1:],xf[1:]),3) + del xf, yf, seg +del segs, i + +# if features too unbalanced, clustering learns the feature size instead of cluster difference +for i in range(0,segments.shape[1]): + segments.iloc[:,i] = (segments.iloc[:,i] - segments.iloc[:,i].min()) / \ + (segments.iloc[:,i].max() - segments.iloc[:,i].min()) + +#------------------------------------------------------------------------------ +# explore (relation 
+#------------------------------------------------------------------------------
+# explore (the relation between) features
+matrix = segments.corr()       # correlation matrix
+summary = segments.describe()  # summary
+
+fig, ax = plt.subplots(nrows=1,ncols=2, figsize=(14,6))
+bp = sns.boxplot(data = segments, ax = ax[0], palette = "viridis", fliersize = 2)
+if segments.shape[1] < 10:
+    sns.heatmap(segments.corr(), cmap = "RdBu", annot = True,
+                annot_kws = {"fontsize" : 6}, ax = ax[1])
+else:
+    sns.heatmap(segments.corr(), cmap = "RdBu", annot = False, ax = ax[1])
+ax[0].set_ylabel("Standardised feature values")
+ax[0].set_xlabel("Feature")
+ax[0].set_title("Feature distribution")
+ax[1].set_title("Feature correlations")
+ax[0].set_xticklabels(ax[0].get_xticklabels(),rotation=45)
+ax[1].set_xticklabels(ax[1].get_xticklabels(),rotation=45)
+
+del i, fig, ax
+del bp
+
+# set feature list: all feature columns that belong to a selected user_feat
+flist = []
+for idx in range(0,segments.shape[1]):
+    ad = [segments.columns[idx] for m in settings["user_feat"] if m in segments.columns[idx]]
+    if len(ad) > 0:
+        flist.append(ad[0])
+
+#------------------------------------------------------------------------------
+# add ground truth, day and id to the "segments" data frame; this must happen
+# after the feature rescaling above and before the plots below that use them
+gs_index = data["segment"].drop_duplicates().index.values
+segments["truth"] = data[settings["ground_truth"]].iloc[gs_index].values
+segments["day"] = data["day"].iloc[gs_index].values
+segments["id"] = data[settings["id"]].iloc[gs_index].values
+del gs_index
+
+#------------------------------------------------------------------------------
+# make box plots of features per dimension and per class in "truth"
+fig,axes = plt.subplots(ncols = dim, nrows = len(settings["user_feat"]),
+                        figsize = (5*dim,4*len(settings["user_feat"])), sharex = True)
+beh = ["walk","cubicle","feed"]
+for i in range(len(settings["user_feat"])):
+    for j in range(dim):
+        colname = settings["user_feat"][i] + '_' + str(j)
+        print(colname)
+        sns.boxplot(x = "truth", y = colname, data = segments, ax = axes[i][j])
+        if i == len(settings["user_feat"])-1:
+            axes[i][j].set_xticklabels(beh,rotation=0)
+        if i == 0:
+            axes[i][j].set_title(settings["variables"][j])
+
+#------------------------------------------------------------------------------
+# bar plots per behaviour per cow per day (number of segments)
+days = segments["day"].drop_duplicates().reset_index(drop=1)
+ids = segments["id"].drop_duplicates().reset_index(drop=1)
+
+for j in range(len(ids)):
+    fig, axes = plt.subplots(ncols = 1,
+                             nrows = 2,
+                             figsize = (20,15))
+
+    subset = segments.loc[segments["id"] == ids[j],["day","truth","id"]]
+    subset1 = subset.groupby(by = ["day","truth"]).count().reset_index()
+    subset2 = subset[["day","id"]].groupby(by = ["day"]).count().reset_index()
+    dfcol = pd.merge(subset1,subset2,how = "outer", on = "day")
+    dfcol["perc"] = dfcol["id_x"]/dfcol["id_y"]*100
+    sns.barplot(data = dfcol, x = "day" , y = "perc", palette = "rocket", hue = "truth", ax = axes[0])
+    h, l = axes[0].get_legend_handles_labels()
+    axes[0].legend(h, beh, title="Behaviour")
+    axes[0].set_title("cow " + str(round(ids[j])))
+    axes[0].set_ylabel("percentage of segments in gold standard")
+    sns.barplot(data = dfcol, x = "day" , y = "id_x", palette = "rocket", hue = "truth", ax = axes[1])
+    h, l = axes[1].get_legend_handles_labels()
+    axes[1].legend(h, beh, title="Behaviour")
+    axes[1].set_title("cow " + str(round(ids[j])))
+    axes[1].set_ylabel("number of segments in gold standard")
+    plt.savefig(path_out + "//overview_behaviour_cow_" + str(round(ids[j])))
+    plt.close()
+
+del ids, days
+
+#%% data split
+
+# ground truth, day and id were added to "segments" above
+if "time" in settings["data_split_by"]:
+
+    # for each cow, select the first settings["data_split_days"] days as the training set
+    df = {"train" : segments.loc[segments["day"] <= settings["data_split_days"],:],
+          "test" : segments.loc[segments["day"] > settings["data_split_days"],:]}
+
+else:
+    # use sklearn model selection train_test_split, stratified for behaviour within cow
+    X_train, X_test = train_test_split(segments,
+                                       test_size = 1-(settings["data_partitioning"]/100),
+                                       stratify = segments[["id","truth"]]
+                                       )
+    X_train = X_train.sort_index(axis=0)
+    X_test = X_test.sort_index(axis=0)
+
+    # put in dictionary
+    df = {"train" : X_train,
+          "test" : X_test}
+
+    del X_train, X_test
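+#------------------------------------------------------------------------------
+# settings["crossval"] is defined but not used yet (see the TODOs at the top);
+# a possible sketch, assuming folds should be grouped by animal so that no cow
+# appears in both the training and the validation fold (commented out):
+# from sklearn.model_selection import GroupKFold
+# gkf = GroupKFold(n_splits = settings["crossval"])
+# for trn_idx, val_idx in gkf.split(segments[flist], segments["truth"],
+#                                   groups = segments["id"]):
+#     pass  # fit/evaluate the candidate models per fold here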
settings["data_split_days"],:], + "test" : segments.loc[segments["day"] > settings["data_split_days"],:]} + +else: + # use sklearn model selection train_test_split + X_train, X_test = train_test_split(segments, + test_size = 1-(settings["data_partitioning"]/100), + stratify = segments[["id","truth"]] + ) # stratify for behaviours + X_train = X_train.sort_index(axis=0) + X_test = X_test.sort_index(axis=0) + + # put in dictionary + df = {"train" : X_train, + "test" : X_test} + + del X_train, X_test + +#------------------------------------------------------------------------------ +# TODO: explore test and train datasets created -- overall +# TODO: explore test and train datasets created -- per individual id +# !!! set feature list + + + + + + + + + +#%% supervised learning for all ids in the dataset + +from sklearn.svm import SVC +from sklearn.neighbors import KNeighborsClassifier +from sklearn.tree import DecisionTreeClassifier +from sklearn.ensemble import RandomForestClassifier +from sklearn.naive_bayes import GaussianNB + +model_pipeline = [] +model_pipeline.append(SVC()) +model_pipeline.append(KNeighborsClassifier()) +model_pipeline.append(DecisionTreeClassifier()) +model_pipeline.append(RandomForestClassifier()) +model_pipeline.append(GaussianNB()) + +#Model running and evaluation +from sklearn import metrics +from sklearn.metrics import classification_report +from sklearn.metrics import confusion_matrix + +model_list = ["SVM", "KNN","Decision Tree", "Random Forest", "Naive Bayes"] +acc_list = [] +auc_list = [] +cm_list = [] + + + +for model in model_pipeline: + print(model) + model.fit(df["train"][flist], df["train"]["truth"]) + y_pred = model.predict(df["test"][flist]) + acc_list.append(metrics.accuracy_score(df["test"]["truth"], y_pred)) + #fpr, tpr, _thresholds = metrics.roc_curve(np.array(y_test), y_pred) + #auc_list.append(round(metrics.auc(fpr, tpr), 2)) + cm_list.append(confusion_matrix(df["test"]["truth"], y_pred)) + +#visualise confusion matrix +fig = plt.figure(figsize = (15,10)) +for i in range(len(cm_list)): + print(i) + cm = cm_list[i] + model = model_list[i] + sub = fig.add_subplot(2,3,i+1).set_title(model) + cm_plot = sns.heatmap(cm, annot = True, cmap = 'Blues_r') + cm_plot.set_xlabel('Predicted Values') + cm_plot.set_ylabel('Actual Values') + +# Summarize accuracy +result_df = pd.DataFrame({'Model': model_list, 'Accuracy': acc_list}) + + + +#%% supervised learning per individual id in the dataset + +from sklearn.svm import SVC +from sklearn.neighbors import KNeighborsClassifier +from sklearn.tree import DecisionTreeClassifier +from sklearn.ensemble import RandomForestClassifier +from sklearn.naive_bayes import GaussianNB + +model_pipeline = [] +model_pipeline.append(SVC()) +model_pipeline.append(KNeighborsClassifier()) +model_pipeline.append(DecisionTreeClassifier()) +model_pipeline.append(RandomForestClassifier()) +model_pipeline.append(GaussianNB()) + +#Model running and evaluation +from sklearn import metrics +from sklearn.metrics import classification_report +from sklearn.metrics import confusion_matrix + +model_list = ["SVM", "KNN","Decision Tree", "Random Forest", "Naive Bayes"] +acc_list = [] +auc_list = [] +cm_list = [] + +# set feature list +flist = [] +for idx in range(0,segments.shape[1]): + ad = [segments.columns[idx] for m in settings["user_feat"] if m in segments.columns[idx]] + if len(ad) > 0: + flist.append(ad[0]) + +for model in model_pipeline: + model.fit(df["train"][flist], df["test"]["truth"]) + y_pred = model.predict(x_test) + 
+#%% supervised learning per individual id in the dataset
+
+# the classifiers, metrics and feature list flist are already defined above;
+# rebuild an unfitted model pipeline and fit/evaluate per individual cow
+model_pipeline = [SVC(), KNeighborsClassifier(), DecisionTreeClassifier(),
+                  RandomForestClassifier(), GaussianNB()]
+
+ids = df["train"]["id"].drop_duplicates().reset_index(drop=1)
+for cow in ids:
+    train = df["train"].loc[df["train"]["id"] == cow,:]
+    test = df["test"].loc[df["test"]["id"] == cow,:]
+    acc_list = []
+    cm_list = []
+    for model in model_pipeline:
+        model.fit(train[flist], train["truth"])
+        y_pred = model.predict(test[flist])
+        acc_list.append(metrics.accuracy_score(test["truth"], y_pred))
+        #fpr, tpr, _thresholds = metrics.roc_curve(np.array(test["truth"]), y_pred)
+        #auc_list.append(round(metrics.auc(fpr, tpr), 2))
+        cm_list.append(confusion_matrix(test["truth"], y_pred))
+
+    # visualise confusion matrices for this cow
+    fig = plt.figure(figsize = (15,10))
+    for i in range(len(cm_list)):
+        cm = cm_list[i]
+        sub = fig.add_subplot(2,3,i+1)
+        cm_plot = sns.heatmap(cm, annot = True, cmap = 'Blues_r', ax = sub)
+        cm_plot.set_title(model_list[i] + ", cow " + str(round(cow)))
+        cm_plot.set_xlabel('Predicted Values')
+        cm_plot.set_ylabel('Actual Values')
+
+    # summarise accuracy for this cow
+    result_df = pd.DataFrame({'Model': model_list, 'Accuracy': acc_list})
+    print(result_df)
diff --git a/tool/overview_behaviour_cow_1091.png b/tool/overview_behaviour_cow_1091.png
new file mode 100644
index 0000000000000000000000000000000000000000..024587fccdac51559caba24829966328de7c3fc0
Binary files /dev/null and b/tool/overview_behaviour_cow_1091.png differ
diff --git a/tool/overview_behaviour_cow_1681.png b/tool/overview_behaviour_cow_1681.png
new file mode 100644
index 0000000000000000000000000000000000000000..ce0a9c783ea857036ee714f650ae7c3a40ebdce2
Binary files /dev/null and b/tool/overview_behaviour_cow_1681.png differ
diff --git a/tool/overview_behaviour_cow_3089.png b/tool/overview_behaviour_cow_3089.png
new file mode 100644
index 0000000000000000000000000000000000000000..ba6f5ab45f03346df7a8d4a9c9b8d12fd06e1ac8
Binary files /dev/null and b/tool/overview_behaviour_cow_3089.png differ
diff --git a/tool/overview_behaviour_cow_3106.png b/tool/overview_behaviour_cow_3106.png
new file mode 100644
index 0000000000000000000000000000000000000000..2059e80fc42505ec0d71c3120a4f6e22cb1ce34e
Binary files /dev/null and b/tool/overview_behaviour_cow_3106.png differ
diff --git a/tool/overview_behaviour_cow_3152.png b/tool/overview_behaviour_cow_3152.png
new file mode 100644
index 0000000000000000000000000000000000000000..12b358e73fb9e54e947a8b8d37d4d6e710db94cd
Binary files /dev/null and b/tool/overview_behaviour_cow_3152.png differ
diff --git a/tool/overview_behaviour_cow_419.png b/tool/overview_behaviour_cow_419.png
new file mode 100644
index 0000000000000000000000000000000000000000..594bd9f58fe1db6bd3bc5b51da0b1a0c6abee537
Binary files /dev/null and b/tool/overview_behaviour_cow_419.png differ
diff --git a/tool/overview_behaviour_cow_605.png b/tool/overview_behaviour_cow_605.png
new file mode 100644
index 0000000000000000000000000000000000000000..082cd00bb22fb0e7b9f5c11ef653844088dcd97e
Binary files /dev/null and b/tool/overview_behaviour_cow_605.png differ
diff --git a/tool/overview_behaviour_cow_7387.png b/tool/overview_behaviour_cow_7387.png
new file mode 100644
index 0000000000000000000000000000000000000000..47576006fe551e94015debc7800e72a30400387b
Binary files /dev/null and b/tool/overview_behaviour_cow_7387.png differ
diff --git a/tool/overview_behaviour_cow_821.png b/tool/overview_behaviour_cow_821.png
new file mode 100644
index 0000000000000000000000000000000000000000..042b599a2ac748239cc9dd2fd0169fe8e51a56eb
Binary files /dev/null and b/tool/overview_behaviour_cow_821.png differ
diff --git a/tool/segmentation.py b/tool/segmentation.py
index 53dd3273c2816254554d3d4ea60913ec87b3516a..43fdd2e7fb0e3a70b8c3af86e8a1a777918fad9b 100644
--- a/tool/segmentation.py
+++ b/tool/segmentation.py
@@ -103,150 +103,169 @@ import numpy as np
 import matplotlib.pyplot as plt
 import ruptures as rpt
 from scipy.signal import medfilt
-
-# path
-path = os.path.join("C:","/Users","adria036",
-                    "OneDrive - Wageningen University & Research","iAdriaens_doc",
-                    "Projects","iAdriaens","bait","data","preprocessed")
+# path to data
+# path = os.path.join("C:","/Users","adria036",
+#                     "OneDrive - Wageningen University & Research","iAdriaens_doc",
+#                     "Projects","iAdriaens","bait","data","preprocessed")
+# path to data SP
+path = os.path.join("W:","ASG","WLR_Dataopslag","Genomica","Sustainable_breeding",
+                    "4400003586_BAIT","Dataset_Ines_Marja")
+
+fns = [f for f in os.listdir(path) \
+       if os.path.isfile(os.path.join(path,f)) \
+       and ("newseldata_cow_" in f) \
+       and ("_2022" not in f)]
+fns = pd.DataFrame(fns,columns = ["fn"])
+fns["cowid"] = fns["fn"].str[15:-4].astype(int)
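+# the fixed slicing above assumes file names of the form "newseldata_cow_<id>.txt";
+# a less position-sensitive alternative would be a regex sketch such as:
+# fns["cowid"] = fns["fn"].str.extract(r"newseldata_cow_(\d+)")[0].astype(int)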
 
 # read dataset accelerations
 # data = pd.read_csv(path+"\\DCdata2.txt", index_col=0)
 # data["at"] = pd.to_datetime(data["at"],format = "%Y-%m-%d %H:%M:%S.%f%z")
-
 # read dataset
 # data = pd.read_csv(path+"\\DCdata.csv", index_col=0)
 # data["at"] = pd.to_datetime(data["at"],format = "%Y-%m-%d %H:%M:%S.%f%z")
 #
 # #data = data[["cowid","at","t","gap","activity"]]
-# dataset sheep
-data = pd.read_csv(path+"\\HDdata_sheep1_back.csv", index_col=0)
-data["at"] = pd.to_datetime(data["at"],format = "%Y-%m-%d %H:%M:%S")
-
-# plot
-# fig,ax = plt.subplots(nrows=3,ncols=1,figsize = (20,10),sharex=True)
-# data2 = data.loc[data["at"].dt.day==6,:].sort_values(by="at").reset_index(drop=1)
-# ax[0].plot(data2["at"],data2["acc_x"],color = "teal")
-# ax[1].plot(data2["at"],data2["acc_y"],color = "teal")
-# ax[2].plot(data2["at"],data2["acc_z"],color = "teal")
-# ax[0].plot(data2["at"],data2["acc_xm"],color = "k")
-# ax[1].plot(data2["at"],data2["acc_ym"],color = "k")
-# ax[2].plot(data2["at"],data2["acc_zm"],color = "k")
-
-del path
-
-#%% segmentation tests
-# savepath
-path = os.path.join("C:","/Users","adria036",
-                    "OneDrive - Wageningen University & Research","iAdriaens_doc",
-                    "Projects","iAdriaens","bait","results","segmentation")
-"""
-# define parameters for segmentation
-
-# set minimum time instead of minimum size and set signal
-"""
-model = "normal"
-min_size = 2*60 # minimum distance between changepoints
-dim = 3
-# variable = ["acc_x","acc_y","acc_z"]
-variable = ["acc_xm","acc_ym","acc_zm"]
-idname = "id"
-tname = "at"
-medfilter = 0
-# select data and perform segmentation
-
-allids = data[idname].drop_duplicates()
-nseg = 1
-brkpnts = pd.DataFrame([])
-output = pd.DataFrame([])
-for ids in allids:
-    days = data[tname].dt.day.drop_duplicates()
-    for day in days:
-        # select data
-        signal = data.loc[(data[idname]==ids) & \
-                          (data[tname].dt.day == day) \
-                          ,variable]
-        signal = signal.dropna()
-        indx = signal.index.values
-        signal = signal.to_numpy()
-
-        # filter data to remove noise / errors
-        if (medfilter == 1):
-            for d in range(0,signal.shape[1]):
-                mfilt = medfilt(signal[:,d],21)
-                signal[:,d] = mfilt
-
-        # set penalty values // rule of thumb = log(n)*dim*std(signal)
-        #pen = np.log(len(signal)) * dim * np.std(signal, axis=0).mean()**2
-        pen = np.log(len(signal)) * dim * np.std(signal)**2
-        if pen < 700:
-            pen = 950
-        #pen = 950
-        print("id = " + str(ids))
-        print("pen = " + str(pen))
-        print("day = " + str(day))
-        # if segment std ~= signal std- this might not work
-
+# # dataset sheep
+# data = pd.read_csv(path+"\\HDdata_sheep1_back.csv", index_col=0)
+# data["at"] = pd.to_datetime(data["at"],format = "%Y-%m-%d %H:%M:%S")
+
+# read data SP
+for f in fns["fn"]:
+    print(f)
+#   fn = "newseldata_cow_280.txt"
+    data = pd.read_csv(os.path.join(path,f),
+                       usecols = ["cowid","barn","date","t","acc_xs","acc_ys","acc_zs"] )
+    data["date"] = pd.to_datetime(data["date"],format = "%Y-%m-%d")
+    data["datetime"] = data["date"] + pd.to_timedelta(data["t"], unit = "s")
+    data["day"] = (data["date"] - data["date"].min()).dt.days
+
+    # plot (%matplotlib qt) ---- this is an example
+    # fig,ax = plt.subplots(nrows=3,ncols=1,figsize = (20,10),sharex=True)
+    # data2 = data.loc[data["date"].dt.day==27,:].sort_values(by="t").reset_index(drop=1)
+    # ax[0].plot(data2["t"],data2["acc_xs"],color = "lightgrey")
+    # ax[1].plot(data2["t"],data2["acc_ys"],color = "teal")
+    # ax[2].plot(data2["t"],data2["acc_zs"],color = "teal")
+    # ax[0].plot(data2["t"],data2["acc_xs"].rolling(10).median(),color = "k")
+    # ax[1].plot(data2["t"],data2["acc_ys"].rolling(10).median(),color = "k")
+    # ax[2].plot(data2["t"],data2["acc_zs"].rolling(10).median(),color = "k")
+
+    #%% segmentation tests
+    # savepath
+    # path = os.path.join("C:","/Users","adria036",
+    #                     "OneDrive - Wageningen University & Research","iAdriaens_doc",
+    #                     "Projects","iAdriaens","bait","results","segmentation")
+    svpath = os.path.join("W:","ASG","WLR_Dataopslag","Genomica","Sustainable_breeding",
+                          "4400003586_BAIT","Dataset_Ines_Marja","segmentation")
+
+    # define parameters for segmentation
+    # (set a minimum time instead of a minimum size, and set the signal)
+    model = "normal"
+    min_size = 2*60  # minimum distance between changepoints, in seconds
+    dim = 3
+    # variable = ["acc_x","acc_y","acc_z"]
+    variable = ["acc_xs","acc_ys","acc_zs"]
+    idname = "cowid"
+    tname = "datetime"
+    medfilter = 1
+    # select data and perform segmentation
-
-        # fit and define changepoint model
-        # algo = rpt.Pelt(model=model,min_size=min_size).fit(signal)
-
-        c = rpt.costs.CostNormal().fit(signal)
-        algo = rpt.Pelt(custom_cost=rpt.costs.CostNormal(),min_size=min_size).fit(signal)
-        #algo = rpt.Pelt(model="rbf",min_size=min_size).fit(signal)
-        cpts = algo.predict(pen = pen)
-
-        # save breakpoints in new variable
-        df = pd.DataFrame(np.linspace(nseg,nseg+len(cpts)-1,len(cpts)).astype(int),columns=["segment"])
-        df["start"] = [0]+cpts[:-1]
-        df["end"] = cpts[:-1]+[len(signal)]
-        df["end"] = df["end"]-1
-        nseg = nseg + len(cpts)
-
-        # initiate figure
-        fig, axes = rpt.display(signal, cpts,figsize = (18,9))
-
-        # add breakpoints / segments to signal
-        signal = pd.DataFrame(signal,columns = variable)
-        signal["segment"] = np.nan
-        signal.iloc[df["start"].values,signal.columns.get_indexer(["segment"])] = df["segment"].values
-        signal.iloc[df["end"].values,signal.columns.get_indexer(["segment"])] = df["segment"].values
-        signal = signal.fillna(method="pad")
-
-        signal.index = indx
-        signal["at"] = data.loc[(data[idname]==ids) & \
-                                (data[tname].dt.day == day) & \
-                                (~data[tname].isna()),"at"]
-        signal["date2"] = signal["at"].dt.strftime("%d/%m %H:%M")
-        signal["id"] = ids
-
-        # plot
-        axes[0].set_title("acceleration in x direction")
-        axes[1].set_title("acceleration in y direction")
-        axes[2].set_title("acceleration in z direction")
-
-        axes[0].set_ylabel("acceleration [m/s²]")
-        axes[1].set_ylabel("acceleration [m/s²]")
-        axes[2].set_ylabel("acceleration [m/s²]")
-        axes[2].set_xlabel("date")
-        try:
-            labels = signal.index.values[0:-1:10000] #-signal.index.values[0]
-            labels = labels.tolist()
-            labelnames = signal['date2'].iloc[labels]
-            plt.xticks(labels,labels=labelnames)
-        except:
-            #donothing
-            print("failed to set date labels")
-        plt.tight_layout()
-        plt.savefig(path+"\\add_segm_sheep1back"+str(round(ids))+"_day"+str(round(day))+".tif")
-        plt.close()
+    allids = data[idname].drop_duplicates()
+    nseg = 1
+    brkpnts = pd.DataFrame([])
+    output = pd.DataFrame([])
+    for ids in allids:
+        days = data["day"].drop_duplicates()
+        for day in days:
+            # select data
+            signal = data.loc[(data[idname]==ids) & \
+                              (data["day"] == day) \
+                              ,variable]
+            signal = signal.dropna()
+            indx = signal.index.values
+            signal = signal.to_numpy()
+            if len(signal) > 3600:
 
-
-        # add signal to output and add df to brkpnts
-        output = pd.concat([output,signal])
-        brkpnts = pd.concat([brkpnts,df])
-
-output.to_csv(path + "\\HD_sheep1back_segmented_2min.csv")
-brkpnts.to_csv(path+"\\HD_sheep1back_breakpoints_2min.csv")
-
-#%% Create outputpath =
+                # filter the data to remove noise / errors
+                if (medfilter == 1):
+                    for d in range(0,signal.shape[1]):
+                        mfilt = medfilt(signal[:,d],5)
+                        signal[:,d] = mfilt
+
+                # set penalty value // rule of thumb = log(n)*dim*std(signal)**2
+                # pen = np.log(len(signal)) * dim * np.std(signal)**2
+                pen = 350  # the rule of thumb is overridden: for the SP data a fixed penalty seems to work well
+
+                print("id = " + str(ids))
+                print("pen = " + str(pen))
+                print("day = " + str(day))
+                # if segment std ~= signal std, this might not work
+
+                # fit and define the changepoint model
+                # algo = rpt.Pelt(model=model,min_size=min_size).fit(signal)
+                algo = rpt.Pelt(custom_cost=rpt.costs.CostNormal(),min_size=min_size).fit(signal)
+                # algo = rpt.Pelt(model="rbf",min_size=min_size).fit(signal)
+                cpts = algo.predict(pen = pen)
+
+                # save breakpoints in a new variable
+                df = pd.DataFrame(np.linspace(nseg,nseg+len(cpts)-1,len(cpts)).astype(int),columns=["segment"])
+                df["start"] = [0]+cpts[:-1]
+                df["end"] = cpts[:-1]+[len(signal)]
+                df["end"] = df["end"]-1
+                nseg = nseg + len(cpts)
+
+                # initiate figure
+                fig, axes = rpt.display(signal, cpts,figsize = (18,9))
+
+                # add breakpoints / segments to signal
+                signal = pd.DataFrame(signal,columns = variable)
+                signal["segment"] = np.nan
+                signal.iloc[df["start"].values,signal.columns.get_indexer(["segment"])] = df["segment"].values
+                signal.iloc[df["end"].values,signal.columns.get_indexer(["segment"])] = df["segment"].values
+                signal = signal.ffill()
+
+                signal.index = indx
+                signal["at"] = data.loc[(data[idname]==ids) & \
+                                        (data["day"] == day) & \
+                                        (~data[tname].isna()),tname]
+                signal["date2"] = signal["at"].dt.strftime("%d/%m %H:%M")
+                signal["id"] = ids
+
+                # plot
+                axes[0].set_title("cow = " + str(int(ids)) + ", day = " + str(day) + ": acceleration in x direction")
+                axes[1].set_title("acceleration in y direction")
+                axes[2].set_title("acceleration in z direction")
+
+                axes[0].set_ylabel("acceleration [m/s²]")
+                axes[1].set_ylabel("acceleration [m/s²]")
+                axes[2].set_ylabel("acceleration [m/s²]")
+                axes[2].set_xlabel("measurement no.")
+
+                plt.tight_layout()
+                plt.savefig(svpath+"\\add_segm_cow_"+str(round(ids))+"_day"+str(round(day))+".tif")
+                plt.close()
+
+                # add signal to output and add df to brkpnts
+                output = pd.concat([output,signal])
+                brkpnts = pd.concat([brkpnts,df])
+
+    output.to_csv(svpath + "\\SP_cow_" + str(ids) + "_segmented_2min.txt")
+    brkpnts.to_csv(svpath+"\\SP_cow_" + str(ids) + "_breakpoints_2min.txt")
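+
+#%% sanity check: PELT with CostNormal on a synthetic signal
+# A self-contained, illustrative sketch (toy values, not project data): a 1 Hz
+# signal with two known level shifts should yield breakpoints near samples
+# 600 and 1200, which gives a quick check of the chosen cost/penalty behaviour.
+rng = np.random.default_rng(1)
+demo = np.concatenate([rng.normal(0,0.1,600),
+                       rng.normal(1,0.1,600),
+                       rng.normal(0,0.1,600)]).reshape(-1,1)
+demo_algo = rpt.Pelt(custom_cost=rpt.costs.CostNormal(),min_size=120).fit(demo)
+print(demo_algo.predict(pen = 10))  # expected: approximately [600, 1200, 1800]
+del rng, demo, demo_algo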