Commit 61a02b7c authored by Ines Adriaens

Merge branch 'data' into 'main'

Data

See merge request !3
parents a58cfd59 3e5c661b
**date**: November 21, 2022
**author**: Ines Adriaens
## Overview of tools for changepoint analysis
**Ruptures**
------------
[documentation](https://centre-borelli.github.io/ruptures-docs/)
[user guide](https://centre-borelli.github.io/ruptures-docs/user-guide/)
[code](https://github.com/deepcharles/ruptures)
[manuscript](http://www.laurentoudre.fr/publis/TOG-SP-19.pdf)
- exact and approximate changepoint detection for segmentation of non-stationary signals;
- PELT available (Pruned Exact Linear Time)
- different cost functions (mean, median, mean/cov, linear, etc) available
- documentation / explanations are somewhat limited
- works with multivariate data, but this is not well documented
- still actively maintained
**sdt-python**
--------------
[documentation](https://schuetzgroup.github.io/sdt-python/index.html)
[code](https://github.com/schuetzgroup/sdt-python)
[zenodo, doi](https://zenodo.org/record/7078957#.Y3uN1XbMKUk)
- developed for processing of fluorescence microscopic images
- more restricted in scope (tailored to fluorescence-microscopy workflows)
- PELT available, also Bayesian offline and online methods
- last update 2020
**Changepy**
------------
[code](https://github.com/ruipgil/changepy)
[R package](https://github.com/rkillick/changepoint/)
- only PELT implemented
- last update in 2017
- Python gateway to the R changepoint package (rkillick)
- limited number of cost functions available, incl. mean, var, meanvar, Poisson and exponential
--------------------------------------------------------------
### Choice
**ruptures**
\ No newline at end of file
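A minimal usage sketch with ruptures and PELT; the signal, cost model and penalty below are illustrative, not project settings:

```python
import numpy as np
import ruptures as rpt

# simulated piecewise-constant signal with two changepoints (illustrative only)
rng = np.random.default_rng(0)
signal = np.concatenate([rng.normal(0, 1, 200),
                         rng.normal(3, 1, 200),
                         rng.normal(0, 1, 200)])

# PELT search with an L2 (mean-shift) cost; min_size keeps changepoints apart
algo = rpt.Pelt(model="l2", min_size=5).fit(signal)
breakpoints = algo.predict(pen=10)  # penalty per added changepoint
print(breakpoints)                  # e.g. [200, 400, 600]
```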
......@@ -56,16 +56,16 @@ path_sql = os.path.join(
# set path to results folder
path_res = os.path.join("C:","/Users","adria036","OneDrive - Wageningen University & Research",
"iAdriaens_doc","Projects","iAdriaens","bait","scripts",
"bait","data","nlas_ines/"
"iAdriaens_doc","Projects","iAdriaens","bait","data",
"nlas_ines/"
)
# set file name for cowid information
fn = "\Copy_of_Sewio_tags.xlsx"
# set start and end dates
startdate = date(2022,7,16)
enddate = date(2022,7,16)
startdate = date(2022,7,1)
enddate = date(2022,7,10)
# set parameters for the imputation and interpolation
win_med = 31
......@@ -120,36 +120,44 @@ for dd in range(t_interval.days+1):
df = pd.concat([cowids,nsec],axis = 1).drop_duplicates() # gives the indices of unique first
del df["relsec"], df["cowid"]
# gap in acceleration data
data["gap"] = data["at"].diff()
test = data["gap"]*1000
data["gap"] = test.dt.seconds/1000
del test
data.loc[data["gap"] < 0,"gap"] = np.nan # set to nan when different cowid
# innerjoin for selection step - unique seconds in the dataset
data = pd.concat([data,df],axis = 1, join = 'inner')
data = data.sort_values(by = ["cowid","at"]).reset_index(drop=1)
data_pos = pd.concat([data,df],axis = 1, join = 'inner')
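# keep the original row index as observation ID (OID) to merge results back onto the acceleration data later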
data_pos["OID"] = data_pos.index.values
data_pos = data_pos.sort_values(by = ["cowid","at"]).reset_index(drop=1)
# unfiltered data - one per second or less, all cows included
# calculate gaps based on numeric time relative to start of dataset
data["gap"] = data["relsec"].diff()
data.loc[data["gap"] < 0,"gap"] = np.nan # set to nan when different cowid
data_pos["gap"] = data_pos["relsec"].diff()
data_pos.loc[data_pos["gap"] < 0,"gap"] = np.nan # set to nan when different cowid
#--------------------------edit barn edges---------------------------------
# clip coordinates outside the barn edges: barn 72 keeps x in [0, 10.6], barn 70 keeps x in [21, 32]
data.loc[(data["X"] < 15),"X"] = filter_barn_edges( \
copy.copy(data.loc[(data["X"] < 15),"X"]),0,10.6) #barn 72
data.loc[(data["X"] >= 15),"X"] = filter_barn_edges( \
copy.copy(data.loc[(data["X"] >= 15),"X"]),21,32) # barn 70
data_pos.loc[(data_pos["X"] < 15),"X"] = filter_barn_edges( \
copy.copy(data_pos.loc[(data_pos["X"] < 15),"X"]),0,10.6) #barn 72
data_pos.loc[(data_pos["X"] >= 15),"X"] = filter_barn_edges( \
copy.copy(data_pos.loc[(data_pos["X"] >= 15),"X"]),21,32) # barn 70
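# NOTE: hedged sketch only - filter_barn_edges is defined elsewhere in this repo;
# judging from the comments above it clips coordinates to the given edges, roughly:
#   series.clip(lower=edge_low, upper=edge_high)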
# set data outside edges to edges y < -18 or y > -2.5
data["y"] = filter_barn_edges(copy.copy(data["y"]),-18,-2.5)
data_pos["y"] = filter_barn_edges(copy.copy(data["y"]),-18,-2.5)
#data["X"].hist()
#data["y"].hist()
# delete rows without cowid
data = data.loc[data["cowid"]>0,:]
data_pos = data_pos.loc[data_pos["cowid"]>0,:]
#-----------------------filter & interpolate per cow-----------------------
# filter the (X,y) time series with a median filter and interpolate
# cows in the barn
cows = data["cowid"].drop_duplicates().sort_values().reset_index(drop=1)
cows = data_pos["cowid"].drop_duplicates().sort_values().reset_index(drop=1)
cows = pd.DataFrame(cows)
# loop over all data per cowid and select (x,y,t) data to filter
......@@ -164,9 +172,10 @@ for dd in range(t_interval.days+1):
"area" : []})
for i in range(0,len(cows)):
# select data
cow_t = data.loc[(data["cowid"] == cows["cowid"][i]),"relsec"]
cow_x = data.loc[(data["cowid"] == cows["cowid"][i]),"X"]
cow_y = data.loc[(data["cowid"] == cows["cowid"][i]),"y"]
cow_OID = data_pos.loc[(data_pos["cowid"] == cows["cowid"][i]),["OID","relsec"]]
cow_t = data_pos.loc[(data_pos["cowid"] == cows["cowid"][i]),"relsec"]
cow_x = data_pos.loc[(data_pos["cowid"] == cows["cowid"][i]),"X"]
cow_y = data_pos.loc[(data_pos["cowid"] == cows["cowid"][i]),"y"]
# filter data
if len(cow_t) > 10:
......@@ -181,8 +190,7 @@ for dd in range(t_interval.days+1):
cow_t = pd.concat([pd.Series(cow_t.iloc[0]//86400*86400),cow_t], axis=0)
cow_x = pd.concat([pd.Series(np.nan),cow_x], axis=0)
cow_y = pd.concat([pd.Series(np.nan),cow_y], axis=0)
# TODO: add accelerations
# filter
df,x,y = filter_interp(cow_t,cow_x,cow_y,win_med,gap_thres)
......@@ -199,6 +207,9 @@ for dd in range(t_interval.days+1):
df["cowid"] = cows["cowid"][i]
df["date"] = str(startdate+dd)
df = df[["cowid","date","t","gap","X","y","xnew","ynew","area"]]
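# cast t to int so it matches relsec when merging the observation IDs back in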
df["t"] = df["t"].astype(int)
df = pd.merge(df,cow_OID,how="outer",left_on = "t",right_on = "relsec")
df.loc[df["xnew"].isna(),"area"] = np.nan
else:
df = pd.DataFrame({"cowid":cows["cowid"][i],
"date":str(startdate+dd),
......@@ -208,15 +219,24 @@ for dd in range(t_interval.days+1):
"y" : pd.Series(np.arange(0,86400,1)*np.nan),
"xnew" : pd.Series(np.arange(0,86400,1)*np.nan),
"ynew" : pd.Series(np.arange(0,86400,1)*np.nan),
"area" : pd.Series(np.arange(0,86400,1)*np.nan)})
"area" : pd.Series(np.arange(0,86400,1)*np.nan),
"relsec" : pd.Series(np.arange(0,86400,1)*np.nan),
"OID" : pd.Series(np.arange(0,86400,1)*np.nan)})
# concat dataframes all cows together
result = pd.concat([result,df])
# concatenate result with the acceleration data on "OID"
data["OID"] = data.index.values
result = pd.merge(data,result[["t","xnew","ynew","area","relsec","OID"]],
how="outer",
left_on = ["OID","relsec"],
right_on = ["OID","relsec"])
# save results
datestr = str(startdate+dd)
datestr = datestr.replace("-","")
fns = datestr + '_bec3.txt'
fns = datestr + '_bait.txt'
result.to_csv(path_res + fns, index = False)
# clear workspace and memory
......
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 15 09:06:43 2022
@author: adria036
"""
import os
os.chdir(r"C:\Users\adria036\OneDrive - Wageningen University & Research\iAdriaens_doc\Projects\iAdriaens\bait\scripts\bait\datapreparation\ines")
#%% import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# define functions
def accel_act(df,window):
"""
combine the accelerations in the x, y and z dimensions into a general activity
parameter.
Step 1 : rescale values in each direction
Step 2 : set level of activity based on amplitude in a preset window
Step 3 : combine the activity levels of the 3 directions
df: pandas DataFrame with columns 'acc_x', 'acc_y', 'acc_z', 'id', 'at'
window: window size in seconds over which activity is accumulated
"""
svpath = os.path.join("C:","/Users","adria036",
"OneDrive - Wageningen University & Research","iAdriaens_doc",
"Projects","iAdriaens","bait","results")
if "cowid" in df.columns:
df = df.rename(columns = {"cowid":"id"})
namechange = 1
else:
namechange = 0
# calculate activity
ids = df["id"].drop_duplicates().reset_index(drop=1)
output = pd.DataFrame([])
for subj in ids:
print("Calculating activity for subj = " + str(int(subj)))
sset = df.loc[(df["id"]==subj) & \
(~df["acc_x"].isna()) & \
(~df["acc_y"].isna()) & \
(~df["acc_z"].isna()),:].copy()
# standardise and scale measurements
sset["acc_xs"] = (sset["acc_x"]-sset["acc_x"].min())/(sset["acc_x"].max()-sset["acc_x"].min())
sset["acc_ys"] = (sset["acc_y"]-sset["acc_y"].min())/(sset["acc_y"].max()-sset["acc_y"].min())
sset["acc_zs"] = (sset["acc_z"]-sset["acc_z"].min())/(sset["acc_z"].max()-sset["acc_z"].min())
# set datetime to days, seconds, microseconds
sset["d"] = sset["at"].dt.day-sset["at"].dt.day.min()
sset["s"] = sset["at"].dt.hour*3600 \
+ sset["at"].dt.minute*60 \
+ sset["at"].dt.second
# sset["ms"] = sset["at"].dt.microsecond/100
# accumulated activity calculated in window: set window index
sset["win"] = np.floor(sset["s"]/window)
# accumulate activity in set windowsize - operation = std
act = sset[["d","win","acc_xs","acc_ys","acc_zs"]].groupby(by=["d","win"]).std()
act["acc_xs"] = act["acc_xs"].fillna(act["acc_xs"].mean()) # if only 1 measurement std= nan
act["acc_ys"] = act["acc_ys"].fillna(act["acc_ys"].mean()) # replace by mean std
act["acc_zs"] = act["acc_zs"].fillna(act["acc_zs"].mean())
act["activity"] = (act["acc_xs"]+act["acc_ys"]+act["acc_zs"])/3
act = act.reset_index()
sset = pd.merge(sset,act[["d","win","activity"]],on = ["d","win"])
# plot activity, standardized activity and time series of result
# fig,ax = plt.subplots(nrows=1,ncols=3,figsize = (20,10))
# ax[0].hist(sset["acc_x"],density=True,color = "g")
# ax[1].hist(sset["acc_y"],density=True,color = "g")
# ax[2].hist(sset["acc_z"],density=True,color = "g")
# fig,ax = plt.subplots(nrows=1,ncols=3,figsize = (20,10))
# ax[0].hist(sset["acc_xs"],density=True,color = "g")
# ax[1].hist(sset["acc_ys"],density=True,color = "g")
# ax[2].hist(sset["acc_zs"],density=True,color = "g")
fig,ax = plt.subplots(nrows=1,ncols=1,figsize = (20,10))
T = 0
for d in act["d"].drop_duplicates():
ax.plot(act.loc[act["d"]==d,"win"],act.loc[act["d"]==d,"activity"]+T)
T+=0.5
ax.set_title("activity for subject " + str(int(subj)))
ax.set_xlabel("time of the day, step = " + str(window) + "s")
ax.set_ylabel("activity per day")
plt.savefig(svpath + "\\activity_" + str(int(subj))+ ".tif")
# combine data from all cows
output = pd.concat([output,sset],axis = 0)
# change name back to initial name
if namechange == 1:
output = output.rename(columns = {"id":"cowid"})
return output
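# Minimal usage sketch of accel_act on synthetic data (illustrative only, not part
# of the original pipeline); assumes the column layout documented in the docstring.
# rng = np.random.default_rng(1)
# demo = pd.DataFrame({"cowid" : 1,
#                      "at" : pd.date_range("2022-07-01", periods=600, freq="s"),
#                      "acc_x" : rng.normal(size=600),
#                      "acc_y" : rng.normal(size=600),
#                      "acc_z" : rng.normal(size=600)})
# demo_act = accel_act(demo, window=60)   # activity accumulated per 60 s window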
#%% prepare dataset 1: accelerations (x,y,z) DC cows
path = os.path.join("C:","/Users","adria036",
"OneDrive - Wageningen University & Research","iAdriaens_doc",
"Projects","iAdriaens","bait","data","nlas_ines")
svpath = os.path.join("C:","/Users","adria036",
"OneDrive - Wageningen University & Research","iAdriaens_doc",
"Projects","iAdriaens","bait","results")
fn = "\\acceleration_dairy_campus.tif"
# read data
data = pd.DataFrame([])
for f in os.listdir(path) :
if ("bait" in f):
print(f)
new = pd.read_csv(path+"\\"+f,usecols=[0,4,7,8,9,10,11,14,16,18,19,20])
new["at"] = pd.to_datetime(new["at"],format = "%Y-%m-%d %H:%M:%S.%f%z")
# select animals list of cows with single sensor in barn 72 (no replacements)
cows = pd.DataFrame([3152,1681,821,3106,419,3089,7387,1091,605],columns = ["cowid"])
new = pd.merge(new,cows,how="inner")
# add to previous dataset
data = pd.concat([data,new],axis=0)
del new, path, cows, f
# visualisation of data: acceleration in x,y,z
plt.rc('font', size=14)
fig,ax = plt.subplots(nrows=1,ncols=1,figsize = (20,18))
T = 0
yt = [T]
for ID in data["cowid"].drop_duplicates():
ax.plot(data.loc[data["cowid"]==ID,"t"],data.loc[data["cowid"]==ID,"acc_x"]+T,linewidth=0.25,color="steelblue")
ax.plot(data.loc[data["cowid"]==ID,"t"],data.loc[data["cowid"]==ID,"acc_y"]+T,linewidth=0.25,color="teal")
ax.plot(data.loc[data["cowid"]==ID,"t"],data.loc[data["cowid"]==ID,"acc_z"]+T,linewidth=0.25,color="limegreen")
ax.plot(data.loc[data["cowid"]==ID,"t"],T*np.ones(len(data.loc[data["cowid"]==ID,"t"])),linewidth=0.25,color="k")
T+=data[["acc_x","acc_y","acc_z"]].max().max()+1
yt.append(round(T))
ax.set_title("activity [m/s²]")
ax.set_xlabel("t [s]")
ax.set_ylabel("acceleration per cowid")
ax.legend(["acc_x","acc_y","acc_z"])
IDS = [str(ID) for ID in data["cowid"].drop_duplicates().astype(int).to_list()]
ax.set_yticks(yt[:-1],labels=IDS)
plt.savefig(svpath + fn)
del svpath, fn, fig, ax
# calculate general activity level
df = data[["cowid","acc_x","acc_y","acc_z","at"]]
window = 60 # in seconds
df2 = accel_act(df,window)
# add activity to data
DCdata = pd.merge(data,df2[["cowid","at","win","activity"]],on = ["cowid","at"],how="outer")
del window, df, df2, data
# save to csv
path = os.path.join("C:","/Users","adria036",
"OneDrive - Wageningen University & Research","iAdriaens_doc",
"Projects","iAdriaens","bait","data","preprocessed")
DCdata.to_csv(path+"\\DCdata.txt")
del path
# reformat as preferred for segmentation module (strip)
#%% prepare dataset 2: accelerations (x,y,z)
path = os.path.join("C:","/Users","adria036",
"OneDrive - Wageningen University & Research","iAdriaens_doc",
"Projects","iAdriaens","bait","data","nlas_harmen")
svpath = os.path.join("C:","/Users","adria036",
"OneDrive - Wageningen University & Research","iAdriaens_doc",
"Projects","iAdriaens","bait","results")
fn = "\\acceleration_harmen.tif"
refdate = pd.to_datetime("today")
# read data
data = pd.DataFrame([])
ID = 1
for f in os.listdir(path):
print(f)
# read data and reformat
new = pd.read_csv(path+"\\"+f,header = None)
new.columns = ["acc_x","acc_y","acc_z"]
new["id"] = ID
ID += 1
new["t"] = range(0,len(new))
ts = np.arange(0,len(new))* pd.Timedelta(40,"milli")
new["at"] = refdate + ts
new = new[["id","at","t","acc_x","acc_y","acc_z"]]
# add to previous dataset
data = pd.concat([data,new],axis=0)
del f, new, ID, ts, refdate, path
# visualise data - acceleration in x,y,z
fig,ax = plt.subplots(nrows=1,ncols=1,figsize = (20,10))
T = 0
for ID in data["id"].drop_duplicates():
ax.plot(data.loc[data["id"]==ID,"t"],data.loc[data["id"]==ID,"acc_x"]+T,linewidth=0.25,color="steelblue")
ax.plot(data.loc[data["id"]==ID,"t"],data.loc[data["id"]==ID,"acc_y"]+T,linewidth=0.25,color="teal")
ax.plot(data.loc[data["id"]==ID,"t"],data.loc[data["id"]==ID,"acc_z"]+T,linewidth=0.25,color="limegreen")
T+=data[["acc_x","acc_y","acc_z"]].max().max()+1
ax.set_title("activity")
ax.set_xlabel("t")
ax.set_ylabel("acceleration")
ax.legend(["acc_x","acc_y","acc_z"])
plt.savefig(svpath + fn)
del T, fig,ax
# calculate general activity level
df = data[["id","acc_x","acc_y","acc_z","at"]]
window = 60 # in seconds
df2 = accel_act(df,window)
# add activity to data
HDdata = pd.merge(data,df2[["id","at","win","activity"]],on = ["id","at"],how="outer")
# clear workspace
del window, df, df2, data
del svpath, fn
# save to csv
path = os.path.join("C:","/Users","adria036",
"OneDrive - Wageningen University & Research","iAdriaens_doc",
"Projects","iAdriaens","bait","data","preprocessed")
HDdata.to_csv(path+"\\HDdata.txt")
del path
#%% reformat as preferred for segmentation module (strip)
\ No newline at end of file
......@@ -446,8 +446,9 @@ def read_ids(path, fn):
# fn = "\Sewio_tags.xlsx"
# read tagIDs and tags2cow from excel file
tag2cow = pd.read_excel(path+fn, sheet_name = "Tag2Cow")
tagIDs = pd.read_excel(path+fn, sheet_name = "Tag_IDs")
tag2cow = pd.read_excel(path+fn, sheet_name = "Tag2Cow", skiprows = [0,1])
tagIDs = pd.read_excel(path+fn, sheet_name = "Tag_IDs", skiprows = [0,1,2,3])
tagIDs = tagIDs.rename(columns = {'Sewio_alias' : 'Sewio_tagnr'})
# combine tag with tag id and inspect
tags = tag2cow.merge(tagIDs)
tags.head(20)
......
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 15 09:06:43 2022
@author: adria036
"""
import os
os.chdir(r"C:\Users\adria036\OneDrive - Wageningen University & Research\iAdriaens_doc\Projects\iAdriaens\bait\scripts\bait\datapreparation\ines")
#%% import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import date,timedelta
# define functions
def accel_act(df,window):
"""
combine the accelerations in the x, y and z dimensions into a general activity
parameter.
Step 1 : rescale values in each direction
Step 2 : set level of activity based on amplitude in a preset window
Step 3 : combine the activity levels of the 3 directions
df: pandas DataFrame with columns 'acc_x', 'acc_y', 'acc_z', 'id', 'at'
window: window size in seconds over which activity is accumulated
"""
if "cowid" in df.columns:
df = df.rename(columns = {"cowid":"id"})
namechange = 1
else:
namechange = 0
# calculate activity
ids = df["id"].drop_duplicates().reset_index(drop=1)
output = pd.DataFrame([])
for subj in ids:
print("Calculating activity for subj = " + str(int(subj)))
sset = df.loc[(df["id"]==subj) & \
(~df["acc_x"].isna()) & \
(~df["acc_y"].isna()) & \
(~df["acc_z"].isna()),:].copy()
# standardise and scale measurements
sset["acc_xs"] = (sset["acc_x"]-sset["acc_x"].min())/(sset["acc_x"].max()-sset["acc_x"].min())
sset["acc_ys"] = (sset["acc_y"]-sset["acc_y"].min())/(sset["acc_y"].max()-sset["acc_y"].min())
sset["acc_zs"] = (sset["acc_z"]-sset["acc_z"].min())/(sset["acc_z"].max()-sset["acc_z"].min())
# set datetime to days, seconds, microseconds
sset["d"] = sset["at"].dt.day-sset["at"].dt.day.min()
sset["s"] = sset["at"].dt.hour*3600 \
+ sset["at"].dt.minute*60 \
+ sset["at"].dt.second
# sset["ms"] = sset["at"].dt.microsecond/100
# accumulated activity calculated in window: set window index
sset["win"] = np.floor(sset["s"]/window)
# accumulate activity in set windowsize - operation = std
act = sset[["d","win","acc_xs","acc_ys","acc_zs"]].groupby(by=["d","win"]).std()
act["acc_xs"] = act["acc_xs"].fillna(act["acc_xs"].mean()) # if only 1 measurement std= nan
act["acc_ys"] = act["acc_ys"].fillna(act["acc_ys"].mean()) # replace by mean std
act["acc_zs"] = act["acc_zs"].fillna(act["acc_zs"].mean())
act["activity"] = (act["acc_xs"]+act["acc_ys"]+act["acc_zs"])/3
act = act.reset_index()
sset = pd.merge(sset,act[["d","win","activity"]],on = ["d","win"])
# plot activity, standardized activity and time series of result
fig,ax = plt.subplots(nrows=1,ncols=3,figsize = (20,10))
ax[0].hist(sset["acc_x"],density=True,color = "g")
ax[1].hist(sset["acc_y"],density=True,color = "g")
ax[2].hist(sset["acc_z"],density=True,color = "g")
fig,ax = plt.subplots(nrows=1,ncols=3,figsize = (20,10))
ax[0].hist(sset["acc_xs"],density=True,color = "g")
ax[1].hist(sset["acc_ys"],density=True,color = "g")
ax[2].hist(sset["acc_zs"],density=True,color = "g")
fig,ax = plt.subplots(nrows=1,ncols=1,figsize = (20,10))
T = 0
for d in act["d"].drop_duplicates():
ax.plot(act.loc[act["d"]==d,"win"],act.loc[act["d"]==d,"activity"]+T)
T+=0.5
ax.set_title("activity for subject " + str(subj))
ax.set_xlabel("time of the day, window = " + str(window) + "s")
ax.set_ylabel("activity")
# combine data from all cows
output = pd.concat([output,sset],axis = 0)
# change name back to initial name
if namechange == 1:
output = output.rename(columns = {"id":"cowid"})
return output
#%% prepare dataset 1: accelerations (x,y,z) DC cows
path = os.path.join("C:","/Users","adria036",
"OneDrive - Wageningen University & Research","iAdriaens_doc",
"Projects","iAdriaens","bait","data","nlas_ines")
# read data
data = pd.DataFrame([])
for f in os.listdir(path) :
if ("bait" in f):
print(f)
new = pd.read_csv(path+"\\"+f,usecols=[0,4,7,8,9,10,11,14,16,18,19,20])
new["at"] = pd.to_datetime(new["at"],format = "%Y-%m-%d %H:%M:%S.%f%z")
# select animals list of cows with single sensor in barn 72 (no replacements)
cows = pd.DataFrame([3152,1681,821,3106,419,3089,7387,1091,605],columns = ["cowid"])
new = pd.merge(new,cows,how="inner")
# add to previous dataset
data = pd.concat([data,new],axis=0)
del new, path, cows, f
# visualisation of data: acceleration in x,y,z
fig,ax = plt.subplots(nrows=1,ncols=1,figsize = (20,18))
T = 0
for ID in data["cowid"].drop_duplicates():
ax.plot(data.loc[data["cowid"]==ID,"t"],data.loc[data["cowid"]==ID,"acc_x"]+T,linewidth=0.25,color="steelblue")
ax.plot(data.loc[data["cowid"]==ID,"t"],data.loc[data["cowid"]==ID,"acc_y"]+T,linewidth=0.25,color="teal")
ax.plot(data.loc[data["cowid"]==ID,"t"],data.loc[data["cowid"]==ID,"acc_z"]+T,linewidth=0.25,color="limegreen")
T+=data[["acc_x","acc_y","acc_z"]].max().max()+1
ax.set_title("activity")
ax.set_xlabel("t")
ax.set_ylabel("acceleration")
ax.legend(["acc_x","acc_y","acc_z"])
# calculate general activity level
df = data[["cowid","acc_x","acc_y","acc_z","at"]]
window = 60 # in seconds
df2 = accel_act(df,window)
# add activity to data
data = pd.merge(data,df2[["cowid","at","activity"]],on = ["cowid","at"],how="outer")
del window, df, df2
# reformat as preferred for segmentation module (strip)
#%% prepare dataset 2: accelerations (x,y,z)
path = os.path.join("C:","/Users","adria036",
"OneDrive - Wageningen University & Research","iAdriaens_doc",
"Projects","iAdriaens","bait","data","nlas_harmen")
refdate = pd.to_datetime("today")
# read data
data = pd.DataFrame([])
ID = 1
for f in os.listdir(path):
print(f)
# read data and reformat
new = pd.read_csv(path+"\\"+f,header = None)
new.columns = ["acc_x","acc_y","acc_z"]
new["id"] = ID
ID += 1
new["t"] = range(0,len(new))
ts = np.arange(0,len(new))* pd.Timedelta(40,"milli")
new["at"] = refdate + ts
new = new[["id","at","t","acc_x","acc_y","acc_z"]]
# add to previous dataset
data = pd.concat([data,new],axis=0)
del f, new, path, ID, ts, refdate
# visualise data - acceleration in x,y,z
fig,ax = plt.subplots(nrows=1,ncols=1,figsize = (20,10))
T = 0
for ID in data["id"].drop_duplicates():
ax.plot(data.loc[data["id"]==ID,"t"],data.loc[data["id"]==ID,"acc_x"]+T,linewidth=0.25,color="steelblue")
ax.plot(data.loc[data["id"]==ID,"t"],data.loc[data["id"]==ID,"acc_y"]+T,linewidth=0.25,color="teal")
ax.plot(data.loc[data["id"]==ID,"t"],data.loc[data["id"]==ID,"acc_z"]+T,linewidth=0.25,color="limegreen")
T+=data[["acc_x","acc_y","acc_z"]].max().max()+1
ax.set_title("activity")
ax.set_xlabel("t")
ax.set_ylabel("acceleration")
ax.legend(["acc_x","acc_y","acc_z"])
del T, fig,ax
# calculate general activity level
df = data[["id","acc_x","acc_y","acc_z","at"]]
window = 60 # in seconds
df2 = accel_act(df,window)
# add activity to data
data = pd.merge(data,df2[["id","at","activity"]],on = ["id","at"],how="outer")
# clear workspace
del window, df, df2
#%% reformat as preferred for segmentation module (strip)
\ No newline at end of file
# packages to be added
ruptures=1.1.7
pandas=1.4.3
matplotlib=3.5.2
numpy=1.22.3
openpyxl=3.0.10
python-snappy=0.6.0
spyder=5.1.5
\ No newline at end of file
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 22 13:56:40 2022
@author: adria036
-------------------------------------------------------------------------------
Segmentation module of BAIT
- uses the ruptures package;
- aims at segmenting one or multiple time series of (sensor) data into
parts with different statistical characteristics. The features of each
segment can then be used to classify a period of time into different
behaviours;
- depending on the behaviour of interest, a cost function for the segmentation
is chosen; based on this choice, segmentation of the TS is done based on, for
example, the mean, standard deviation, linear trend, etc. of each segment;
- the input can be a single TS of measurements, or the cost can be computed
over several TS together (e.g. x, y, z), potentially with different
weights;
- we recommend PELT (Pruned Exact Linear Time) as the search algorithm
for the changepoints. PELT has an expected linear computational cost, i.e.
linear in the number of measurements in the time series;
- because of the cost of the search algorithm, longer series take more
time to segment;
- typically the exact number of changes remains unknown, but it can be
estimated roughly from knowledge of the specific target behaviours. It is
recommended to estimate the number of changes rather generously, as this
allows the classifier to decide for each segment whether or not the behaviour
has changed;
- alternatively, a penalty for adding a changepoint can be applied to
determine the number of changes in the TS. However, when the variance differs
across behaviours, it is typically hard to rely on such a penalty, as
minimising the cost function will favour changepoints in the region with
the largest variability;
- applying a minimal distance threshold between two changepoints can also
be useful;
- standardisation and smoothing are always worth considering to condition
the search and avoid bias / local convergence caused by heteroscedasticity
in the data
-------------------------------------------------------------------------------
General (mathematical) remarks on changepoint analysis
- a common approach is maximum likelihood estimation for change detection, in
which the signal is modelled by variables with a piecewise constant distribution.
In this setting, changepoint detection amounts to maximum likelihood estimation,
with the sum of the segment costs equal to the negative log-likelihood; for a
Gaussian signal with fixed variance this reduces (up to constants) to the sum of
squared deviations from the segment mean, i.e. the L2 cost. The distributions
need to be known (prior knowledge) and the model is either a change in mean or
a change in mean and scale (not suitable for truly heteroscedastic data, where
mean and std would have to be multiplied by the same factor?)
-------------------------------------------------------------------------------
RUPTURES - COST FUNCTIONS
RUPTURES - BASE FUNCTIONS
* error(start,end) -- returns the cost on the segment (start,end)
* fit(*args,**kwargs) -- sets the parameters of the cost function (e.g. the signal to segment)
"""
import os
os.chdir(r"C:\Users\adria036\OneDrive - Wageningen University & Research\iAdriaens_doc\Projects\iAdriaens\bait\scripts\bait\datapreparation\ines")
#%% filepaths, constants and load data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ruptures as rpt
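# Hedged illustration of the base cost API mentioned in the docstring (fit() and
# error(start, end)) on a synthetic step signal; values are illustrative only.
# _sig = np.r_[np.zeros(50), np.ones(50)]
# _cost = rpt.costs.CostL2().fit(_sig)   # fit() stores the signal
# _cost.error(0, 50)    # ~0: constant segment
# _cost.error(0, 100)   # 25: sum of squared deviations from the segment mean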
# path
path = os.path.join("C:","/Users","adria036",
"OneDrive - Wageningen University & Research","iAdriaens_doc",
"Projects","iAdriaens","bait","data","preprocessed")
# read dataset
data = pd.read_csv(path+"\\DCdata.txt", index_col=0)
data["at"] = pd.to_datetime(data["at"],format = "%Y-%m-%d %H:%M:%S.%f%z")
del path
#%% segmentation tests
# define parameters for segmentation
model = "l1"
min_size = 5 # minimum distance between changepoints
# data =
#
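# --- hedged sketch of a possible continuation (illustrative only; assumes an
# "activity" column in the preprocessed data, penalty value is illustrative) ---
# signal = data["activity"].dropna().to_numpy()
# algo = rpt.Pelt(model=model, min_size=min_size).fit(signal)
# bkps = algo.predict(pen=5)        # penalty per added changepoint
# rpt.display(signal, bkps)
# plt.show()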