Skip to content
Snippets Groups Projects
Commit cfef8d3d authored by Adriaens, Ines's avatar Adriaens, Ines :v_tone2:
Browse files

preprocessing and selection data all farms

parent 9356afa8
Branches
No related tags found
No related merge requests found
......@@ -16,7 +16,7 @@ import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib qt
#%matplotlib qt
#%% set filepaths
......@@ -39,7 +39,7 @@ for f in farms.farm:
# Load the input tables for farm `f`; the first CSV column is used as the index.
# NOTE(review): ani_/lac_/scc_ presumably hold animal, lactation and somatic
# cell count records -- inferred from the file names only, confirm with the export.
dani = pd.read_csv(path+"//ani_"+str(f)+".txt", index_col = 0)
dlac = pd.read_csv(path+"//lac_"+str(f)+".txt", index_col = 0)
dscc = pd.read_csv(path+"//scc_"+str(f)+".txt", index_col = 0)
# The weather file name does not depend on `f`, i.e. one file serves all farms.
dweather = pd.read_csv(path+"//weather_information.txt", index_col = 0)
# Parse the timestamp columns from "YYYY-mm-dd HH:MM:SS" text into datetimes.
dact["measured_on"] = pd.to_datetime(dact["measured_on"],format = "%Y-%m-%d %H:%M:%S")
......@@ -47,7 +47,6 @@ for f in farms.farm:
# Parse the remaining timestamp columns. With an explicit `format`, pandas
# raises on values that do not match "YYYY-mm-dd HH:MM:SS" instead of guessing.
dmilk["ended_at"] = pd.to_datetime(dmilk["ended_at"],format = "%Y-%m-%d %H:%M:%S")
dani["birth_date"] = pd.to_datetime(dani["birth_date"],format = "%Y-%m-%d %H:%M:%S")
dscc["measured_on"] = pd.to_datetime(dscc["measured_on"],format = "%Y-%m-%d %H:%M:%S")
dweather["datetime"] = pd.to_datetime(dweather["datetime"], format = "%Y-%m-%d %H:%M:%S")
dlac["calving"] = pd.to_datetime(dlac["calving"], format = "%Y-%m-%d %H:%M:%S")
# delete if no tmy data available
......@@ -56,11 +55,11 @@ for f in farms.farm:
# sort milk data and calculate gaps
# Order the milkings per animal and in time, so each row directly follows the
# previous milking of the same animal (except on the first row of an animal).
dmilk = dmilk.sort_values(by=["animal_id", "started_at"]).reset_index(drop=True)
# Time since the previous row, in hours. Series.diff() leaves the first row as
# NaN and assigns the whole column at once, which avoids the chained
# assignment (dmilk["gap"][1:] = ...) that triggers SettingWithCopyWarning.
# total_seconds() does not depend on the datetime resolution, unlike dividing
# the raw integer value by 10**9*3600.
dmilk["gap"] = dmilk["started_at"].diff().dt.total_seconds() / 3600
# After the sort a negative difference can only occur where the animal
# changes, so it is not a real gap.
# NOTE(review): a positive difference across an animal boundary is kept,
# exactly as before -- confirm this is intended.
dmilk.loc[dmilk["gap"] < 0, "gap"] = np.nan
#------------------------------------------------------------------------------
#TODO: fix the .loc / copy warnings
# get all moments where a new lactation starts (gap of 10 days)
# `gap` is compared against 24*10, i.e. it is treated as a number of hours;
# rows with a larger gap are kept, ordered per animal and in time, and
# re-indexed from 0.
newlac = dmilk.loc[(dmilk.gap>24*10),["animal_id","lactation_id","parity","started_at","gap"]].sort_values(by = ["animal_id","started_at"]).reset_index(drop=1)
......@@ -87,8 +86,8 @@ for f in farms.farm:
# Start with an empty table for the current cow.
sub = pd.DataFrame([])
# Rows of dlac for this cow. The explicit copy makes sub2 an independent
# frame, so the column assignments below neither raise
# SettingWithCopyWarning nor depend on whether the selection was a view.
sub2 = dlac.loc[dlac["animal_id"] == cow, :].copy()
# Keep only the calendar date of the calving timestamp. This must run exactly
# once: afterwards the column holds date objects and no longer has a .dt accessor.
sub2["calving"] = sub2["calving"].dt.date
# Explicit 64-bit integers, independent of the platform default for `int`.
sub2["parity"] = sub2["parity"].astype("int64")
# add sub2 to sub
sub = pd.concat([sub, sub2])
......@@ -162,7 +161,22 @@ for f in farms.farm:
del sub, sub2, i, dim, new, anew, lacids, act, data, idx, cow
del test, par, new_cows, new_no, newlac
# select lactations for which data from DIM < 5 and > 100 are available
#------------------------------------------------------------------------------
# activity: combine activity per day (sum)
# Day number of each measurement, counted from midnight of the first day with
# activity data. `.dt.days` is the floored number of whole days and does not
# depend on the datetime resolution (the former integer division assumed ns).
diff = dact["measured_on"] - dact["measured_on"].dt.normalize().min()
dact["day"] = diff.dt.days.astype(int)
# Sum the activity / rumination columns per animal and day.
act = (
    dact[["animal_id", "activity_total", "rumination_acc", "rumination_time", "day"]]
    .groupby(by=["animal_id", "day"])
    .sum()
    .reset_index()
)
# First row of every (farm, animal, lactation, day) combination, used to carry
# the identifying columns. `idx` holds index *labels*, so the rows are fetched
# with .loc; the former .iloc call was only correct for a 0..n-1 index.
# NOTE(review): assumes the index labels of dact are unique.
idx = dact[["farm_id", "animal_id", "lactation_id", "day"]].drop_duplicates().index.values
new = dact.loc[idx, ["farm_id", "animal_id", "lactation_id", "parity", "day", "measured_on"]]
# Attach the daily sums to the identifying rows.
new2 = new.merge(act, how="outer", on=["animal_id", "day"])
# remove the first measurement of a new lactation (= duplicated)
# Keep a single row per animal and day: when an animal has two lactation ids
# on the same day, the row that comes first is kept and the later one dropped.
new2 = new2.loc[~new2[["animal_id", "day"]].duplicated(), :].reset_index(drop=True)
#------------------------------------------------------------------------------
# select lactations for which data from DIM <= 5 and DIM > 75 are available
# Per (animal, lactation): smallest `dim` and `started_at` in `subset`,
# largest in `subset2`.
subset = dmilk[["animal_id","lactation_id","dim","started_at"]].groupby(by = ["animal_id","lactation_id"]).min().reset_index()
subset2 = dmilk[["animal_id","lactation_id","dim","started_at"]].groupby(by = ["animal_id","lactation_id"]).max().reset_index()
# Both frames are grouped on the same keys and re-indexed from 0, so their
# rows line up and the maximum DIM can be copied over by index.
subset["enddim"] = subset2["dim"]
......@@ -170,23 +184,47 @@ for f in farms.farm:
# "dim" holds the per-lactation minimum here, hence the new name "startdim".
# NOTE(review): the "startdate" -> "started_at" entry only has an effect if a
# "startdate" column exists at this point (rename ignores missing keys by
# default) -- confirm against the preceding lines.
subset = subset.rename(columns = {"dim" : "startdim","startdate":"started_at"})
subset = subset.sort_values(by = "startdim")
# Keep lactations whose first record is at DIM <= 5 and whose last is beyond DIM 75.
subset = subset.loc[(subset["startdim"]<=5) & (subset["enddim"]>75),:].reset_index(drop=1)
# earliest activity timestamp recorded for this farm
dfarm = dict(startdate=dact["measured_on"].min())
# restrict the milk, activity and SCC tables to the selected
# (animal, lactation) combinations via an inner join on both keys
lac_keys = ["animal_id", "lactation_id"]
selected = subset[lac_keys]
milk = dmilk.merge(selected, how="inner", on=lac_keys)
act = new2.merge(selected, how="inner", on=lac_keys)
scc = dscc.merge(selected, how="inner", on=lac_keys)
# weather: load the weather table and the farm table, then keep the rows of
# the station linked to farm `f` that fall strictly between the first and the
# last milking start of the selected lactations
dweather = pd.read_csv(path + "//weather_information.txt", index_col=0)
dweather["datetime"] = pd.to_datetime(
    dweather["datetime"], format="%Y-%m-%d %H:%M:%S"
)
dfarms = pd.read_csv(path + "//farm_information.txt", index_col=0)
startdate = milk["started_at"].min()
enddate = milk["started_at"].max()
# station id(s) registered for this farm; the first one is used below
aws = dfarms.loc[dfarms["farm_id"] == f, "aws_id"].values
same_station = dweather["aws_id"] == aws[0]
after_start = dweather["datetime"] > pd.to_datetime(startdate)
before_end = dweather["datetime"] < pd.to_datetime(enddate)
wea = dweather.loc[same_station & after_start & before_end, :]
#------------------------------------------------------------------------------
# write to csv
# One file per table, named farm_<f>_<table>.txt, written in the order
# milk, act, wea, scc.
for tag, table in (("_milk", milk), ("_act", act), ("_wea", wea), ("_scc", scc)):
    table.to_csv(path + "//farm_" + str(f) + tag + ".txt")
#---------------------------------- visualisations-----------------------------
fig, ax = plt.subplots(nrows=1,ncols=1, figsize= (15,8))
cow = 290 #200, 179, etc
dset = dmilk.loc[dmilk.animal_id == cow,["animal_id","lactation_id","started_at","dim","tmy","mi","parity","gap"]]
dset["relmy"] = dset["tmy"]/dset["mi"]*3600
sns.relplot(data = dset, x="dim",y="relmy", hue = "parity", palette = sns.color_palette("tab10"))
sns.relplot(data = dset, x="started_at",y="relmy", hue = "parity", palette = sns.color_palette("tab10"))
dmilk.loc[dmilk.animal_id == cow,"mi"]*3600,"o")
ax.set_ylim([0,4])
# fig, ax = plt.subplots(nrows=1,ncols=1, figsize= (15,8))
# cow = 290 #200, 179, etc
# dset = dmilk.loc[dmilk.animal_id == cow,["animal_id","lactation_id","started_at","dim","tmy","mi","parity","gap"]]
# dset["relmy"] = dset["tmy"]/dset["mi"]*3600
# sns.relplot(data = dset, x="dim",y="relmy", hue = "parity", palette = sns.color_palette("tab10"))
# sns.relplot(data = dset, x="started_at",y="relmy", hue = "parity", palette = sns.color_palette("tab10"))
# ax.set_ylim([0,4])
test2 = new.loc[(new["animal_id"]==19)&(new["parity"]==0) & (~new["tmy"].isna()) ,:]
cow == 19
fig, ax = plt.subplots(nrows=1,ncols=1, figsize= (15,8))
ax.plot(test.loc[test.animal_id == cow,"dim"],test.loc[test.animal_id == cow,"tmy"] / \
test.loc[test.animal_id == cow,"mi"]*3600,"o")
ax.set_ylim([0,4])
\ No newline at end of file
# test2 = new.loc[(new["animal_id"]==19)&(new["parity"]==0) & (~new["tmy"].isna()) ,:]
# cow == 19
# fig, ax = plt.subplots(nrows=1,ncols=1, figsize= (15,8))
# ax.plot(test.loc[test.animal_id == cow,"dim"],test.loc[test.animal_id == cow,"tmy"] / \
# test.loc[test.animal_id == cow,"mi"]*3600,"o")
# ax.set_ylim([0,4])
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment