Preprocessing and data selection for all farms

cfef8d3d · Adriaens, Ines · 9356afa8 · cfef8d3d
Commit cfef8d3d authored 2 years ago by Adriaens, Ines
--- a/preprocessing.py
+++ b/preprocessing.py
@@ -16,7 +16,7 @@ import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns

-%matplotlib qt
+#%matplotlib qt

 #%% set filepaths

@@ -39,7 +39,7 @@ for f in farms.farm:
    dani = pd.read_csv(path+"//ani_"+str(f)+".txt", index_col = 0)
    dlac = pd.read_csv(path+"//lac_"+str(f)+".txt", index_col = 0)
    dscc = pd.read_csv(path+"//scc_"+str(f)+".txt", index_col = 0)
-    dweather = pd.read_csv(path+"//weather_information.txt", index_col = 0)
+
    
    # parse the timestamp columns into pandas datetimes
    dact["measured_on"] = pd.to_datetime(dact["measured_on"],format = "%Y-%m-%d %H:%M:%S")
@@ -47,7 +47,6 @@ for f in farms.farm:
    dmilk["ended_at"] = pd.to_datetime(dmilk["ended_at"],format = "%Y-%m-%d %H:%M:%S")
    dani["birth_date"] = pd.to_datetime(dani["birth_date"],format = "%Y-%m-%d %H:%M:%S")
    dscc["measured_on"] = pd.to_datetime(dscc["measured_on"],format = "%Y-%m-%d %H:%M:%S")
-    dweather["datetime"] = pd.to_datetime(dweather["datetime"], format = "%Y-%m-%d %H:%M:%S")
    dlac["calving"] = pd.to_datetime(dlac["calving"], format = "%Y-%m-%d %H:%M:%S")
    
    # delete if no tmy data available
@@ -56,11 +55,11 @@ for f in farms.farm:
    # sort milk data and calculate gaps
    dmilk = dmilk.sort_values(by = ["animal_id","started_at"]).reset_index(drop=1)
    dmilk["gap"] = np.nan
-    dmilk["gap"][1:] = dmilk["started_at"][1:].values-dmilk["started_at"][:-1].values
+    dmilk["gap"].iloc[1:] = dmilk["started_at"][1:].values-dmilk["started_at"][:-1].values
    dmilk["gap"] = dmilk["gap"].astype(float)/(10**9*3600)
    dmilk.loc[dmilk["gap"]<0,"gap"] = np.nan
    
-    
+#------------------------------------------------------------------------------    
    #TODO: fix the .loc / copy warnings
    # get all moments where a new lactation starts (gap of more than 10 days, i.e. > 240 h, since the previous milking)
    newlac = dmilk.loc[(dmilk.gap>24*10),["animal_id","lactation_id","parity","started_at","gap"]].sort_values(by = ["animal_id","started_at"]).reset_index(drop=1)
@@ -87,8 +86,8 @@ for f in farms.farm:
            sub = pd.DataFrame([])
        
        sub2 = dlac.loc[dlac["animal_id"]==cow,:]
-        sub2["calving"] = sub2["calving"].dt.date
-        sub2["parity"] = sub2["parity"].astype(int)
+        sub2["calving"] = sub2.loc[:,"calving"].dt.date
+        sub2["parity"] = sub2["parity"].astype("int64")
        
        # add sub2 to sub
        sub = pd.concat([sub,sub2])
@@ -162,7 +161,22 @@ for f in farms.farm:
    del sub, sub2, i, dim, new, anew, lacids, act, data, idx, cow    
    del test, par, new_cows, new_no, newlac 

-    # select lactations for which data from DIM < 5 and > 100 are available
+#------------------------------------------------------------------------------
+    # activity: combine activity per day (sum)
+    diff = dact["measured_on"]-pd.to_datetime(dact["measured_on"].dt.date.min())
+    diff = np.floor(diff.astype("int64")/(10**9*24*3600))
+    dact["day"] = diff.astype(int)
+    act = dact[["animal_id","activity_total","rumination_acc","rumination_time","day"]].groupby(by=["animal_id","day"]).sum()
+    act = act.reset_index()
+    idx = dact[["farm_id","animal_id","lactation_id","day"]].drop_duplicates().index.values
+    new = dact.iloc[idx,:]
+    new = new[["farm_id","animal_id","lactation_id","parity","day","measured_on"]]
+    new2 = new.merge(act, how = "outer", on = ["animal_id","day"])
+    # remove the first measurement of a new lactation (= duplicated)
+    new2 = new2.loc[new2[["animal_id","day"]].duplicated()==False,:].reset_index(drop=1)
+    
+#------------------------------------------------------------------------------
+    # select lactations whose milk data start at DIM <= 5 and extend beyond DIM 75
    subset = dmilk[["animal_id","lactation_id","dim","started_at"]].groupby(by = ["animal_id","lactation_id"]).min().reset_index()
    subset2 = dmilk[["animal_id","lactation_id","dim","started_at"]].groupby(by = ["animal_id","lactation_id"]).max().reset_index()    
    subset["enddim"] = subset2["dim"]
@@ -170,23 +184,47 @@ for f in farms.farm:
    subset = subset.rename(columns = {"dim" : "startdim","startdate":"started_at"})
    subset = subset.sort_values(by = "startdim")
    subset = subset.loc[(subset["startdim"]<=5) & (subset["enddim"]>75),:].reset_index(drop=1)
-        
-    # end and start date
-    dfarm = {"startdate" : dact["measured_on"].min() }
    
+    # select data from animals in subset
+    milk = dmilk.merge(subset[["animal_id","lactation_id"]],
+                       how = "inner",on = ["animal_id","lactation_id"])
+    act = new2.merge(subset[["animal_id","lactation_id"]],
+                       how = "inner",on = ["animal_id","lactation_id"]) 
+    scc = dscc.merge(subset[["animal_id","lactation_id"]],
+                       how = "inner",on = ["animal_id","lactation_id"]) 
+    
+    # select appropriate weather information
+    dweather = pd.read_csv(path+"//weather_information.txt", index_col = 0)
+    dweather["datetime"] = pd.to_datetime(dweather["datetime"], format = "%Y-%m-%d %H:%M:%S")
+    dfarms = pd.read_csv(path+"//farm_information.txt", index_col = 0)
+    startdate = milk["started_at"].min()
+    enddate = milk["started_at"].max()
+    aws = dfarms.loc[dfarms["farm_id"] == f,"aws_id"].values
+    wea = dweather.loc[(dweather["aws_id"] == aws[0]) & (dweather["datetime"] > pd.to_datetime(startdate)) & (dweather["datetime"] < pd.to_datetime(enddate)),: ]
+
+#------------------------------------------------------------------------------
+    # write to csv
+    milk.to_csv(path+"//farm_" + str(f) + "_milk" + ".txt")
+    act.to_csv(path+"//farm_" + str(f) + "_act" + ".txt")
+    wea.to_csv(path+"//farm_" + str(f) + "_wea" + ".txt")
+    scc.to_csv(path+"//farm_" + str(f) + "_scc" + ".txt")
+    
+    
+
+
+#---------------------------------- visualisations-----------------------------
    
-    fig, ax = plt.subplots(nrows=1,ncols=1, figsize= (15,8))
-    cow = 290  #200, 179, etc
-    dset = dmilk.loc[dmilk.animal_id == cow,["animal_id","lactation_id","started_at","dim","tmy","mi","parity","gap"]]
-    dset["relmy"] = dset["tmy"]/dset["mi"]*3600
-    sns.relplot(data = dset, x="dim",y="relmy", hue = "parity", palette = sns.color_palette("tab10"))
-    sns.relplot(data = dset, x="started_at",y="relmy", hue = "parity", palette = sns.color_palette("tab10"))
-            dmilk.loc[dmilk.animal_id == cow,"mi"]*3600,"o")
-    ax.set_ylim([0,4])
+    # fig, ax = plt.subplots(nrows=1,ncols=1, figsize= (15,8))
+    # cow = 290  #200, 179, etc
+    # dset = dmilk.loc[dmilk.animal_id == cow,["animal_id","lactation_id","started_at","dim","tmy","mi","parity","gap"]]
+    # dset["relmy"] = dset["tmy"]/dset["mi"]*3600
+    # sns.relplot(data = dset, x="dim",y="relmy", hue = "parity", palette = sns.color_palette("tab10"))
+    # sns.relplot(data = dset, x="started_at",y="relmy", hue = "parity", palette = sns.color_palette("tab10"))
+    # ax.set_ylim([0,4])
    
-    test2 = new.loc[(new["animal_id"]==19)&(new["parity"]==0) & (~new["tmy"].isna()) ,:]
-    cow == 19
-    fig, ax = plt.subplots(nrows=1,ncols=1, figsize= (15,8))
-    ax.plot(test.loc[test.animal_id == cow,"dim"],test.loc[test.animal_id == cow,"tmy"] / \
-            test.loc[test.animal_id == cow,"mi"]*3600,"o")
-    ax.set_ylim([0,4])
\ No newline at end of file
+    # test2 = new.loc[(new["animal_id"]==19)&(new["parity"]==0) & (~new["tmy"].isna()) ,:]
+    # cow == 19
+    # fig, ax = plt.subplots(nrows=1,ncols=1, figsize= (15,8))
+    # ax.plot(test.loc[test.animal_id == cow,"dim"],test.loc[test.animal_id == cow,"tmy"] / \
+    #         test.loc[test.animal_id == cow,"mi"]*3600,"o")
+    # ax.set_ylim([0,4])
\ No newline at end of file