Skip to content
Snippets Groups Projects
Commit a31c87e9 authored by Dainius's avatar Dainius
Browse files

Add monte carlo example

parent d520dd7e
Branches
No related tags found
No related merge requests found
%% Cell type:markdown id: tags:
# Example of a Monte Carlo analysis
%% Cell type:code id: tags:
``` R
data = read.csv("data/testdata.csv")
```
%% Cell type:code id: tags:
``` R
names(data)
```
%% Output
1. 'Yield'
2. 'GDP_agriculture'
3. 'Farmer_income'
4. 'Inorganic_F_output'
5. 'Large_medium_tractors'
6. 'Small_tractors'
7. 'Towing_Farm_Machinery_LM'
8. 'Towing_Farm_Machinery_S'
9. 'Electronic_engines'
10. 'Diesel_engines'
11. 'Combine_harvesters'
12. 'Motorized_threshing_machines'
13. 'Cereal'
14. 'Beans'
15. 'Tubers'
16. 'Oil'
17. 'Sugar'
18. 'Fiber'
19. 'Tobacco'
\begin{enumerate*}
\item 'Yield'
\item 'GDP\_agriculture'
\item 'Farmer\_income'
\item 'Inorganic\_F\_output'
\item 'Large\_medium\_tractors'
\item 'Small\_tractors'
\item 'Towing\_Farm\_Machinery\_LM'
\item 'Towing\_Farm\_Machinery\_S'
\item 'Electronic\_engines'
\item 'Diesel\_engines'
\item 'Combine\_harvesters'
\item 'Motorized\_threshing\_machines'
\item 'Cereal'
\item 'Beans'
\item 'Tubers'
\item 'Oil'
\item 'Sugar'
\item 'Fiber'
\item 'Tobacco'
\end{enumerate*}
%% Cell type:code id: tags:
``` R
apply(data, 2, sd)
```
%% Output
Yield
: 3342.59590438839GDP_agriculture
: 10750.5475775756Farmer_income
: 3536.75302766377Inorganic_F_output
: 627.538129040905Large_medium_tractors
: 22.8069653965801Small_tractors
: 77.9505302810966Towing_Farm_Machinery_LM
: 25.5965771755867Towing_Farm_Machinery_S
: 100.851908513051Electronic_engines
: 48.2722684768912Diesel_engines
: 34.349969616342Combine_harvesters
: 5.0824301297472Motorized_threshing_machines
: 63.7267475543626Cereal
: 12.7364968021479Beans
: 5.45958396167376Tubers
: 4.53543835755834Oil
: 4.81013315897188Sugar
: 2.27670221829347Fiber
: 4.73323276548032Tobacco
: 1.3736370450376
\begin{description*}
\item[Yield] 3342.59590438839
\item[GDP\textbackslash{}\_agriculture] 10750.5475775756
\item[Farmer\textbackslash{}\_income] 3536.75302766377
\item[Inorganic\textbackslash{}\_F\textbackslash{}\_output] 627.538129040905
\item[Large\textbackslash{}\_medium\textbackslash{}\_tractors] 22.8069653965801
\item[Small\textbackslash{}\_tractors] 77.9505302810966
\item[Towing\textbackslash{}\_Farm\textbackslash{}\_Machinery\textbackslash{}\_LM] 25.5965771755867
\item[Towing\textbackslash{}\_Farm\textbackslash{}\_Machinery\textbackslash{}\_S] 100.851908513051
\item[Electronic\textbackslash{}\_engines] 48.2722684768912
\item[Diesel\textbackslash{}\_engines] 34.349969616342
\item[Combine\textbackslash{}\_harvesters] 5.0824301297472
\item[Motorized\textbackslash{}\_threshing\textbackslash{}\_machines] 63.7267475543626
\item[Cereal] 12.7364968021479
\item[Beans] 5.45958396167376
\item[Tubers] 4.53543835755834
\item[Oil] 4.81013315897188
\item[Sugar] 2.27670221829347
\item[Fiber] 4.73323276548032
\item[Tobacco] 1.3736370450376
\end{description*}
%% Cell type:code id: tags:
``` R
apply(data, 2, mean)
```
%% Output
Yield
: 6593.46772287834GDP_agriculture
: 10201.4513738877Farmer_income
: 2949.04055867127Inorganic_F_output
: 254.1846131035Large_medium_tractors
: 11.6924743524072Small_tractors
: 63.6072795039629Towing_Farm_Machinery_LM
: 16.3953335844288Towing_Farm_Machinery_S
: 72.9273553777213Electronic_engines
: 41.6846566751277Diesel_engines
: 27.3843644679249Combine_harvesters
: 2.27217004751769Motorized_threshing_machines
: 43.9412289986471Cereal
: 56.4691126819931Beans
: 5.37709423312942Tubers
: 5.47573675962985Oil
: 7.43522536621398Sugar
: 1.09479285604573Fiber
: 2.68644616015272Tobacco
: 0.716539257595341
\begin{description*}
\item[Yield] 6593.46772287834
\item[GDP\textbackslash{}\_agriculture] 10201.4513738877
\item[Farmer\textbackslash{}\_income] 2949.04055867127
\item[Inorganic\textbackslash{}\_F\textbackslash{}\_output] 254.1846131035
\item[Large\textbackslash{}\_medium\textbackslash{}\_tractors] 11.6924743524072
\item[Small\textbackslash{}\_tractors] 63.6072795039629
\item[Towing\textbackslash{}\_Farm\textbackslash{}\_Machinery\textbackslash{}\_LM] 16.3953335844288
\item[Towing\textbackslash{}\_Farm\textbackslash{}\_Machinery\textbackslash{}\_S] 72.9273553777213
\item[Electronic\textbackslash{}\_engines] 41.6846566751277
\item[Diesel\textbackslash{}\_engines] 27.3843644679249
\item[Combine\textbackslash{}\_harvesters] 2.27217004751769
\item[Motorized\textbackslash{}\_threshing\textbackslash{}\_machines] 43.9412289986471
\item[Cereal] 56.4691126819931
\item[Beans] 5.37709423312942
\item[Tubers] 5.47573675962985
\item[Oil] 7.43522536621398
\item[Sugar] 1.09479285604573
\item[Fiber] 2.68644616015272
\item[Tobacco] 0.716539257595341
\end{description*}
%% Cell type:code id: tags:
``` R
apply(data, 2, mean) > apply(data, 2, sd)*2
```
%% Output
Yield
: FALSEGDP_agriculture
: FALSEFarmer_income
: FALSEInorganic_F_output
: FALSELarge_medium_tractors
: FALSESmall_tractors
: FALSETowing_Farm_Machinery_LM
: FALSETowing_Farm_Machinery_S
: FALSEElectronic_engines
: FALSEDiesel_engines
: FALSECombine_harvesters
: FALSEMotorized_threshing_machines
: FALSECereal
: TRUEBeans
: FALSETubers
: FALSEOil
: FALSESugar
: FALSEFiber
: FALSETobacco
: FALSE
\begin{description*}
\item[Yield] FALSE
\item[GDP\textbackslash{}\_agriculture] FALSE
\item[Farmer\textbackslash{}\_income] FALSE
\item[Inorganic\textbackslash{}\_F\textbackslash{}\_output] FALSE
\item[Large\textbackslash{}\_medium\textbackslash{}\_tractors] FALSE
\item[Small\textbackslash{}\_tractors] FALSE
\item[Towing\textbackslash{}\_Farm\textbackslash{}\_Machinery\textbackslash{}\_LM] FALSE
\item[Towing\textbackslash{}\_Farm\textbackslash{}\_Machinery\textbackslash{}\_S] FALSE
\item[Electronic\textbackslash{}\_engines] FALSE
\item[Diesel\textbackslash{}\_engines] FALSE
\item[Combine\textbackslash{}\_harvesters] FALSE
\item[Motorized\textbackslash{}\_threshing\textbackslash{}\_machines] FALSE
\item[Cereal] TRUE
\item[Beans] FALSE
\item[Tubers] FALSE
\item[Oil] FALSE
\item[Sugar] FALSE
\item[Fiber] FALSE
\item[Tobacco] FALSE
\end{description*}
%% Cell type:code id: tags:
``` R
covars = c("Farmer_income", "Sugar", "Cereal", "Inorganic_F_output")
```
%% Cell type:code id: tags:
``` R
summary(data[,c("Yield", covars)])
```
%% Output
%% Cell type:code id: tags:
``` R
MyModel = lm(as.formula(paste("Yield~", paste(covars, collapse="+"))), data=data)
```
%% Cell type:code id: tags:
``` R
summary(MyModel)
```
%% Output
%% Cell type:code id: tags:
``` R
# Make a prediction somewhere close to mean values to get close to mean yield
predict(MyModel, data.frame(Farmer_income=3000, Sugar=1, Cereal=57, Inorganic_F_output=255), interval="confidence")
```
%% Output
A matrix: 1 × 3 of type dbl
| fit | lwr | upr |
|---|---|---|
| 6572.062 | 6449.604 | 6694.52 |
A matrix: 1 × 3 of type dbl
\begin{tabular}{r|lll}
fit & lwr & upr\\
\hline
6572.062 & 6449.604 & 6694.52\\
\end{tabular}
%% Cell type:code id: tags:
``` R
# But how does changing Inorganic F output change the yield?
IFO_range_2sd = c(mean(data$Inorganic_F_output)-2*sd(data$Inorganic_F_output),
mean(data$Inorganic_F_output)+2*sd(data$Inorganic_F_output))
IFO_range_2sd # Seems too large because we get negative outputs which is not realistic
```
%% Output
1. -1000.89164497831
2. 1509.26087118531
\begin{enumerate*}
\item -1000.89164497831
\item 1509.26087118531
\end{enumerate*}
%% Cell type:code id: tags:
``` R
IFO_range_quant = quantile(data$Inorganic_F_output, c(0.25, 0.75))
IFO_range_quant # Reasonable, although does not cover all the range in reality (0-1 would, but we may get outliers)
```
%% Output
25%
: 72.0360546275%
: 235.28989945
\begin{description*}
\item[25\textbackslash{}\%] 72.03605462
\item[75\textbackslash{}\%] 235.28989945
\end{description*}
%% Cell type:code id: tags:
``` R
# Run MC
# Sample randomly from the range, run 10000 simulations
IFO_tests = runif(10000, IFO_range_quant[1], IFO_range_quant[2])
head(IFO_tests)
```
%% Output
1. 192.25014293842
2. 189.772745382712
3. 138.904337703773
4. 137.110042957965
5. 115.753210960425
6. 87.798504824545
\begin{enumerate*}
\item 192.25014293842
\item 189.772745382712
\item 138.904337703773
\item 137.110042957965
\item 115.753210960425
\item 87.798504824545
\end{enumerate*}
%% Cell type:code id: tags:
``` R
IFO_MC = predict(MyModel, data.frame(Farmer_income=3000, Sugar=1, Cereal=57, Inorganic_F_output=IFO_tests),
interval="confidence")
```
%% Cell type:code id: tags:
``` R
hist(IFO_MC[,"fit"]) # Yield difference due to change in inorganic F output
```
%% Output
%% Cell type:code id: tags:
``` R
summary(IFO_MC)
```
%% Output
%% Cell type:markdown id: tags:
Not much change: no matter what inorganic F output we have, the yields will be 6580-6640 for the case when farmer has 3000 income and 1 sugar and 57 cereal
## What if we vary cereal?
%% Cell type:code id: tags:
``` R
cereal_range_2sd = c(mean(data$Cereal)-2*sd(data$Cereal),
mean(data$Cereal)+2*sd(data$Cereal))
cereal_range_2sd # No overdispersion this time
```
%% Output
1. 30.9961190776974
2. 81.9421062862888
\begin{enumerate*}
\item 30.9961190776974
\item 81.9421062862888
\end{enumerate*}
%% Cell type:code id: tags:
``` R
cereal_tests = runif(10000, cereal_range_2sd[1], cereal_range_2sd[2])
head(cereal_tests)
```
%% Output
1. 32.4490915480555
2. 57.3854599489991
3. 80.8591409574191
4. 66.2691985280606
5. 71.8470227559234
6. 55.1266650571241
\begin{enumerate*}
\item 32.4490915480555
\item 57.3854599489991
\item 80.8591409574191
\item 66.2691985280606
\item 71.8470227559234
\item 55.1266650571241
\end{enumerate*}
%% Cell type:code id: tags:
``` R
cereal_MC = predict(MyModel, data.frame(Farmer_income=3000, Sugar=1, Cereal=cereal_tests, Inorganic_F_output=254),
interval="confidence")
hist(cereal_MC[,"fit"])
```
%% Output
%% Cell type:code id: tags:
``` R
summary(cereal_MC)
```
%% Output
%% Cell type:markdown id: tags:
We get much bigger variation! Not surprising as cereal has much more impact, looking at the model coefficients.
## What if we have interactions?
If we make the model nonlinear, we might get more interesting results.
%% Cell type:code id: tags:
``` R
NonlinearModel = lm(as.formula(paste("Yield~", paste(covars, collapse="*"))), data=data)
summary(NonlinearModel)
```
%% Output
%% Cell type:code id: tags:
``` R
cereal_MC_nl = predict(NonlinearModel,
data.frame(Farmer_income=3000, Sugar=1, Cereal=cereal_tests, Inorganic_F_output=254),
interval="confidence")
hist(cereal_MC_nl[,"fit"])
```
%% Output
%% Cell type:code id: tags:
``` R
# How about both IFO and cereal at the same time?
cereal_IFO_MC_nl = predict(NonlinearModel,
data.frame(Farmer_income=3000, Sugar=1, Cereal=cereal_tests, Inorganic_F_output=IFO_tests),
interval="confidence")
hist(cereal_IFO_MC_nl[,"fit"])
```
%% Output
%% Cell type:code id: tags:
``` R
# Add also farmer income
FI_range = quantile(data$Farmer_income, c(0.25, 0.75))
FI_tests = runif(10000, FI_range[1], FI_range[2])
cereal_IFO_FI_MC_nl = predict(NonlinearModel,
data.frame(Farmer_income=FI_tests, Sugar=1, Cereal=cereal_tests, Inorganic_F_output=IFO_tests),
interval="confidence")
hist(cereal_IFO_FI_MC_nl[,"fit"])
```
%% Output
%% Cell type:markdown id: tags:
Aha, now our distribution is different: if we vary both farmer income and cereal, we're most likely to have a yield of ~5500, but less likely more and less, and it's completely impossible to have yields above 8000 and below 4000 if our sugar is 1.
%% Cell type:code id: tags:
``` R
```
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment