#Exercise 26, Consider the AR(1) model
library(nlme);

#a) notes on the black board.
#write the correlation as a exponential correlation function in terms of d (the range parameter).

#The advantage of this formulation is that we are able to generalise the model to situations where the time between time points may differ.


#######################################################################
#b Load the dataset:
Hawaii = read.table("http://www.uio.no/studier/emner/matnat/math/STK3100/data/Hawaii.txt", header=T)
Hawaii$Birds=sqrt(Hawaii$Moorhen.Kauai)
#Birds (the response Y) are measured almost each year, some NA.
#Year  (the time variable)
#Covariates: Rainfall, Stilt.Oahu, Stilt.Maui, Coot.Oahu, Coot.Maui

#Generalized least squares fit by REML: (independent error terms)
M0  =   gls(Birds~Rainfall+Year,na.action=na.omit,data=Hawaii)

# Model1(M1) and Model2(M2) use the exponential correlation function with different parameterisations:
M1  =   gls(Birds~Rainfall+Year,na.action=na.omit,correlation=corAR1(form=~Year),data=Hawaii)
summary(M1)
#phi_hat for model 1 is 0.7734303

M2  =   gls(Birds~Rainfall+Year,na.action=na.omit,correlation=corExp(form=~Year),data=Hawaii)
summary(M2)
#estimated d=range for model 2 is 3.892266
-1/log(0.7734303)     #=3.892266


#######################################################################
#######################################################################
#######################################################################
#We will now look at the dataset spruce, which shows growth of different trees.
#Response (Y) = logSize
#Covariates: days(numerical), plot(factor - const within trees)
spruce = read.table("http://www.uio.no/studier/emner/matnat/math/STK3100/data/Spruce.txt", header=T)

#c  #select a specific tree in the spruce data
Spruce1=Spruce[Spruce$Tree=="O1T18",]

# plot "logSize" against "days":
plot(Spruce1$days,Spruce1$logSize)

#######################################################################
#d: Try out a model with days as a linear fixed effect

#Model with independent errorterm
M0.1=gls(logSize~days,data=Spruce1)
summary(M0.1)
#(Intercept) 3.0034390 0.18888528 15.900864       0
#days        0.0029654 0.00040416  7.337056       0

# Model with exponential correlation function:
M1.1=gls(logSize~days,data=Spruce1,correlation=corExp(form=~days))
summary(M1.1)
#(Intercept) 2.3456068  70.30834 0.0333617  0.9740
#days        0.0040421   0.00184 2.2002955  0.0501

#  d=range=5611265 , rho=exp(-1/d)= 0.9999998

### we can not trust the p-values from M0.1, because the independent assumption is violated.
### Positive dependencies give too little uncertainties and too small p values

#Which model seems to be best suited in this case?
AIC(M0.1,M1.1)
#residuals are much smaller in M1.1


#######################################################################
#e  #Select all trees in the spruce data
    #It is then possible to include plot as a covariate.

#Model with indep error term
M0=gls(logSize~days+factor(plot),data=Spruce)
summary(M0)
#(Intercept)   4.023500 0.06015360 66.88710  0.0000
#days          0.003322 0.00011248 29.53293  0.0000
#factor(plot)2 0.016011 0.05097519  0.31410  0.7535
#factor(plot)3 0.231617 0.06498087  3.56438  0.0004
#factor(plot)4 0.376849 0.06322694  5.96025  0.0000

# Model with exponential correlationfunction:
cexp=corExp(form=~days|Tree,fixed=FALSE)   #exponential correlation function will be used on every single tree
M1=gls(logSize~days+factor(plot),Spruce,correlation=cexp)
summary(M1)
#(Intercept)    3.534956 0.13444253 26.293437  0.0000
#days           0.004066 0.00016719 24.319393  0.0000
#factor(plot)2 -0.001693 0.16313073 -0.010376  0.9917
#factor(plot)3  0.181998 0.20795170  0.875194  0.3817
#factor(plot)4  0.323940 0.20233877  1.600977  0.1097

#  d=range=596.0755 , rho=exp(-1/d)= 0.9983238


#Compare models
AIC(M0,M1)

#####################################################################
#Find the optimal model by selecting the significant fixed effects (specify to use ML):

#indep error term
M0.0 =gls(logSize~1                ,data=Spruce, method="ML")
M0.d =gls(logSize~days             ,data=Spruce, method="ML")
M0.dp=gls(logSize~days+factor(plot),data=Spruce, method="ML")
anova(M0.0,M0.d,M0.dp)
#low p-values, sign different, include all fixed effects.
### BUT: we can not trust these p-values

#exp cor-function
M1.0 =gls(logSize~1                ,data=Spruce, correlation=cexp, method="ML")
M1.d =gls(logSize~days             ,data=Spruce, correlation=cexp, method="ML")
M1.dp=gls(logSize~days+factor(plot),data=Spruce, correlation=cexp, method="ML")
anova(M1.0,M1.d,M1.dp)
#"factor(plot)" is not significant when we take the correlation into account.


# The two models for correlation structure have a big influence on the model selection for the fixed effects!