# Exercise session 4, exercise 1
source("resplot.R")
# Simulation setup: fixed seed, n design points and four response vectors.
set.seed(1)
n <- 100 # number of observations
p <- 2 # used below as the parameter count in the cutoffs (df = n - p)
xx <- seq_len(n)
# The four rnorm() draws must stay in this order so that the seeded random
# stream produces the same data.
# yy.a: linear mean, constant error variance.
yy.a <- 2 + 1 * xx + rnorm(n)
# yy.b: linear mean, error standard deviation proportional to xx.
yy.b <- 2 + 1 * xx + rnorm(n) * xx
# yy.c: linear mean, error standard deviation 1 + xx / n.
yy.c <- 2 + 1 * xx + rnorm(n) * (1 + xx / n)
# yy.d: cosine mean (one full period over the design), constant error variance.
yy.d <- cos(xx * pi / (n / 2)) + rnorm(n)

# Reference cutoffs for leverage, Cook's distance and studentized residuals;
# they are printed so they can be compared with the diagnostics further down.
leverage.cutoff <- 2 * p / n # Montgomery p. 213
cooks.cutoff <- qf(0.5, df1 = p, df2 = n - p,
                   lower.tail = FALSE) # Montgomery p. 215
studres.cutoff <- qt(0.05 / 2, df = n - p,
                     lower.tail = FALSE) # Montgomery p. 135
leverage.cutoff
cooks.cutoff
studres.cutoff

# a)
# Data set a: fit the simple linear regression and compute its influence
# measures first; the plots and the printed summary follow.
fit.a <- lm(yy.a ~ xx)
infl.a <- influence.measures(fit.a)
# 2 x 3 grid: scatter plot with the fitted line, then lm diagnostic plots 1-5.
par(mfrow = c(2, 3))
plot(yy.a ~ xx, pch = 20)
abline(fit.a, col = "red")
plot(fit.a, which = 1:5, pch = 20)
# The normality assumption looks fine.
# No leverage points.
# A few observations (14, 24, 61) have large residuals in absolute value, but
# the residuals vs. fitted plot looks fine. These values lie beyond the 5%
# quantile of the t(n - p) distribution, but with 100 observations we expect
# about 5 observations to be this "extreme".
summary(infl.a)

# Data set b: scatter plot with the fitted line, then lm diagnostic plots 1-5
# on a 2 x 3 grid.
par(mfrow = c(2, 3))
plot(yy.b ~ xx, pch = 20)
fit.b <- lm(yy.b ~ xx)
abline(fit.b, col = "red")
plot(fit.b, which = 1:5, pch = 20)
# The variance is not homogeneous: the residuals seem to depend on x.
# The normality assumption seems to be violated. (Note: it is not actually
# violated, but the Q-Q plot only works under constant variance.)
# Observations 96, 98, 100 seem to have high leverage and high standardized
# residuals, so they may be influential, but judging from the x-y plot no
# point is influential.
# Influence measures are reported here as for the other data sets, so the
# remarks on leverage and influence above can be checked against numbers.
infl.b <- influence.measures(fit.b)
summary(infl.b)

# Data set c: fit the simple linear regression and compute its influence
# measures first; the plots and the printed summary follow.
fit.c <- lm(yy.c ~ xx)
infl.c <- influence.measures(fit.c)
# 2 x 3 grid: scatter plot with the fitted line, then lm diagnostic plots 1-5.
par(mfrow = c(2, 3))
plot(yy.c ~ xx, pch = 20)
abline(fit.c, col = "red")
plot(fit.c, which = 1:5, pch = 20)
summary(infl.c)

# As in the previous model, the residuals vs. fitted plot suggests that there
# might be a dependence between x and e. This dependence also explains the
# non-normality indicated by the Q-Q plot.

# Data set d: fit the simple linear regression and compute its influence
# measures first; the plots and the printed summary follow.
fit.d <- lm(yy.d ~ xx)
infl.d <- influence.measures(fit.d)
# 2 x 3 grid: scatter plot with the fitted line, then lm diagnostic plots 1-5.
par(mfrow = c(2, 3))
plot(yy.d ~ xx, pch = 20)
abline(fit.d, col = "red")
plot(fit.d, which = 1:5, pch = 20)
# The y vs. x plot shows that the model is wrong: the dependence is not linear.
# From the residuals vs. fitted (Tukey-Anscombe) plot we see that the
# homogeneous variance assumption is violated.
# NOTE(review): yy.d is simulated with constant-variance noise around a cosine
# mean, so the pattern is presumably a violation of the zero-mean (linearity)
# assumption rather than of homogeneous variance - confirm.
# Note that the Q-Q plot seems fine, but the residuals vs. fitted plot shows
# that the distribution of the residuals depends on x.
# Observations 12, 37, 42 (presumably the ones flagged in the plots - confirm).
summary(infl.d)

# b)
# Call resplot() once for each of the four fitted models, in the order a to d.
# resplot() is not defined in this script; presumably it comes from the
# "resplot.R" file sourced at the top - confirm.
resplot(fit.a)
resplot(fit.b)
resplot(fit.c)
resplot(fit.d)

# c)