# # ---- Load Packages ----
# #library(lattice)
# library(MASS)
# library(pscl)
# library(tidyverse)
# library(broom)
# library(jtools)
# library(ggstance)
# library(ggeffects)
# library(modelr)
# source("ModelFunctions.txt")
#
# # Read in Data ------------------------------------------------------------
#
# Fish<-read.delim(file="Baileyetal2008.txt")
#
# # Inspect Data
# str(Fish)
# names(Fish)
# summary(Fish)
#
# # Subset Data to Omit NAs
# Fish2<-na.exclude(Fish)
#
# # Remove the Spatial Outlier
# Fish3<-Fish2[c(-135), ]
# Fish<-Fish3
#
# # Express Depth in km
# Fish$MeanDepth<-Fish$MeanDepth/1000
#
# # Look at some data
# table(Fish$TotAbund,Fish$Period,useNA = "always")
#
#
# # ---- Underlying Question ----
# # Has the Abundance - Depth relationship changed over time?
#
# # ---- Figure 1.1 ----
# ggplot(Fish)+
# geom_point(aes(x=MeanDepth, y=TotAbund))+
# theme_bw(22)+
# ylab("Total Abundance")+
# xlab("Mean Depth")+
# facet_grid(.~Period)
#
# ggplot(Fish)+
# geom_point(aes(x=MeanDepth, y=TotAbund))+
# theme_bw(22)+
# ylab("Total Abundance")+
# xlab("Mean Depth")
#
#
# # ---- MODEL 0: Linear Regression ----
# #Express a model for this data.
# #
# M0<-lm(TotAbund~MeanDepth,data=Fish)
# str(M0)
# summary(M0) # Summary
#
# #Add residuals
# Fish<-Fish%>%add_residuals(M0)%>%
# add_predictions(M0)
# table(Fish$resid, useNA = "always")
# str(Fish)
# #Ordinary Residuals
# #Create the ordinary residuals using the raw data and fitted values
# #M0$coefficients
# #M0$residuals
# #E0 <- residuals(M0) # Residuals
# #F0 <- fitted(M0) # Fitted Values
#
# # ---- Histogram of Residuals ----
# #ggplot()+
# # geom_histogram(aes(x=E0), fill='grey50', col='black')+
# # theme_bw(22)+ylab("Frequency")+xlab("Residuals")
#
# ggplot(Fish)+
# geom_histogram(aes(x=resid), fill='grey50', col='black')+
# theme_bw(22)+ylab("Frequency")+xlab("Residuals")
#
# #From this plot, are the residuals normally distributed?
# #What is the mean and standard deviation of the residuals?
# mean(Fish$resid) #Essentially zero!
# median(Fish$resid)
# sd(Fish$resid) #Quite large (but the data have some large values, so don't be fooled!)
#
# # ---- Plot Residuals vs Fitted Values ----
# #ggplot()+
# # geom_point(aes(x=F0,y=E0))+
# # geom_hline(yintercept=0, linetype='dashed', col='blue')+
# # theme_bw(22)+ylab("Residuals")+xlab("Fitted values")
#
# ggplot(Fish)+
# geom_point(aes(x=pred,y=resid))+
# geom_hline(yintercept=0, linetype='dashed', col='blue')+
# theme_bw(22)+ylab("Residuals")+xlab("Fitted values")
#
# #What pattern is present?
#
# #What does this pattern mean?
#
# #What is the underlying assumption of
# #error terms in a model?
#
#
# # ---- Plot Data and Model Line ----
# ggplot(Fish)+
# geom_point(aes(x=MeanDepth, y=TotAbund))+
# stat_smooth(aes(x=MeanDepth, y=TotAbund), method='lm', se=FALSE, col='red')+
# theme_bw(22)+ylab("Total Abundance")+xlab("Mean Depth")
#
# # ---- Plot Residuals Against Variables ----
# #We use the function pr.fac for this.
# pr.fac(M0,as.factor(Fish$Period))
#
# # ---- Biologically Meaningful ----
# #Plot the model again, but include a line at zero.
# bio<-ggplot(Fish)+
# geom_point(aes(x=MeanDepth, y=TotAbund), size=3)+
# stat_smooth(aes(x=MeanDepth, y=TotAbund), size=1, method='lm', se=FALSE,col='red')+
# geom_hline(yintercept=0, linetype='dashed', col='blue', size=1)+
# theme_bw(22)+xlab("Mean Depth")+ylab("Total Abundance")
# bio
# #Are values being predicted that are impossible given the data?
#
# # What if we look at the predicted standard error around the model line? #
# summary(M0)
#
# #Observations do not all fall on the regression line.
# #Residuals represent the actual differences of points to the regression line.
# #Errors are the modeled distribution about the regression line.
# # Errors have the form:
# #e~N(0,sigma)
# #which means normally distributed (N) with a mean of zero (0) and
# #a standard deviation of sigma.
#
# # Set Up #
# a<-range(Fish$MeanDepth) #Save the range values from MeanDepth
# md<-seq(a[1],a[2],length=10) #Create a series of 10 values spanning the range of MeanDepth
# beta<-coef(M0) #Save the coefficients from M0
#
#
# MeanDepth<-c()
# Estimates<-c()
# for (i in 1:10)
# {
# MeanDepth.in<-rep(md[i],100)
# MeanDepth<-c(MeanDepth, MeanDepth.in)
# yhat<-beta[1]+beta[2]*md[i]
# Estimates.in<-rnorm(100,mean=yhat,sd=summary(M0)$sigma)
# Estimates<-c(Estimates, Estimates.in)
# }
# bio.check<-data.frame(MeanDepth,Estimates)
# # Plot Results #
# bio+geom_point(aes(x=MeanDepth, y=Estimates),col='grey50',pch=5,data=bio.check)
#
# # ---- The Poisson Distribution ----
# #Poisson distributions have the same mean and variance
# #denoted by 'lambda' or λ in most texts.
#
# # The Poisson distribution has density
# # p(x) = λ^x exp(-λ)/x!
# # for x = 0, 1, 2, … The mean and variance are E(X) = Var(X) = λ.
#
# #Therefore, given that λ in a population is 3, what are the
# #probabilities that λ is actually 0,1,2,3,4, 5 or 15?
# dpois(0,lambda=3)
# dpois(1,lambda=3)
# dpois(2,lambda=3)
# dpois(3,lambda=3)
# dpois(4,lambda=3)
# dpois(5,lambda=3)
# dpois(15,lambda=3)
#
# #Plot a series of values from 0 to 10 (0:10 creates a series)
# #using the same value for λ i.e. 3
# ggplot(transform(data.frame(x=c(0:10)), y=dpois(x, 3)), aes(x, y)) +
# geom_bar(stat="identity")
#
# #How does the shape of the Poisson distribution change if
# # λ changes?
# ggplot(transform(data.frame(x=c(0:10)), y=dpois(x, 1)), aes(x, y)) +
# geom_bar(stat="identity")
# ggplot(transform(data.frame(x=c(0:10)), y=dpois(x, 2)), aes(x, y)) +
# geom_bar(stat="identity")
# ggplot(transform(data.frame(x=c(0:15)), y=dpois(x, 5)), aes(x, y)) +
# geom_bar(stat="identity")
# ggplot(transform(data.frame(x=c(0:25)), y=dpois(x, 10)), aes(x, y)) +
# geom_bar(stat="identity")
# ggplot(transform(data.frame(x=c(0:200)), y=dpois(x, 100)), aes(x, y)) +
# geom_bar(stat="identity")
# #Which of these plots looks to have normally distributed data?
#
# # ---- MODEL 1: GLM ----
# #Using the Poisson distribution
# #By default the Poisson distribution uses the log link function
# M1<-glm(TotAbund~MeanDepth,data=Fish,
# family=poisson(link="log"))
# #M1b<-lm(log(TotAbund)~MeanDepth,data=Fish)
# summary(M1)
# #summary(M1b)
# #summary(M0)
#
# #Residuals and fitted values
# Fish<-Fish%>%add_residuals(M1)%>%
# add_predictions(M1)
# str(Fish)
#
# E1<-resid(M1)
# #E1<-resid(M1, type="deviance") #Same! As deviance residuals are the default for glm
# F1<-fitted(M1)
#
# #The function predict() is used to get fitted values of y
# eta<-predict(M1,type="link")
# #estimates on the same scale as the linear predictor
#
# #The scale of the fitted values depends on the 'type' argument
# eta1<-predict(M1,type="response")
# #estimates on the same scale as the response
#
# #Taking the log of eta1 (response scale) results in the linear predictor scale
# log(eta1)
#
# #Coefficients from the model are indexed
# beta1<-coef(M1)[1]
# beta2<-coef(M1)[2]
# summary(M1)
# #Here we use the model formula to predict values rather than predict()
# #The mean depths are taken from the fish dataframe directly.
# eta<-beta1+beta2*Fish$MeanDepth
#
# #mu is the fitted values on the response scale
# #the 'anti-log' on the loge scale is exponentiating values using exp
# mu<-exp(eta)
# #y is the responses (original data)
#
# y<-Fish$TotAbund
#
# #LogL is the Poisson log-likelihood
# LogL<-sum(Fish$TotAbund*eta-mu-lgamma(Fish$TotAbund+1))
# LogL
# 2*sum(y*log(y/mu)-(y-mu)) #used in model selection!
#
# # ---- Plot Residuals vs Fitted Values ----
# ggplot()+
# geom_point(aes(x=F1,y=E1))+
# geom_hline(yintercept=0, linetype='dashed', col='blue')+
# theme_bw(22)+ylab("Residuals")+xlab("Fitted values")
#
# ggplot(Fish)+
# geom_point(aes(x=pred,y=resid))+
# geom_hline(yintercept=0, linetype='dashed', col='blue')+
# theme_bw(22)+ylab("Residuals")+xlab("Fitted values")
#
# ggplot(Fish)+
# geom_point(aes(x=MeanDepth,y=resid))+
# geom_hline(yintercept=0, linetype='dashed', col='blue')+
# theme_bw(22)+ylab("Residuals")+xlab("Fitted values")
#
# #Plot estimates vs. fitted values
# ggplot(Fish)+
# geom_point(aes(x=eta,y=resid))+
# geom_hline(yintercept=0, linetype='dashed', col='blue')+
# theme_bw(22)+ylab("ETA")+xlab("Fitted values")
#
# # ---- Predict Values Using the Model ----
# # You need to set up a dataframe that contains
# # combinations of all variables included as explanatory
# # variables in your model!
# M1<-glm(TotAbund~MeanDepth,data=Fish,family=poisson(link="log"))
#
# predict_response(M1)%>%plot(show_data=TRUE)
#
# # In this case, its simple, we only need MeanDepth
# # First, let's make it easy. Use depths of 1,2,3,4,5 to predict abundance
# newdata<-data.frame(MeanDepth=c(1,2,3,4,5))
# newdata
# predict(M1,newdata,type='response') # We end up with 5 predicted values
# # Save these predictions as a new column in newdata
# newdata$TotAbund<-predict(M1,newdata,type='response')
# newdata
#
# # Plot #
# ggplot()+
# geom_point(aes(x=MeanDepth, y=TotAbund), data=Fish, size=3)+
# geom_path(aes(x=MeanDepth, y=TotAbund), data=newdata, col="red", size=1)+
# theme_bw(22)+ylab("Total Abundance")+xlab("Mean Depth")
#
# # More complicated: Let's use the range of MeanDepth to generate
# # a sequence of depth values that will be used to predict abundance
# newdata<-data.frame(MeanDepth=seq(from=range(Fish$MeanDepth)[1],
# to=range(Fish$MeanDepth)[2],
# length=25))
# newdata
# predict(M1,newdata,type='response') # Now we get 25 predictions
# # Save these predictions as a new column in newdata
# newdata$TotAbund<-predict(M1,newdata,type='response')
# newdata
#
# # Plot #
# predictions<-ggplot()+
# geom_point(aes(x=MeanDepth, y=TotAbund), data=Fish, size=3)+
# geom_path(aes(x=MeanDepth, y=TotAbund), data=newdata, col="red", size=1)+
# theme_bw(22)+ylab("Total Abundance")+xlab("Mean Depth")
# predictions
#
# # Simulate Dispersion Around the Predicted Values #
# md<-seq(from=range(Fish$MeanDepth)[1],
# to=range(Fish$MeanDepth)[2],
# length=25)
# beta<-coef(M1)
# MeanDepth<-c()
# Estimates<-c()
# for(i in 1:25)
# {
# MeanDepth.in<-rep(md[i],100)
# MeanDepth<-c(MeanDepth, MeanDepth.in)
# mu<-exp(beta[1]+beta[2]*md[i])
# Estimates.in<-rpois(100,lambda=mu)
# Estimates<-c(Estimates, Estimates.in)
# }
# bio.check<-data.frame(MeanDepth,Estimates)
#
# # Plot #
# predictions+
# geom_point(aes(x=MeanDepth, y=Estimates),col='grey50',pch=5,data=bio.check)
#
# # ---- Histogram of Residuals ----
# ggplot()+
# geom_histogram(aes(x=E1), fill='grey50', col='black')+
# theme_bw(22)+ylab("Frequency")+xlab("Residuals")
#
# # ---- Plot Residuals Against Variables ----
# pr.fac(M1,Fish$MeanDepth)
# pr.fac(M1,as.factor(Fish$Period))
#
# #Can we use R2 like in a regression model?
# #Yes, but it is not calculated the same way because we use deviance residuals.
# #Therefore, it is a different statistic with similar meaning
#
# #Explained Deviance
# summary(M1)
# str(M1) #look for values needed for calculation
# #Calculate ratio of difference of null deviance to model deviance as a percentage
# ((M1$null.deviance-M1$deviance)/M1$null.deviance)*100
#
# summary(M1)
# exp(-0.628)
# exp(6.643)
# summary(M0)
#
# # ---- Checking for Overdispersion ----
# # FUNCTION: dispersion()
# # Calculates dispersion
# # Ability to specify model type
# # Input: model, modeltype
# dispersion<-function(model,modeltype='gaussian')
# {
# A<-sum(resid(model,type="pearson")^2)
# if(modeltype %in% c("g","p","qp","gaussian","poisson","quasipoisson"))
# {
# B<-length(resid(model))-length(coef(model))
# }
# if(modeltype %in% c("nb","negativebinomial"))
# {
# B<-length(resid(model))-(length(coef(model))+1)
# }
# DISP<<-A/B
# return(DISP)
# }
#
# dispersion(M1, modeltype = "poisson")
#
# # ---- Plot Cook's Distance ----
# CD<-data.frame(x=1:length(cooks.distance(M1)),
#                cooks=cooks.distance(M1))
# ggplot(CD)+
#   geom_bar(aes(x=x, y=cooks), stat='identity', fill='grey50', col='black')+
#   theme_bw(22)+ylab("Cook's Distance")+xlab("Index")
#
# # Which are greater than 1? #
# CD$x[CD$cooks>1]
#
# # Can we remove that rather large value?
# I<-1:nrow(Fish)
# I[cooks.distance(M1)>10]
#
# #Run model with removing observation 23
# M1a<-glm(TotAbund~MeanDepth,data=Fish,family=poisson(link="log"), subset = -23)
# dispersion(M1a) #reduction in dispersion...
#
# # ---- Spatial Plot ----
# names(Fish)
# ggplot(Fish)+
# geom_point(aes(x=Xkm, y=Ykm, col=as.factor(Period)), size=3)+
# theme_bw(22)+ylab("Y Coordinate")+xlab("X Coordinate")+
# scale_colour_manual(values=c("red","blue"),name="Period")
#
# # ---- MODEL 2: Adding a Factor ----
# str(Fish)
# Fish$fPeriod<-as.factor(Fish$Period)
# str(Fish)
# summary(M1)
# M2<-glm(TotAbund~MeanDepth*fPeriod, data=Fish, family='poisson')
# summary(M2)
# E2<-resid(M2, type="pearson")
# dispersion(M2)
#
# #What are the coefficients? Calculate them from the output.
#
# tidy(M2)
# coef(M2)
# plot_summs(M2)
# plot_summs(M2, plot.distributions = TRUE, inner_ci_level = .9)
#
# # ---- MODEL 3: Adding an Offset ----
# Fish$logSweptArea<-log(Fish$SweptArea)
# M3<-glm(TotAbund~MeanDepth*fPeriod+offset(logSweptArea),
# data=Fish, family='poisson')
# summary(M3)
# E3<-resid(M3,type='pearson')
# dispersion(M3)
#
#
# # ---- MODEL 4: Negative Binomial ----
# library(MASS) # glm.nb() comes from MASS (loaded above), not pscl
# M4<-glm.nb(TotAbund~MeanDepth*fPeriod,data=Fish)
# #library(msme)
# #M4b<-nbinomial(TotAbund~MeanDepth*fPeriod+offset(logSweptArea),data=Fish)
# #coefficients(M4b)
# summary(M4)
# E4<-resid(M4,type='pearson')
# F4<-fitted(M4, type="response")
# dispersion(M4,modeltype='nb')
# ggplot()+
# geom_point(aes(x=F4, y=E4))+theme_bw(22)+
# geom_hline(yintercept=0, linetype='dashed', col='blue')+
# ylab("Residuals")+xlab("Fitted Values")
#
# # Check Levels #
# drop1(M4,test="Chi")
#
# # ---- MODEL 5: Drop Levels ----
# M5<-glm.nb(TotAbund~MeanDepth+fPeriod,data=Fish)
# summary(M5)
# E5<-resid(M5,type='pearson')
# F5<-predict(M5, type="response")
# dispersion(M5,modeltype="nb")
#
# # Check Levels #
# drop1(M5,test="Chi")
#
# # ---- Plotted Residuals vs Fitted Values ----
# ggplot()+
# geom_point(aes(x=F5,y=E5))+
# geom_hline(yintercept=0, linetype='dashed', col='blue')+
# theme_bw(22)+ylab("Residuals")+xlab("Fitted values")
#
# # ---- Plot Cook's Distance ----
# CD<-data.frame(x=1:length(cooks.distance(M5)),
# cooks=cooks.distance(M5))
# ggplot(CD)+
# geom_bar(aes(x=x, y=cooks), stat='identity', fill='grey50', col='black')+
# theme_bw(22)+ylab("Cook's Distance")+xlab("Index")
#
# # Which are greater than 1? #
# CD$x[CD$cooks>1]
#
# # ---- Plot Residuals Against Variables ----
# pr.fac(M5,Fish$MeanDepth,"Mean Depth")
#
# # Check Against Additional Factor Level #
# # (Not part of pr.fac functionality... yet!)
# xyplot(E5 ~ MeanDepth | factor(Period), # xyplot() needs library(lattice)
# data = Fish, # use Fish (depth in km); Fish3 still has MeanDepth in metres
# xlab = list(label = "Mean depth (km)", cex = 1.5),
# ylab = list(label = "Pearson residuals", cex = 1.5),
# panel = function(x,y)
# {
# panel.points(x,y, col = 1, pch = 16, cex = 0.7)
# panel.loess(x,y, col = 1, lwd = 2)
# panel.abline(h=0)
# }
# )
#
# # ---- Predict Values Using the Model ----
# # As before, Mean Depth is a sequence of values across the range of Depth
# # This time we want to look at 2 Periods, so we need all combinations of both
# newdata<-data.frame(MeanDepth=rep(seq(from=min(Fish$MeanDepth, na.rm=TRUE),
# to=max(Fish$MeanDepth, na.rm=TRUE),
# length=25),2),
# fPeriod=c(rep("1",25),rep("2",25)),
# logSweptArea=mean(log(Fish$SweptArea)))
#
# newdata$TotAbund<-predict(M5,newdata,type="response")
#
# plotM5<-ggplot()+
# geom_point(aes(x=MeanDepth, y=TotAbund, col=fPeriod), data=Fish, size=3)+
# geom_path(aes(x=MeanDepth, y=TotAbund, col=fPeriod), data=newdata, size=1)+
# theme_bw(22)+ylab("Total Abundance")+xlab("Mean Depth")+
# scale_colour_manual(values=c("red","blue"),name="Period")
# plotM5
#
# # Add CI #
# newdata$fit<-predict(M5,newdata,type="link",se=TRUE)$fit
# newdata$se<-predict(M5,newdata,type="link",se=TRUE)$se
#
# plotM5+
# geom_path(aes(x=MeanDepth, y=exp(fit-1.96*se), col=fPeriod),
# alpha=0.5, data=newdata)+
# geom_path(aes(x=MeanDepth, y=exp(fit+1.96*se), col=fPeriod),
# alpha=0.5, data=newdata)
#
# # ---- Just to Demonstrate Removing fPeriod ----
# M6<-glm.nb(TotAbund~MeanDepth,data=Fish)
# summary(M6)
# E6<-resid(M6,type='pearson')
# F6<-predict(M6,type="link")
#
# # Create newdata for predict()
# newdata<-data.frame(MeanDepth=seq(from=min(Fish$MeanDepth, na.rm=TRUE),
# to=max(Fish$MeanDepth, na.rm=TRUE),
# length=25),
# logSweptArea=mean(log(Fish$SweptArea)))
# newdata$TotAbund<-predict(M6,newdata,type="response")
#
# plotM6<-ggplot()+
# geom_point(aes(x=MeanDepth, y=TotAbund), data=Fish, size=3)+
# geom_path(aes(x=MeanDepth, y=TotAbund), data=newdata, size=1, col='red')+
# theme_bw(22)+ylab("Total Abundance")+xlab("Mean Depth")
# plotM6
#
# # Add CI #
# # Add CI #
# newdata$fit<-predict(M6,newdata,type="link",se=TRUE)$fit
# newdata$se<-predict(M6,newdata,type="link",se=TRUE)$se
#
# plotM6+
# geom_path(aes(x=MeanDepth, y=exp(fit-1.96*se)),
# alpha=0.5, data=newdata, col='red')+
# geom_path(aes(x=MeanDepth, y=exp(fit+1.96*se)),
# alpha=0.5, data=newdata, col='red')
#
# # Simulate SE Around Mean #
# MeanDepth<-c()
# Estimate<-c()
# for(i in 1:nrow(newdata))
# {
# MeanDepth.in<-rep(newdata$MeanDepth[i],100)
# MeanDepth<-c(MeanDepth,MeanDepth.in)
# Estimate.in<-rnbinom(n=100,size=M6$theta, mu=newdata$TotAbund[i])
# Estimate<-c(Estimate, Estimate.in)
# }
# tryme<-data.frame(MeanDepth,Estimate)
#
# plotM6+
# geom_point(aes(x=MeanDepth, y=Estimate), col="grey50",data=tryme)
# # ---- Read in Data ----
# cod<-read.delim("ParasiteCod.txt")
# str(cod)
# # ---- Set Factors ----
# cod$fArea<-as.factor(cod$Area)
# cod$fYear<-as.factor(cod$Year)
# str(cod)
#
# # ---- Remove NAs ----
# I1<-is.na(cod$Intensity)|is.na(cod$fYear)|is.na(cod$fArea)|is.na(cod$Length)
# cod2<-cod[!I1,]
# cod<-cod2
#
# # ---- Histogram of Intensity ----
# plot(table(cod$Intensity), ylab="Frequency", xlab="Observed Intensity")
#
#
# # NULL model --------------------------------------------------------------
# #A NULL model is a model that just has an intercept (no slope!)
# #What does a NULL model represent?
# glmNULL<-glm(Intensity~1,data=cod)
# summary(glmNULL)
#
# #Compare with the mean of the response
# mean(cod$Intensity)
#
# #A more complex model
# glm0<-glm(Intensity~fArea,data=cod)
# summary(glm0)
#
# plot(Intensity~fArea, data=cod)
# plot(cod$fArea,cod$Intensity)
#
# # Comparing Models --------------------------------------------------------
# #If we want to compare two models to determine, statistically, if one
# #fits better, we have several choices, but these rely on what kind of comparisons
# #are being made i.e. the model types.
#
# #Nested model
# anova(glmNULL, glm0, test="F")
# anova(glm0, glmNULL, test="F")
# anova(glmNULL, glm0, test="Chisq")
# lrtest(glmNULL, glm0) # NOTE: lrtest() comes from the lmtest package - load it first
#
# #Nested and non-nested models
# #But does not work with models not fit using Maximum Likelihood!
# vuong(glmNULL, glm0)
#
# # AIC/BIC -----------------------------------------------------------------
# #AIC or Akaike's Information Criterion is another way to assess models
# #BIC is a variant
# #The model with the lowest AIC is the best!
# #Models within about 2 AIC points are not different (but this is not
# #always appropriate and often abused)
#
# AIC(glmNULL, glm0)
#
# # GLM Model ---------------------------------------------------------------
# glm1<-glm(Intensity~fArea, family = "poisson",data=cod)
# summary(glm1)
#
# glm2<-glm(Intensity~fArea*fYear+Length, family = "poisson",data=cod)
# summary(glm2)
# levels(cod$fArea)
# levels(cod$fYear)
#
# AIC(glm1, glm2)
#
#
# # ---- Set up zero inflated model with Poisson distribution ----
# zip1<-zeroinfl(Intensity~fArea*fYear+Length|fArea*fYear+Length,
# dist="poisson", link="logit",data=cod)
# #zip1<-zeroinfl(Intensity~fArea*fYear+Length|fArea,
# # dist="poisson", link="logit",data=cod)
# summary(zip1)
# dispersion(zip1, modeltype='zp')
#
# # ---- Set up zero inflated model with NB distribution ----
# zinb1<-zeroinfl(Intensity~fArea*fYear+Length|fArea*fYear+Length,
# dist="negbin", link="logit",data=cod, na.action = na.exclude)
# summary(zinb1)
# dispersion(zinb1, modeltype='znb')
#
# # ---- Likelihood Ratio Test ----
# lrtest(zip1,zinb1) # We see that the negative binomial model (zinb1) is better
# vuong(zip1,zinb1)
# vuong(glm1,zinb1)
#
# # ---- AIC ----
# AIC(zip1,zinb1)
# BIC(zip1,zinb1)
#
# # ---- More Models ----
# # Drop Interaction from Count Model #
# zinb2<-zeroinfl(Intensity~fArea+fYear+Length|fArea*fYear+Length,
# dist="negbin", link="logit",data=cod)
# summary(zinb2)
# dispersion(zinb2, modeltype='znb')
# #lrtest()
#
# # Drop Interaction from Binomial Part of the Model #
# zinb3<-zeroinfl(Intensity~fArea*fYear+Length|fArea+fYear+Length,
# dist="negbin", link="logit",data=cod)
# summary(zinb3)
# dispersion(zinb3, modeltype='znb')
#
# # Compare #
# lrtest(zinb1, zinb2)
# lrtest(zinb1, zinb3)
# vuong(zinb1, zinb3)
# lrtest(zinb2, zinb3)
# vuong(zinb2, zinb3)
# AIC(zinb1, zinb2, zinb3)
#
# #Example: zinb2 has a lower log likelihood and hence a poorer fit than zinb1.
# #The LRT is telling us that the degree to which we made zinb2 a poorer model than zinb1
# #is unexpectedly large if the terms that are different between the models were useful
# #(explained the response).
#
# lrtest(zinb1)
# #lrtest(zinb1) is compared with a NULL model that uses the mean as the predictor
# # Intensity ~ 1 is the null model,
# #It says that the best predictor of Intensity is not the sample mean of Intensity
# #(the intercept/constant term).
#
# # The optimal model is zinb1 #
# # Intensity ~ fArea * fYear + Length | fArea * fYear + Length #
# summary(zinb1)
#
# # ---- Model Validation ----
# # Plot Fitted vs Residuals #
# predict_response(zinb1)%>%plot(show_data=TRUE)
# predict_response(zinb1)%>%plot(ci_style = "errorbar", dot_size = 1.5)
#
# #Add residuals
# #This method works on simple models and appends residuals to mydata
# cod<-cod%>%
# add_residuals(zinb1)%>%
# add_predictions(zinb1)
#
# table(cod$resid, useNA = "always")
#
# #Plot residuals
# #There is a lot more to do here...this is an example
# ggplot(cod)+
# geom_histogram(aes(x=resid), fill='grey50', col='black')+
# theme_bw(22)+ylab("Frequency")+xlab("Residuals")
#
# ggplot(cod)+
# geom_point(aes(x=Length,y=resid))+
# theme_bw(22)+ylab("Residuals")+xlab("Length")
#
# ggplot(cod)+
# geom_point(aes(x=pred,y=resid), fill='grey50', col='black')+
# theme_bw(22)+ylab("Residuals")+xlab("Predicted")
#
# # Find Dispersion #
# dispersion(zinb1, modeltype='znb')
# # Set up newdata for use in predict function #
#
# # --- Plot models ----
# a<-predict_response(zinb1, terms = c("Length", "fArea", "fYear"))%>%
# plot(show_data=TRUE, facets=TRUE)
#
# b<-predict_response(glm2, terms = c("Length", "fArea", "fYear"))%>%plot(show_data=TRUE, facets=TRUE)
#
# c<-predict_response(zip1, terms = c("Length", "fArea", "fYear"))%>%plot(show_data=TRUE, facets=TRUE)
#
# a+b+c