#1234567890123456789012345678901234567890123456789012345678901234567890123456789 #Ruth Meili # Aufgabe 1 der Serie 3: # Teilaufgabe a) # Daten einlesen: d.air<-read.table("http://stat.ethz.ch/Teaching/Datasets/NDK/ozon.newyork.dat",header=T) d.ozone<-d.air[,1] # Ozonwerte d.logozone<-log(d.ozone) # Logarithmierte Ozonwerte # Dichteschätzer und Boxplots: par(mfrow=c(2,2)) plot(density(d.ozone)) plot(density(d.logozone)) boxplot(d.ozone) boxplot(d.logozone) # Kurtosis und Skewness Funktionen: kurt<-function(x) { xbar <- mean(x) n <- length(x) s <- sqrt(var(x)) * sqrt((n - 1)/n) num <- sum((x - xbar)^4)/n den <- s^4 return(num/den - 3) } skew<-function(x) { xbar <- mean(x) n <- length(x) s <- sqrt(var(x)) * sqrt((n - 1)/n) num <- sum((x - xbar)^3)/n den <- s^3 return(num/den) } cat("-------------------------------------------------------------","\n") print("skewness and kurtosis for d.ozone:") print(c(skew(d.ozone),kurt(d.ozone))) cat("-------------------------------------------------------------","\n") print("skewness and kurtosis for log d.ozone:") print(c(skew(d.logozone),kurt(d.logozone))) cat("-------------------------------------------------------------","\n") print("skewness and kurtosis for log ozone without 17:") print(c(skew(d.logozone[-17]),kurt(d.logozone[-17]))) cat("-------------------------------------------------------------","\n") #Teilaufgabe b) # Scatterplots d.air<-cbind(d.air,d.logozone) pairs(d.air,panel=panel.smooth) # Teilaufgabe c) d.air<-d.air[-17,] # Ausreisser entfernen # Lineares Model mit ozone als Zielvariable: lm.oz<-lm(formula = ozone ~ radiation+temperature+wind,data=d.air) res.lm.oz<-lm.oz$residuals fit.lm.oz<-lm.oz$fitted.values print(summary(lm.oz)) # Lineares Model mit Log ozone als Zielvariable: lm.loz<-lm(formula = d.logozone ~ radiation+temperature+wind,data=d.air) res.lm.loz<-lm.loz$residuals fit.lm.loz<-lm.loz$fitted.values print(summary(lm.loz)) # Die drei Variablen sind klar signifikant # Teilaufgabe d) library(mgcv) # Additives Model mit ozone als Zielvariable: am.oz<-gam(formula = ozone ~ s(radiation)+s(temperature)+s(wind),data=d.air) res.am.oz<-am.oz$residuals fit.am.oz<-am.oz$fitted.values print(summary(am.oz)) # Additives Model mit d.logozone als Zielvariable: am.loz<-gam(formula = d.logozone ~ s(radiation)+s(temperature)+s(wind),data=d.air) res.am.loz<-am.loz$residuals fit.am.loz<-am.loz$fitted.values print(summary(am.loz)) # Die drei Variablen sind klar signifikant # Graphische Darstellung von c) und d) # Residuen versus Fitted values (R^2 wird auch berechnet) par(mfrow=c(2,2)) plot(fit.lm.oz,res.lm.oz) # Tukey-Anscomb-Plot abline(h=0,lty=2) r2<-cor(fit.lm.oz,res.lm.oz+fit.lm.oz)^2 title(paste("Linear without transformation R2=",round(r2,2))) plot(fit.lm.loz,res.lm.loz) abline(h=0,lty=2) r2<-cor(fit.lm.loz,res.lm.loz+fit.lm.loz)^2 title(paste("Linear with transformation R2=",round(r2,2))) plot(fit.am.oz,res.am.oz) abline(h=0,lty=2) r2<-cor(fit.am.oz,res.am.oz+fit.am.oz)^2 title(paste("Additive without transformation R2=",round(r2,2))) plot(fit.am.loz,res.am.loz) abline(h=0,lty=2) r2<-cor(fit.am.loz,res.am.loz+fit.am.loz)^2 title(paste("Additive with transformation R2=",round(r2,2))) # Die R2 sind höher beim "Additiven Modell". # Auch die Residuen sehen besser aus beim "additiven Modell". #------------------------------------------------------------------------ # Absolut-Residuen versus Fitted values # (Spearman Korrelation wird auch berechnet) par(mfrow=c(2,2)) plot(fit.lm.oz,abs(res.lm.oz)) abline(h=0,lty=2) r<-cor(rank(fit.lm.oz),rank(abs(res.lm.oz))) title(paste("Linear without transformation r=",round(r,2))) plot(fit.lm.loz,abs(res.lm.loz)) abline(h=0,lty=2) r<-cor(rank(fit.lm.loz),rank(abs(res.lm.loz))) title(paste("Linear with transformation r=",round(r,2))) plot(fit.am.oz,abs(res.am.oz)) abline(h=0,lty=2) r<-cor(rank(fit.am.oz),rank(abs(res.am.oz))) title(paste("Additive without transformation r=",round(r,2))) plot(fit.am.loz,abs(res.am.loz)) abline(h=0,lty=2) r<-cor(rank(fit.am.loz),rank(abs(res.am.loz))) title(paste("Additive with transformation r=",round(r,2))) # Konstanz der Varianz ist besser für die "additiven Modelle", aber nicht # unbedingt für die Log-Transformation. # Teilaufgabe e) # Man kann diese Aufgabe entweder mit ozone oder logozone als Zielvariable # durchführen # Zuerst die 8 Modelle mit ozone: am.oz0<-gam(formula = ozone ~ s(radiation)+s(temperature)+s(wind),data=d.air) r2<-cor(am.oz0$fitted.values,am.oz0$fitted.values+am.oz0$residuals)^2 print(summary(am.oz0)) cat("R2=",round(r2,3),"\n") cat("-------------------------------------------------------------","\n") am.oz1<-gam(formula = ozone ~ radiation+s(temperature)+s(wind),data=d.air) r2<-cor(am.oz1$fitted.values,am.oz1$fitted.values+am.oz1$residuals)^2 print(summary(am.oz1)) cat("R2=",round(r2,3),"\n") cat("-------------------------------------------------------------","\n") am.oz2<-gam(formula = ozone ~ s(radiation)+temperature+s(wind),data=d.air) r2<-cor(am.oz2$fitted.values,am.oz2$fitted.values+am.oz2$residuals)^2 print(summary(am.oz2)) cat("R2=",round(r2,3),"\n") cat("-------------------------------------------------------------","\n") am.oz3<-gam(formula = ozone ~ s(radiation)+s(temperature)+wind,data=d.air) r2<-cor(am.oz3$fitted.values,am.oz3$fitted.values+am.oz3$residuals)^2 print(summary(am.oz3)) cat("R2=",round(r2,3),"\n") cat("-------------------------------------------------------------","\n") am.oz4<-gam(formula = ozone ~ radiation+temperature+s(wind),data=d.air) r2<-cor(am.oz4$fitted.values,am.oz4$fitted.values+am.oz4$residuals)^2 print(summary(am.oz4)) cat("R2=",round(r2,3),"\n") cat("-------------------------------------------------------------","\n") am.oz5<-gam(formula = ozone ~ radiation+s(temperature)+wind,data=d.air) r2<-cor(am.oz5$fitted.values,am.oz5$fitted.values+am.oz5$residuals)^2 print(summary(am.oz5)) cat("R2=",round(r2,3),"\n") cat("-------------------------------------------------------------","\n") am.oz6<-gam(formula = ozone ~ s(radiation)+temperature+wind,data=d.air) r2<-cor(am.oz6$fitted.values,am.oz6$fitted.values+am.oz6$residuals)^2 print(summary(am.oz6)) cat("R2=",round(r2,3),"\n") cat("-------------------------------------------------------------","\n") am.oz7<-gam(formula = ozone ~ radiation+temperature+wind,data=d.air) drop1(am.oz7) r2<-cor(am.oz7$fitted.values,am.oz7$fitted.values+am.oz7$residuals)^2 print(summary(am.oz7)) cat("R2=",round(r2,3),"\n") cat("-------------------------------------------------------------","\n") # Dasselbe mit logozone als Zielvariable: am.loz0<-gam(formula = d.logozone ~ s(radiation)+s(temperature)+s(wind),data=d.air) r2<-cor(am.loz0$fitted.values,am.loz0$fitted.values+am.loz0$residuals)^2 print(summary(am.loz0)) cat("R2=",round(r2,3),"\n") cat("-------------------------------------------------------------","\n") am.loz1<-gam(formula = d.logozone ~ radiation+s(temperature)+s(wind),data=d.air) r2<-cor(am.loz1$fitted.values,am.loz1$fitted.values+am.loz1$residuals)^2 print(summary(am.loz1)) cat("R2=",round(r2,3),"\n") cat("-------------------------------------------------------------","\n") am.loz2<-gam(formula = d.logozone ~ s(radiation)+temperature+s(wind),data=d.air) r2<-cor(am.loz2$fitted.values,am.loz2$fitted.values+am.loz2$residuals)^2 print(summary(am.loz2)) cat("R2=",round(r2,3),"\n") cat("-------------------------------------------------------------","\n") am.loz3<-gam(formula = d.logozone ~ s(radiation)+s(temperature)+wind,data=d.air) r2<-cor(am.loz3$fitted.values,am.loz3$fitted.values+am.loz3$residuals)^2 print(summary(am.loz3)) cat("R2=",round(r2,3),"\n") cat("-------------------------------------------------------------","\n") am.loz4<-gam(formula = d.logozone ~ radiation+temperature+s(wind),data=d.air) r2<-cor(am.loz4$fitted.values,am.loz4$fitted.values+am.loz4$residuals)^2 print(summary(am.loz4)) cat("R2=",round(r2,3),"\n") cat("-------------------------------------------------------------","\n") am.loz5<-gam(formula = d.logozone ~ radiation+s(temperature)+wind,data=d.air) r2<-cor(am.loz5$fitted.values,am.loz5$fitted.values+am.loz5$residuals)^2 print(summary(am.loz5)) cat("R2=",round(r2,3),"\n") cat("-------------------------------------------------------------","\n") am.loz6<-gam(formula = d.logozone ~ s(radiation)+temperature+wind,data=d.air) r2<-cor(am.loz6$fitted.values,am.loz6$fitted.values+am.loz6$residuals)^2 print(summary(am.loz6)) cat("R2=",round(r2,3),"\n") cat("-------------------------------------------------------------","\n") am.loz7<-gam(formula = d.logozone ~ radiation+temperature+wind,data=d.air) r2<-cor(am.loz7$fitted.values,am.loz7$fitted.values+am.loz7$residuals)^2 print(summary(am.loz7)) cat("R2=",round(r2,3),"\n") cat("-------------------------------------------------------------","\n") #1234567890123456789012345678901234567890123456789012345678901234567890123456789 #Ruth Meili # Aufgabe 2 von Serie 3 # Teilaufgabe a) # dAten einlesen cantmp<-read.table("http://stat.ethz.ch/Teaching/Datasets/NDK/temperatur2.dat") n <-dim(cantmp)[1] ind <-seq(.5,11.5,1) lab <- c("St. John_s", "Charlottetown", "Halifax" , "Sydney", "Yarmouth", "Fredericton", "Arvida", "Montreal", "Quebec City", "Schefferville", "Sherbrooke", "Kapuskasing", "London", "Ottawa", "Thunder Bay", "Toronto", "Churchill", "The Pas", "Winnipeg", "Prince Albert", "Regina", "Beaverlodge", "Calgary", "Edmonton", "Kamloops", "Prince George", "Prince Rupert", "Vancouver", "Victoria", "Dawson", "Whitehorse", "Frobisher Bay", "Inuvik", "Resolute", "Yellowknife") row.names(cantmp) <- lab # Graphische Darstelllung der Daten: plot(0,0,xlim=c(0,12),ylim=c(min(cantmp),max(cantmp)),type="n",xlab="month",ylab="temperature") for(i in 1:n) points(ind,cantmp[i,],pch=i) for(i in 1:n) lines(ind,cantmp[i,],lty=2) # Teilaufgabe b) grid<-seq(0,12,0.1) # Definition des feineren Gitters N<-length(grid) # Kurvenglättung mit glkerns: library(lokern) fit<-matrix(0,ncol=N,nrow=n) for(i in 1:n) fit[i,]<-glkerns(ind,cantmp[i,],x.out=grid)$est par(mfrow=c(1,1)) plot(0,0,xlim=c(0,12),ylim=c(min(cantmp),max(cantmp)),type="n",xlab="month",ylab="temperature") for(i in 1:n) lines(grid,fit[i,],lty=i) # Hauptkomponentenanalyse mit princomp() library(mva) t.hk <- princomp(fit,cor=F) str(t.hk$loadings) t.hk$loadings[,1] # Darstellung der Loading-Funktionen: par(mfrow=c(2,2)) plot(grid,t.hk$loadings[,1]) plot(grid,t.hk$loadings[,2]) plot(grid,t.hk$loadings[,3]) plot(grid,t.hk$loadings[,4]) var.expl <- t.hk$sdev[1:4]^2/sum(t.hk$sdev^2) # Erklärte Varianz screeplot(t.hk,npcs=4) # Plots der erklärten Varianz #-------------------------------------------------------------------- # Alternativ: # Haupt-Komponenten berechnen als Eigenvektoren der Kovarianz-Matrix: v<-eigen(var(fit))[[1]] # Eigenwerte P<-eigen(var(fit))[[2]] # Eigenvektoren # Darstellung der Loading-Funktionen: par(mfrow=c(2,2)) plot(grid,P[,1])# Erster Eigenvektor abline(h=0) plot(grid,P[,2])# zweiter Eigenvektor abline(h=0) plot(grid,P[,3])# dritter Eigenvektor abline(h=0) plot(grid,P[,4]) # Vierter Eigenvektor abline(h=0) locator(1) # Varianz-Anteil fuer die ersten 4 Haupt-Komponenten: cat("-------------------------------------------------------","\n") cat("Percentage of variance of principal components:","\n") print(round(v[1:4]/sum(v),3)) # Teilaufgabe d) # Varimax rotation: Q<-varimax(P[,1:4])[[1]] # analog in der princomp-Variante: Q2 <- varimax(-t.hk$loadings[,1:4])[[1]] #Vorzeichen sind bei princomp- und Handberechnung umgekehrt. for(i in 1:4) if(sum(Q[,i])<0) Q[,i]<--Q[,i] # Loading-Funktionen fuer Varimax: par(mfrow=c(2,2)) plot(grid,Q[,1]) abline(h=0) plot(grid,Q[,2]) abline(h=0) plot(grid,Q[,3]) abline(h=0) plot(grid,Q[,4]) abline(h=0) locator(1) # Teilaufgabe e) # Daten zentrieren: fitcent<-scale(fit,scale=F) # Scores bezueglich Hauptkomponenten: Z<-fitcent%*%P # Alternativ mit princomp-Methode: Z2 <- fitcent%*%t.hk$loadings # Scores bezueglich varimax: V<-fitcent%*%Q # Alternativ mit princomp-Methode: V2 <- fitcent%*%Q2 # Varianz von Varimax Komponenten: cat("Percentage of variance of rotated principal components:","\n") print(round(diag(var(V))/sum(v),3)) barplot(round(diag(var(V))/sum(v),3),xlab="Varianzkomponenten",ylab="Varianzanteil") # Plots der Scores bezueglich den ersten zwei Hauptkomponenten: par(mfrow=c(1,1)) plot(Z[,1],Z[,2],type="n") text(Z[,1],Z[,2],row.names(cantmp)) # Alternativ: Biplot biplot(t.hk,choices=1:2,scale=1,xlabs=rownames(cantmp),expand=0) # expand=0 bedeutet, dass die Pfeile der Variablen nicht gezeichnet werden # Plots der Scores bezüglich der ersten und dritten Varimax-Komponenten: par(mfrow=c(1,1)) plot(V[,1],V[,3],type="n") text(V[,1],V[,3],row.names(cantmp))