# 环境与生态统计||探索性数据分析

rm(list=ls())

##################
### Chapter 3  ###
##################
plotDIRch3 <- paste (plotDIR, "chapter3", "figures", sep="/")
if(file_test("-d", plotDIRch3)){cat("plotDIRch3 already exists")}else{dir.create(plotDIRch3,recursive = TRUE)}

wca2tp$TP <- 1000*wca2tp$RESULT
TP.reference <- wca2tp[wca2tp$Type=="R",] TP.reference$SITE <- as.vector( TP.reference$SITE) TP.reference$Month <- ordered(months(as.Date(TP.reference$Date, "%m/%d/%y"), T), levels=c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec")) ###### 展示分布图形 par(mfrow=c(1,2), mar=c(2.5,2.5,.5,1), mgp=c(1.25, 0.125, 0), las=1, tck=0.01) hist(log(TP.reference$TP), dens=-1, xlab="TP (ppb)",
ylab="", main="", nclass=20)
hist(log(TP.reference$TP), dens=-1, xlab="TP (ppb)", ylab="", main="", nclass=10) 直方图会受分组个数的影响。 ###### 分位数图 TP.example<- c(0.21, 0.35, 0.50, 0.64, 0.79,0.90, 1.00, 1.01, 1.12, 5.66) #postscript(file=paste(plotDIR,"evergQplot.eps", sep="/"), # height=2, width=3, horizontal=F) #tikz(file=paste(plotDIRch3,"evergQplot.tex", sep="/"), # height=2, width=3, standAlone=F) par(mar=c(2.5,2.5,.5,1), mgp=c(1.25, 0.125, 0), las=1, tck=0.01) plot((1:length(TP.example) -0.5)/length(TP.example), TP.example, xlab="f", ylab="TP (ppb)") ###### Tukey 的箱线图 ## Figure \ref{fig:clevelandBox} 3.6 data <- c(0.9, 1.6, 2.26305, 2.55052, 2.61059, 2.69284, 2.78511, 2.80955, 2.94647, 2.96043, 3.05728, 3.15748, 3.18033, 3.20021, 3.20156, 3.24435, 3.33231, 3.34176, 3.3762, 3.39578, 3.4925, 3.55195, 3.56207, 3.65149, 3.72746, 3.73338, 3.73869, 3.80469, 3.85224, 3.91386, 3.93034, 4.02351, 4.03947, 4.05481, 4.10111, 4.26249, 4.28782, 4.37586, 4.48811, 4.6001, 4.65677, 4.66167, 4.73211, 4.80803, 4.9812, 5.17246, 5.3156, 5.35086, 5.36848, 5.48167, 5.68, 5.98848, 6.2, 7.1, 7.4) ## explaining the box plot uq <- quantile(data,.75) lq <- quantile(data,.25) r <- 1.5*(uq-lq) h <- c(lq-r,1.6,lq,uq,6.2,uq+r) writing <- c("lower quartile - 1.5 r", "lower adjacent value", "lower quartile", "upper quartile", "upper adjacent value", "upper quartile + 1.5 r") n <- length(data) #postscript(file=paste(plotDIR, "explainBox.eps", sep="/"), # width=4.75, height=2.25, horizontal=F) #tikz(file=paste(plotDIRch3, "explainBox.tex", sep="/"), # width=4.75, height=2.25, standAlone=F) par(mfrow=c(1,2), mar=c(2.5,2.5,.25,0.125), mgp=c(1.25, 0.125, 0), las=1, tck=0.01) #layout(matrix(c(1,2), nrow=1), width=c(1,1.5)) boxplot(data, rep(NA, length(data)), rep(NA, length(data)), ylab = "Data") usr <- par("usr") x <- usr[1] + (usr[2] - usr[1]) * 1/3 at <- c(0.9, 1.6, 3.2, 3.8, 4.65, 6.2, 7.2) arrows(rep(x * 1.15, 7), at, rep(x, 7), at, length=0.075) mtext("Main Title",1,1,cex=.8) text(rep(x * 1.2, 7), at, adj = 0, cex=0.7, labels = c("outside value", "lower adjacent value", "lower quartile", "median", "upper quartile", "upper adjacent value", "outside values")) plot(((1:n)-0.5)/n, data, xlab="f-value", ylab="Data") abline(h = h, lwd =2, col = gray(0.85)) text(rep(0,3), h[4:6], writing[4:6], adj=0, cex=0.75) text(rep(1,3), h[1:3], writing[1:3], adj=1, cex=0.75) points(((1:n)-0.5)/n, data) ###### 比较分布的图形 ## Figure \ref{fig:addVmlt} 3.7 ### Q-Q plot for additive and multiplicative shifts x.data <- rnorm(500) y.data <- rnorm(500, 2) #postscript(file=paste(plotDIR,"additive.eps", sep="/"), # height=4, width=3.5, horizontal=F) #tikz(file=paste(plotDIRch3,"additive.tex", sep="/"), # height=4, width=3.5, standAlone=F) layout(matrix(c(1,3,2,3), nrow=2), heights=c(1,2), respect=T) par(mar=c(2.5,2.5,.5,0.5), mgp=c(1.25, 0.125, 0), las=1, tck=0.01) hist(x.data, xlim=range(c(x.data,y.data)), xlab="x", ylab="", main="", prob=T) curve(dnorm(x), add=T, from=-3, to=3) hist(y.data, xlim=range(c(x.data,y.data)), xlab="y", ylab="", main="", prob=T) curve(dnorm(x, mean=2), add=T, from=-1, to=5) qqplot(x.data, y.data, xlim=range(c(x.data,y.data)), ylim=range(c(x.data,y.data)), xlab="x", ylab="y") abline(0,1, col=gray(0.5)) abline(2,1, col=gray(0.5)) x.data2 <- exp(rnorm(100)) y.data2 <- exp(rnorm(100, 1)) #postscript(file=paste(plotDIR, "multiplicative.eps", sep="/"), # height=4, width=3.5, horizontal=F) #tikz(file=paste(plotDIRch3, "multiplicative.tex", sep="/"), # height=4, width=3.5, standAlone=F) layout(matrix(c(1,3,2,3), nrow=2), heights=c(1,2), respect=T) par(mar=c(2.5,2.5,.5,0.5), mgp=c(1.25, 0.125, 0), las=1, tck=0.01) hist(x.data2, xlim=range(c(x.data2,y.data2)), xlab="x", ylab="", main="", prob=T, nclass=10) curve(dlnorm(x), add=T, from=exp(-3), to=exp(3)) hist(y.data2, xlim=range(c(x.data2,y.data2)), xlab="y", ylab="", main="", prob=T, nclass=10) curve(dlnorm(x, mean=1), add=T, from=exp(-2), to=exp(4)) qqplot(log(x.data2), log(y.data2), xlab="x", ylab="y") abline(0,1, col=gray(0.5)) layout(rbind(c(1,2,0,4,5), c(3,3,0,6,6)), widths=c(1,1,lcm(0.5), 1,1), heights=c(1,2), respect=T) par(mar=c(2.5,2.5,.5,0.5), mgp=c(1.25, 0.125, 0), las=1, tck=0.01) hist(x.data, xlim=range(c(x.data,y.data)), xlab="x", ylab="", main="", prob=T) curve(dnorm(x), add=T, from=-3, to=3) hist(y.data, xlim=range(c(x.data,y.data)), xlab="y", ylab="", main="", prob=T) curve(dnorm(x, mean=2), add=T, from=-1, to=5) qqplot(x.data, y.data, xlim=range(c(x.data,y.data)), ylim=range(c(x.data,y.data)),xlab="x", ylab="y") abline(0,1, col=gray(0.5)) abline(2,1, col=gray(0.5)) hist(x.data2, xlim=range(c(x.data2,y.data2)), xlab="x", ylab="", main="", prob=T, nclass=10) curve(dlnorm(x), add=T, from=exp(-3), to=exp(3)) hist(y.data2, xlim=range(c(x.data2,y.data2)), xlab="y", ylab="", main="", prob=T, nclass=10) curve(dlnorm(x, mean=1), add=T, from=exp(-2), to=exp(4)) qqplot(x.data2, y.data2, xlim=range(c(x.data2,y.data2)), ylim=range(c(x.data2,y.data2)), xlab="x", ylab="y") abline(0,1, col=gray(0.5)) ###### 识别变量间的依存关系的图形 ## Car.test.frame in package rpart packages(rpart) names(car.test.frame) #postscript(file=paste(plotDIR, "carsTest.eps", sep="/"), # width=4.75, height=2, horizontal=F) #tikz(file=paste(plotDIRch3, "carsTest.tex", sep="/"), # width=4.75, height=2, standAlone=F) par(mfrow=c(1,2), mar=c(2.5,2.5,.5,0.5), mgp=c(1.25, 0.125, 0), las=1, tck=0.01) plot(Mileage ~ Weight, data=car.test.frame, xlab="Weight (lb)", ylab="Mileage (mpg)") abline(lsfit(car.test.frame$Weight, car.test.frame$Mileage)) plot(1/Mileage ~ Weight, data=car.test.frame, xlab="Weight (lb)", ylab="Mileage (mpg)") abline(lsfit(car.test.frame$Weight, car.test.frame$Mileage), col=gray(0.5), lty=2) cars.lo <- loess(I(1/Mileage) ~ Weight, data = car.test.frame) oo <- order(cars.lo$x)
lines(cars.lo$x[oo], cars.lo$fitted[oo])

###### 散点图矩阵
## Car.test.frame in package rpart

panel.hist = function(x, col.hist="grey", ...)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(usr[1:2], 0, 1.5) )
h <- hist(x, plot = FALSE)
breaks <- h$breaks; nB <- length(breaks) y <- h$counts; y <- y/max(y)
rect(breaks[-nB], 0, breaks[-1],y, col=col.hist)
}

#postscript(file=paste(plotDIR, "airquality.eps", sep="/"),
#           height=4.5, width=4.5, horizontal=F)
#tikz(file=paste(plotDIRch3, "airquality.tex", sep="/"),
#           height=4.5, width=4.5, standAlone=F)
pairs(Ozone~Solar.R+Wind+Temp, data=airquality,
panel=function(x, y, ...){
points(x, y,  ...)
panel.smooth(x, y, col.smooth=1, ...)
}, col=gray(0.5), cex=0.5, diag.panel=panel.hist)

### the iris data
Snames <- dimnames(iris3)[[3]]
iris.df <- rbind(iris3[,,1],iris3[,,2],iris3[,,3])
iris.df <- as.data.frame(iris.df)
iris.df$Species <- factor(rep(Snames,rep(50,3))) #pairs(iris.df[1:4],main = "Anderson?s Iris Data", #pch = c("<", "+", ">")[unclass(iris$Species)],
#col = c(gray(0.25),gray(0.5),gray(0.25))[unclass(iris$Species)]) #postscript(file=paste(plotDIR, "irisPairs.eps", sep="/"), # height=4.5, width=4.5, horizontal=F) #tikz(file=paste(plotDIRch3, "irisPairs.tex", sep="/"), # height=4.5, width=4.5, standAlone=F) pairs(iris.df[1:4],main = "The Iris Data", pch = c(2, 4, 6)[unclass(iris$Species)],
col = c("#BFBFBF", "#808080", "#404040")[unclass(iris$Species)]) ###### 变量转换 nadb <- read.csv(paste(dataDIR, "nadb.csv", sep="/"), header=T) par(mfrow=c(1,2), mar=c(2.5,2.5,.5,0.5), mgp=c(1.5, 0.5, 0)) plot(TPOut ~ PLI, data=nadb, xlab="P Loading", ylab="P Concentration", cex=0.5) plot(TPOut ~ PLI, data=nadb, xlab="P Loading", ylab="P Concentration", log="x", axes=F, cex=0.5) axis(1, at=c(0.01, 1, 100), label=c("0.01", "1", "100")) axis(2) box() • 幂转换 powerT <- function(y, lambda1=c(-1,-1/2,-1/4), lambda2 = c(1/4, 1/2, 1), layout1=2){ nt <- length(lambda1)+length(lambda2)+1 transformed <- cbind(outer(y,lambda1,"^"),log(y),(outer(y,lambda2,"^"))) y.power <- data.frame(transformed=c(transformed), lambda = factor(rep(round(c(lambda1,0,lambda2), 2), rep(length(y),nt)))) ans <- qqmath(~transformed | lambda, data=y.power, prepanel = prepanel.qqmathline, panel = function(x, ...) { panel.grid(h = 0) panel.qqmath(x, col=gray(0.5)) panel.qqmathline(x, distribution = qnorm) }, aspect=1, scale = list(y = "free"), layout=c(layout1,ceiling(nt/layout1)), xlab = "Unit Normal Quantile", ylab = "y") ans } trellis.par.set(theme = canonical.theme("postscript", col=FALSE)) powerT(y=nadb$PLI)

• 双变量散点图

pmdata<-read.table(paste(dataDIR, "PM-RAW-DATA.txt", sep="/"), header=T)
pmdata$log.value<-log(pmdata$value)
xyplot(log.value~AvgTemp, panel=function(x,y,...){
panel.grid()
panel.xyplot(x,y, col=grey(0.65), cex=0.5, ...)
panel.loess(x,y,span=1, degree=1,col=1,...)
}, scales=list(x=list(cex=0.75), y=list(cex=0.75)),
par.settings=trellis.par.temp,
data=pmdata, xlab="Average Daily Temperature (F)", ylab="Log PM2.5")

## traditional coplot
coplot(sqrt(Ozone) ~ Solar.R|Wind*Temp , data=airquality,
given.values=list(co.intervals(airquality$Wind, 3, 0.25), co.intervals(airquality$Temp, 3, 0.25)),
panel=function(x, y, ...)
panel.smooth(x, y, span=1, ...),
ylab="Log Ozone",

WindSpeed <- equal.count(airquality$Wind, 3, 0.25) Temperature <- equal.count(airquality$Temp, 3, 0.25)
SolarR <- equal.count(airquality\$Solar.R, 3, 0.25)

print(
xyplot(sqrt(Ozone) ~ SolarR|WindSpeed*Temperature,
data=airquality,
panel=function(x,y,...){
#            panel.loess(x, y, span=1, degree=1, ...)
panel.grid()
panel.lmline(x, y, col="grey",...)
panel.xyplot(x, y, col=1, cex=0.5, ...)
},
ylab=list(label="Sqrt Ozone", cex=0.6),
scales=list(x=list(alternating=c(1, 2, 1))),
#       between=list(y=1),
par.strip.text=list(cex=0.4), aspect=1,
par.settings=list(axis.text=list(cex=0.4)))
)

print(
xyplot(sqrt(Ozone) ~ Wind|SolarR*Temperature,
data=airquality,
panel=function(x,y,...){
#            panel.loess(x, y, span=1, degree=1, ...)
panel.grid()
panel.lmline(x, y, col="grey",...)
panel.xyplot(x, y, col=1, cex=0.5, ...)
},
ylab=list(label="Sqrt Ozone", cex=0.6),
xlab=list(label="Wind Speed", cex=0.6),
scales=list(x=list(alternating=c(1, 2, 1))),
#       between=list(y=1),
par.strip.text=list(cex=0.4), aspect=1,
par.settings=list(axis.text=list(cex=0.4)))
)

print(
xyplot(sqrt(Ozone) ~ Temp|WindSpeed*SolarR,
data=airquality,
panel=function(x,y,...){
#            panel.loess(x, y, span=1, degree=1, ...)
panel.grid()
panel.lmline(x, y, col="grey",...)
panel.xyplot(x, y, col=1, cex=0.5, ...)
},
ylab=list(label="Sqrt Ozone", cex=0.6),
xlab=list(label="Temperature", cex=0.6),
scales=list(x=list(alternating=c(1, 2, 1))),
#       between=list(y=1),
par.strip.text=list(cex=0.4), aspect=1,
par.settings=list(axis.text=list(cex=0.4)))
)

http://blog.sciencenet.cn/blog-1835014-1156645.html

## 全部精选博文导读

GMT+8, 2020-12-1 10:16