衣带渐宽终不悔分享 http://blog.sciencenet.cn/u/tuqiang2014 在康河的柔波里,做一条水草, 向青草更青处漫溯。

博文

环境与生态统计||探索性数据分析

已有 1022 次阅读 2019-1-12 11:07 |系统分类:科研笔记

rm(list=ls())
source("D:\\Users\\Administrator\\Desktop\\RStudio\\Environmental_and_Ecological_Statistics_with_R\\DataCode\\Code\\FrontMatter.R")

##################
### Chapter 3  ###
##################
plotDIRch3 <- paste (plotDIR, "chapter3", "figures", sep="/")
if(file_test("-d", plotDIRch3)){cat("plotDIRch3 already exists")}else{dir.create(plotDIRch3,recursive = TRUE)}

##### Everglades reference sites
wca2tp <- read.csv(paste(dataDIR, "WCA2TP.csv", sep="/"), header=T)
wca2tp$TP <- 1000*wca2tp$RESULT
TP.reference <- wca2tp[wca2tp$Type=="R",]
TP.reference$SITE <- as.vector( TP.reference$SITE)
TP.reference$Month <- ordered(months(as.Date(TP.reference$Date, "%m/%d/%y"), T),
  levels=c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"))
展示分布图形
par(mfrow=c(1,2), mar=c(2.5,2.5,.5,1), mgp=c(1.25, 0.125, 0), las=1, tck=0.01)
hist(log(TP.reference$TP), dens=-1, xlab="TP (ppb)",
     ylab="", main="", nclass=20)
hist(log(TP.reference$TP), dens=-1, xlab="TP (ppb)",
     ylab="", main="", nclass=10)

直方图会受分组个数的影响。

分位数图
TP.example<- c(0.21, 0.35, 0.50, 0.64, 0.79,0.90, 1.00, 1.01, 1.12, 5.66)
#postscript(file=paste(plotDIR,"evergQplot.eps", sep="/"),
#           height=2, width=3, horizontal=F)
#tikz(file=paste(plotDIRch3,"evergQplot.tex", sep="/"),
#           height=2, width=3, standAlone=F)
par(mar=c(2.5,2.5,.5,1), mgp=c(1.25, 0.125, 0), las=1, tck=0.01)
plot((1:length(TP.example) -0.5)/length(TP.example), TP.example, xlab="f", ylab="TP (ppb)")

Tukey 的箱线图
## Figure \ref{fig:clevelandBox} 3.6
data <-
  c(0.9, 1.6, 2.26305, 2.55052, 2.61059, 2.69284, 2.78511, 2.80955,
    2.94647, 2.96043, 3.05728, 3.15748, 3.18033, 3.20021,
    3.20156, 3.24435, 3.33231, 3.34176, 3.3762, 3.39578, 3.4925,
    3.55195, 3.56207, 3.65149, 3.72746, 3.73338, 3.73869,
    3.80469, 3.85224, 3.91386, 3.93034, 4.02351, 4.03947,
    4.05481, 4.10111, 4.26249, 4.28782, 4.37586, 4.48811,
    4.6001, 4.65677, 4.66167, 4.73211, 4.80803, 4.9812, 5.17246,
    5.3156, 5.35086, 5.36848, 5.48167, 5.68, 5.98848, 6.2, 7.1,
    7.4)

## explaining the box plot
uq <- quantile(data,.75)
lq <- quantile(data,.25)
r <- 1.5*(uq-lq)
h <- c(lq-r,1.6,lq,uq,6.2,uq+r)
writing <- c("lower quartile - 1.5 r",
             "lower adjacent value",
             "lower quartile",
             "upper quartile",
             "upper adjacent value",
             "upper quartile + 1.5 r")

n <- length(data)

#postscript(file=paste(plotDIR, "explainBox.eps", sep="/"),
#           width=4.75, height=2.25, horizontal=F)
#tikz(file=paste(plotDIRch3, "explainBox.tex", sep="/"),
#          width=4.75, height=2.25, standAlone=F)
par(mfrow=c(1,2), mar=c(2.5,2.5,.25,0.125), mgp=c(1.25, 0.125, 0),
    las=1, tck=0.01)
            #layout(matrix(c(1,2), nrow=1), width=c(1,1.5))
boxplot(data, rep(NA, length(data)), rep(NA, length(data)), ylab = "Data")
usr <- par("usr")
x <- usr[1] + (usr[2] - usr[1]) * 1/3
at <- c(0.9, 1.6, 3.2, 3.8, 4.65, 6.2, 7.2)
arrows(rep(x * 1.15, 7), at, rep(x, 7), at, length=0.075)
mtext("Main Title",1,1,cex=.8)
text(rep(x * 1.2, 7), at, adj = 0, cex=0.7,
     labels = c("outside value", "lower adjacent value",
       "lower quartile", "median", "upper quartile",
       "upper adjacent value", "outside values"))

plot(((1:n)-0.5)/n, data, xlab="f-value", ylab="Data")
abline(h = h, lwd =2, col = gray(0.85))
text(rep(0,3), h[4:6], writing[4:6], adj=0, cex=0.75)
text(rep(1,3), h[1:3], writing[1:3], adj=1, cex=0.75)
points(((1:n)-0.5)/n, data)

比较分布的图形
## Figure \ref{fig:addVmlt} 3.7
### Q-Q plot for additive and multiplicative shifts
x.data <- rnorm(500)
y.data <- rnorm(500, 2)
#postscript(file=paste(plotDIR,"additive.eps", sep="/"),
#           height=4, width=3.5, horizontal=F)
#tikz(file=paste(plotDIRch3,"additive.tex", sep="/"),
#     height=4, width=3.5, standAlone=F)
layout(matrix(c(1,3,2,3), nrow=2), heights=c(1,2), respect=T)
par(mar=c(2.5,2.5,.5,0.5), mgp=c(1.25, 0.125, 0), las=1, tck=0.01)
hist(x.data, xlim=range(c(x.data,y.data)), xlab="x", ylab="", main="", prob=T)
curve(dnorm(x), add=T, from=-3, to=3)
hist(y.data, xlim=range(c(x.data,y.data)), xlab="y", ylab="", main="", prob=T)
curve(dnorm(x, mean=2), add=T, from=-1, to=5)
qqplot(x.data, y.data, xlim=range(c(x.data,y.data)),
       ylim=range(c(x.data,y.data)), xlab="x", ylab="y")
abline(0,1, col=gray(0.5))
abline(2,1, col=gray(0.5))

x.data2 <- exp(rnorm(100))
y.data2 <- exp(rnorm(100, 1))

#postscript(file=paste(plotDIR, "multiplicative.eps", sep="/"),
#           height=4, width=3.5, horizontal=F)
#tikz(file=paste(plotDIRch3, "multiplicative.tex", sep="/"),
#     height=4, width=3.5, standAlone=F)
layout(matrix(c(1,3,2,3), nrow=2), heights=c(1,2), respect=T)
par(mar=c(2.5,2.5,.5,0.5), mgp=c(1.25, 0.125, 0), las=1, tck=0.01)
hist(x.data2, xlim=range(c(x.data2,y.data2)),
     xlab="x", ylab="", main="", prob=T, nclass=10)
curve(dlnorm(x), add=T, from=exp(-3), to=exp(3))
hist(y.data2, xlim=range(c(x.data2,y.data2)),
     xlab="y", ylab="", main="", prob=T, nclass=10)
curve(dlnorm(x, mean=1), add=T, from=exp(-2), to=exp(4))
qqplot(log(x.data2), log(y.data2),  xlab="x", ylab="y")
abline(0,1, col=gray(0.5))

layout(rbind(c(1,2,0,4,5), c(3,3,0,6,6)), widths=c(1,1,lcm(0.5), 1,1),
       heights=c(1,2), respect=T)
par(mar=c(2.5,2.5,.5,0.5), mgp=c(1.25, 0.125, 0), las=1, tck=0.01)

hist(x.data, xlim=range(c(x.data,y.data)), xlab="x", ylab="", main="", prob=T)
curve(dnorm(x), add=T, from=-3, to=3)
hist(y.data, xlim=range(c(x.data,y.data)), xlab="y", ylab="", main="", prob=T)
curve(dnorm(x, mean=2), add=T, from=-1, to=5)
qqplot(x.data, y.data, xlim=range(c(x.data,y.data)),
       ylim=range(c(x.data,y.data)),xlab="x", ylab="y")
abline(0,1, col=gray(0.5))
abline(2,1, col=gray(0.5))

hist(x.data2, xlim=range(c(x.data2,y.data2)),
     xlab="x", ylab="", main="", prob=T, nclass=10)
curve(dlnorm(x), add=T, from=exp(-3), to=exp(3))
hist(y.data2, xlim=range(c(x.data2,y.data2)),
     xlab="y", ylab="", main="", prob=T, nclass=10)
curve(dlnorm(x, mean=1), add=T, from=exp(-2), to=exp(4))
qqplot(x.data2, y.data2, xlim=range(c(x.data2,y.data2)),
       ylim=range(c(x.data2,y.data2)), xlab="x", ylab="y")
abline(0,1, col=gray(0.5))

识别变量间的依存关系的图形
## Car.test.frame in package rpart
packages(rpart)
names(car.test.frame)
#postscript(file=paste(plotDIR, "carsTest.eps", sep="/"),
#           width=4.75, height=2, horizontal=F)
#tikz(file=paste(plotDIRch3, "carsTest.tex", sep="/"),
 #          width=4.75, height=2, standAlone=F)
par(mfrow=c(1,2), mar=c(2.5,2.5,.5,0.5),
    mgp=c(1.25, 0.125, 0), las=1, tck=0.01)
plot(Mileage ~ Weight, data=car.test.frame,
     xlab="Weight (lb)", ylab="Mileage (mpg)")
abline(lsfit(car.test.frame$Weight, car.test.frame$Mileage))

plot(1/Mileage ~ Weight, data=car.test.frame,
     xlab="Weight (lb)", ylab="Mileage (mpg)")
abline(lsfit(car.test.frame$Weight, car.test.frame$Mileage),
       col=gray(0.5), lty=2)
cars.lo <- loess(I(1/Mileage) ~ Weight, data = car.test.frame)
oo <- order(cars.lo$x)
lines(cars.lo$x[oo], cars.lo$fitted[oo])

散点图矩阵
## Car.test.frame in package rpart

panel.hist = function(x, col.hist="grey", ...)
  {
    usr <- par("usr"); on.exit(par(usr))
    par(usr = c(usr[1:2], 0, 1.5) )
    h <- hist(x, plot = FALSE)
    breaks <- h$breaks; nB <- length(breaks)
    y <- h$counts; y <- y/max(y)
    rect(breaks[-nB], 0, breaks[-1],y, col=col.hist)
  }

#postscript(file=paste(plotDIR, "airquality.eps", sep="/"),
#           height=4.5, width=4.5, horizontal=F)
#tikz(file=paste(plotDIRch3, "airquality.tex", sep="/"),
#           height=4.5, width=4.5, standAlone=F)
pairs(Ozone~Solar.R+Wind+Temp, data=airquality,
      panel=function(x, y, ...){
        points(x, y,  ...)
        panel.smooth(x, y, col.smooth=1, ...)
      }, col=gray(0.5), cex=0.5, diag.panel=panel.hist)

### the iris data
Snames <- dimnames(iris3)[[3]]
iris.df <- rbind(iris3[,,1],iris3[,,2],iris3[,,3])
iris.df <- as.data.frame(iris.df)
iris.df$Species <- factor(rep(Snames,rep(50,3)))
#pairs(iris.df[1:4],main = "Anderson?s Iris Data",
#pch = c("<", "+", ">")[unclass(iris$Species)],
#col = c(gray(0.25),gray(0.5),gray(0.25))[unclass(iris$Species)])

#postscript(file=paste(plotDIR, "irisPairs.eps", sep="/"),
#           height=4.5, width=4.5, horizontal=F)
#tikz(file=paste(plotDIRch3, "irisPairs.tex", sep="/"),
 #          height=4.5, width=4.5, standAlone=F)
pairs(iris.df[1:4],main = "The Iris Data",
      pch = c(2, 4, 6)[unclass(iris$Species)],
      col = c("#BFBFBF", "#808080", "#404040")[unclass(iris$Species)])

变量转换
nadb <- read.csv(paste(dataDIR, "nadb.csv", sep="/"), header=T)
par(mfrow=c(1,2), mar=c(2.5,2.5,.5,0.5), mgp=c(1.5, 0.5, 0))
plot(TPOut ~ PLI, data=nadb,
     xlab="P Loading", ylab="P Concentration", cex=0.5)
plot(TPOut ~ PLI, data=nadb,
     xlab="P Loading", ylab="P Concentration", log="x", axes=F, cex=0.5)
axis(1, at=c(0.01, 1, 100), label=c("0.01", "1", "100"))
axis(2)
box()

  • 幂转换

    powerT <- function(y, lambda1=c(-1,-1/2,-1/4), lambda2 = c(1/4, 1/2, 1), layout1=2){
    nt <- length(lambda1)+length(lambda2)+1
    transformed <- cbind(outer(y,lambda1,"^"),log(y),(outer(y,lambda2,"^")))
    y.power <- data.frame(transformed=c(transformed),
                          lambda = factor(rep(round(c(lambda1,0,lambda2), 2),
                            rep(length(y),nt))))
    ans <- qqmath(~transformed | lambda,
                  data=y.power,
                  prepanel = prepanel.qqmathline,
                  panel = function(x, ...) {
                    panel.grid(h = 0)
                    panel.qqmath(x, col=gray(0.5))
                    panel.qqmathline(x, distribution = qnorm)
                  }, aspect=1, scale = list(y = "free"),
                  layout=c(layout1,ceiling(nt/layout1)),
                  xlab = "Unit Normal Quantile",
                  ylab = "y")
    ans
    }
    trellis.par.set(theme = canonical.theme("postscript", col=FALSE))
    powerT(y=nadb$PLI)

  • 双变量散点图

pmdata<-read.table(paste(dataDIR, "PM-RAW-DATA.txt", sep="/"), header=T)
pmdata$log.value<-log(pmdata$value)
xyplot(log.value~AvgTemp, panel=function(x,y,...){
  panel.grid()
  panel.xyplot(x,y, col=grey(0.65), cex=0.5, ...)
  panel.loess(x,y,span=1, degree=1,col=1,...)
}, scales=list(x=list(cex=0.75), y=list(cex=0.75)),
       par.settings=trellis.par.temp,
       data=pmdata, xlab="Average Daily Temperature (F)", ylab="Log PM2.5")

## traditional coplot
coplot(sqrt(Ozone) ~ Solar.R|Wind*Temp , data=airquality,
       given.values=list(co.intervals(airquality$Wind, 3, 0.25),
         co.intervals(airquality$Temp, 3, 0.25)),
       panel=function(x, y, ...)
       panel.smooth(x, y, span=1, ...),
       ylab="Log Ozone",
       xlab=c("Solar Radiation", "Wind","Temperature"))

WindSpeed <- equal.count(airquality$Wind, 3, 0.25)
Temperature <- equal.count(airquality$Temp, 3, 0.25)
SolarR <- equal.count(airquality$Solar.R, 3, 0.25)

print(
xyplot(sqrt(Ozone) ~ SolarR|WindSpeed*Temperature,
       data=airquality,
       panel=function(x,y,...){
#            panel.loess(x, y, span=1, degree=1, ...)
            panel.grid()
            panel.lmline(x, y, col="grey",...)
            panel.xyplot(x, y, col=1, cex=0.5, ...)
       },
       ylab=list(label="Sqrt Ozone", cex=0.6),
       xlab=list(label="Solar Radiation", cex=0.6),
       scales=list(x=list(alternating=c(1, 2, 1))),
#       between=list(y=1),
       par.strip.text=list(cex=0.4), aspect=1,
       par.settings=list(axis.text=list(cex=0.4)))
)

print(
      xyplot(sqrt(Ozone) ~ Wind|SolarR*Temperature,
             data=airquality,
             panel=function(x,y,...){
#            panel.loess(x, y, span=1, degree=1, ...)
               panel.grid()
               panel.lmline(x, y, col="grey",...)
               panel.xyplot(x, y, col=1, cex=0.5, ...)
             },
             ylab=list(label="Sqrt Ozone", cex=0.6),
             xlab=list(label="Wind Speed", cex=0.6),
             scales=list(x=list(alternating=c(1, 2, 1))),
                                        #       between=list(y=1),
             par.strip.text=list(cex=0.4), aspect=1,
             par.settings=list(axis.text=list(cex=0.4)))
      )

print(
xyplot(sqrt(Ozone) ~ Temp|WindSpeed*SolarR,
       data=airquality,
       panel=function(x,y,...){
#            panel.loess(x, y, span=1, degree=1, ...)
            panel.grid()
            panel.lmline(x, y, col="grey",...)
            panel.xyplot(x, y, col=1, cex=0.5, ...)
       },
       ylab=list(label="Sqrt Ozone", cex=0.6),
       xlab=list(label="Temperature", cex=0.6),
       scales=list(x=list(alternating=c(1, 2, 1))),
#       between=list(y=1),
       par.strip.text=list(cex=0.4), aspect=1,
       par.settings=list(axis.text=list(cex=0.4)))
)




http://blog.sciencenet.cn/blog-1835014-1156645.html

上一篇:环境与生态统计||置信区间的估计
下一篇:环境与生态统计||探索性数据可视化

0

该博文允许注册用户评论 请点击登录 评论 (0 个评论)

数据加载中...

Archiver|手机版|科学网 ( 京ICP备14006957 )

GMT+8, 2020-2-19 14:18

Powered by ScienceNet.cn

Copyright © 2007- 中国科学报社

返回顶部