R: Analysis of precipitation trends in the US

5 November, 2010 (15:23) | R, Spatial analysis | By: Manuel Gimond

Data files used with the following script were downloaded from: ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/.

# Author: Manuel Gimond
# Date: 10/29/2010
#
# Purpose: Extract data from DAILY GLOBAL HISTORICAL CLIMATOLOGY NETWORK for
# a select time period. Only stations that satisfy the minimum NA count threshold
# for the time period are kept.
# The script converts the aggregated monthly data to z values, then generates trend
# values for each station using linear regression model. The original inspiration
# for this script was to reproduce fig.1 of Balling and Goodrich's "Spatial analysis
# of variations in precipitation intensity in the USA" (Theor Appl Climatol, Oct 2010)
# Libraries
# library(MASS) # Used if robust regression is chosen
#
# User variables
my.path  <- "ghcnd_hcn" # Directory where raw files are stored
file.ext <- ".dly" # Extension of filenames
met.par  <- "PRCP" # Meteorological parameter to extract
met.conv <- 0.1 # Factor to convert raw number to real number (e.g. precip = 0.1)
na.cap   <- 0.05 # Fraction of no-data values accepted for analysis (e.g. 0.05 = 5%)
yr.st    <- 1975 # Start year (inclusive)
yr.en    <- 2009 # End year (inclusive)
robust.z <- FALSE # Use robust z scores? TRUE or FALSE (T/F are reassignable; avoid)
#
######################### Functions #########################################
# FUNCTION: Computes standardized z values using conventional or robust methods
# Note: all data values are converted including year
# Table format should follow:
# |Year,sta1,sta2,sta3,...|
Zval <- function(x.in, Robust) {
  # Standardize every column of x.in to z-scores.
  # Robust = TRUE  -> median / normalized-IQR estimators (quartile based)
  # Robust = FALSE -> mean / standard deviation estimators (moment based)
  # Note: ALL columns are converted, including the leading Year column;
  # the caller restores Year afterwards.
  # Expected table format: |Year,sta1,sta2,sta3,...|
  if (Robust) { # quartile-based estimators
    m.x <- apply(x.in, 2, median, na.rm = TRUE)
    # IQR / (qnorm(0.75) - qnorm(0.25)) rescales the IQR to a consistent
    # estimate of the standard deviation under normality
    s.x <- apply(x.in, 2, IQR, na.rm = TRUE) / (qnorm(0.75) - qnorm(0.25))
  } else { # moments-based estimators
    m.x <- apply(x.in, 2, mean, na.rm = TRUE)
    s.x <- apply(x.in, 2, sd, na.rm = TRUE)
  }
  #
  x.out <- x.in # Same shape and names as the input
  for (i in seq_len(ncol(x.in))) {
    # Guard against a zero scale (constant column) AND an NA scale
    # (all-NA column): the original `s.x[i] != 0` test aborts on NA.
    if (!is.na(s.x[i]) && s.x[i] != 0) {
      x.out[, i] <- (x.in[, i] - m.x[i]) / s.x[i] # z-score
    } else {
      x.out[, i] <- 0 # degenerate column: define its z-scores as 0
    }
  }
  return(x.out)
}
#
# FUNCTION: Computes linear trend for each station across
# entire time period. Appends coefficients to the dataframe.
# Choice is between robust regression (Robust=T) or non-robust
# regression (Robust=F).
# Table format should follow:
# |Year,sta1,sta2,sta3,...|
StaReg <- function(x, Robust) {
  # Fit a linear trend (station ~ Year) for every station column and append
  # three labelled rows -- intercept, slope, adjusted R^2 -- to the table.
  # Robust = TRUE uses MASS::rlm, Robust = FALSE uses lm.
  # Expected table format: |Year,sta1,sta2,sta3,...|
  #
  # The first element of each vector is the row label that lands in the Year
  # column of the appended rows; rbind() therefore coerces the whole table to
  # character, matching the original output format.
  a <- "intercept"
  b <- "slope"
  R <- "adj_R2" # Adjusted R2
  sta.lst <- names(x)
  # seq_along(...)[-1] is empty when there are no station columns,
  # unlike 2:length(...) which would run a reversed 2:1 loop.
  for (i in seq_along(sta.lst)[-1]) {
    # Build the formula explicitly (backticks protect non-standard names);
    # this avoids the fragile get() lookup inside the formula.
    fml <- as.formula(paste0("`", sta.lst[i], "` ~ `", sta.lst[1], "`"))
    if (Robust) {
      # MASS:: qualifies the call since library(MASS) is not loaded here
      lm.res <- MASS::rlm(fml, data = x, na.action = na.omit)
    } else {
      lm.res <- lm(fml, data = x, na.action = na.omit)
    }
    a[i] <- lm.res$coefficients[1] # Intercept
    b[i] <- lm.res$coefficients[2] # Slope
    R[i] <- summary(lm.res)$adj.r.squared
  }
  # Append coefficient rows to the existing table
  x <- rbind(x, a, b, R)
  return(x)
}
######################### END of Functions #########################################
#
# Internal variables
# Empty data frame holding per-file NA statistics; name is a character
# column (the original numeric(0) forced a silent coercion on first insert)
file.stat <- data.frame(name = character(0), expected = numeric(0),
                        values = numeric(0), null = numeric(0),
                        frac = numeric(0), stringsAsFactors = FALSE)
# Anchor and escape the extension so ".dly" is matched literally at the end
# of the filename (an unescaped "." matches any character in a regex)
file.lst <- list.files(path = my.path, pattern = paste0("\\", file.ext, "$"))
# Number of expected daily records for each station; every month is padded
# to 31 days in the .dly format, short months contribute NAs downstream
yr.days <- (yr.en - yr.st + 1) * 12 * 31
#
# Internal variables specific to precipitation data: one accumulator table
# per index, each seeded with the Year column that stations merge onto
all.Total <- data.frame(Year = seq(yr.st, yr.en))
all.INTNS <- data.frame(Year = seq(yr.st, yr.en))
all.N50   <- data.frame(Year = seq(yr.st, yr.en))
#
for (i in seq_along(file.lst)) {
print(c("Processing", file.lst[i]))
file.stat[i, 1] <- file.lst[i]
# Fixed-width read; the -1 widths skip the per-day flag columns
sub1 <- read.fwf(file.path(my.path, file.lst[i]), na.strings = "-9999",
                 width = c(11, 4, 2, 4, rep(c(5, -1, -1, -1), 31)))
# Grab only the meteorological parameter of interest
met.data <- sub1[sub1$V4 == met.par, ]
# Grab only the years of interest
met.data <- met.data[met.data$V2 >= yr.st & met.data$V2 <= yr.en, ]
# NA statistics over the 31 daily-value columns (header columns 1-4 excluded).
# Months with fewer than 31 days also count their padding as NA; this should
# not influence the results significantly. Files with entirely missing
# year/month rows are accounted for by the shortfall against yr.days.
met.data.na  <- sum(is.na(met.data[, -(1:4)]))   # Total NA
met.data.num <- sum(!is.na(met.data[, -(1:4)]))  # Total valid values
met.data.na  <- (yr.days - (met.data.num + met.data.na)) + met.data.na
met.data.na.frac <- met.data.na / yr.days
# Record file stats
file.stat[i, ] <- c(file.lst[i], yr.days, met.data.num, met.data.na,
                    met.data.na.frac)
if (met.data.na.frac <= na.cap) {
# Apply the conversion factor to the raw values and keep only the
# year (col 2), month (col 3) and the 31 day columns.
# (The original file had a broken, duplicated version of this statement
# whose stray "* met.conv" continuation line was a parse error.)
met.data.conv <- cbind(met.data[2], met.data[3], met.data[, -(1:4)] * met.conv)
# Monthly then yearly count of rain events (days with precipitation > 0)
prec.cnt.mon <- cbind(met.data.conv[1], met.data.conv[2],
                      rowSums(met.data.conv[, -(1:2)] > 0, na.rm = TRUE))
prec.cnt.yr <- aggregate(prec.cnt.mon[, -(1:2)], by = prec.cnt.mon[1],
                         FUN = "sum") # total yearly count of rain events
# Monthly then yearly precipitation totals
prec.tot.mon <- cbind(met.data.conv[1], met.data.conv[2],
                      rowSums(met.data.conv[, -(1:2)], na.rm = TRUE))
prec.tot.yr <- aggregate(prec.tot.mon[, -(1:2)], by = prec.tot.mon[1],
                         FUN = "sum")
# Mean intensity = yearly total / yearly event count.
# NOTE(review): a year with zero rain events yields Inf/NaN here -- confirm
# whether such years can pass the NA threshold.
prec.INTNS.yr <- cbind(prec.tot.yr[1], prec.tot.yr[2] / prec.cnt.yr[2])
# Yearly count of heavy rain events (>= 50 mm after conversion)
prec.N50.mon <- cbind(met.data.conv[1], met.data.conv[2],
                      rowSums(met.data.conv[, -(1:2)] >= 50, na.rm = TRUE))
prec.N50.yr <- aggregate(prec.N50.mon[, -(1:2)], by = prec.N50.mon[1],
                         FUN = "sum") # yearly count of rain events above 50mm
prec.Total.yr <- prec.tot.yr
# Station ID = filename with the extension stripped; used as the column name
colnames(prec.Total.yr) <- c("Year", gsub(file.ext, "", file.lst[i]))
colnames(prec.INTNS.yr) <- c("Year", gsub(file.ext, "", file.lst[i]))
colnames(prec.N50.yr)   <- c("Year", gsub(file.ext, "", file.lst[i]))
# Outer-join each station onto the accumulator tables by Year
all.Total <- merge(all.Total, prec.Total.yr, by = "Year", all = TRUE)
all.INTNS <- merge(all.INTNS, prec.INTNS.yr, by = "Year", all = TRUE)
all.N50   <- merge(all.N50, prec.N50.yr, by = "Year", all = TRUE)
}
}
#
# Convert rain indices to z-scores (normalized value) for each station (i.e. by column)
# Note: Zval also standardizes the Year column; it is restored further below.
all.Total.z = Zval(all.Total,robust.z)
all.INTNS.z = Zval(all.INTNS,robust.z)
all.N50.z   = Zval(all.N50,robust.z)
#
# Compute temporal trend (station ~ Year regression) for each station.
# The second argument F forces non-robust lm() for the trend fit,
# independently of the robust.z choice above.
all.Total.coef   = StaReg(all.Total,F)
all.Total.z.coef = StaReg(all.Total.z,F)
all.INTNS.z.coef = StaReg(all.INTNS.z,F)
all.N50.z.coef   = StaReg(all.N50.z,F)
#
# Rename the first column (Year) to original value if it was converted to a z-score.
# all.Total.coef was fit on unstandardized data, so its first column still holds
# the real years plus the three row labels; all three tables share the same
# Year axis, which is why it is reused for INTNS and N50 as well.
all.Total.z.coef[,1] = all.Total.coef[,1]
all.INTNS.z.coef[,1] = all.Total.coef[,1]
all.N50.z.coef[,1]   = all.Total.coef[,1]
#
# Transpose new data subset so that the stations are listed as records.
# t() on a data frame returns a matrix; because the coefficient tables were
# coerced to character by StaReg's rbind, the transposed matrices are character.
all.Total.coef.t   = t(all.Total.coef)
all.Total.z.coef.t = t(all.Total.z.coef)
all.INTNS.t = t(all.INTNS)
all.INTNS.z.coef.t = t(all.INTNS.z.coef)
all.N50.t = t(all.N50)
all.N50.z.coef.t  = t(all.N50.z.coef)
#
# Rename header name and remove redundant header: after transposing, the first
# row holds the years (and the intercept/slope/adj_R2 labels); promote it to
# column names, then drop it from the data.
colnames(all.Total.coef.t) = all.Total.coef.t[1,]
colnames(all.Total.z.coef.t) = all.Total.z.coef.t[1,]
colnames(all.INTNS.t) = all.INTNS.t[1,]
colnames(all.INTNS.z.coef.t) = all.INTNS.z.coef.t[1,]
colnames(all.N50.t) = all.N50.t[1,]
colnames(all.N50.z.coef.t) = all.N50.z.coef.t[1,]
all.Total.coef.t = data.frame(all.Total.coef.t)[-1,]
all.Total.z.coef.t = data.frame(all.Total.z.coef.t)[-1,]
all.INTNS.t = data.frame(all.INTNS.t)[-1,]
all.INTNS.z.coef.t = data.frame(all.INTNS.z.coef.t)[-1,]
all.N50.t = data.frame(all.N50.t)[-1,]
all.N50.z.coef.t = data.frame(all.N50.z.coef.t)[-1,]
#
# Export to CSV file
# Export the total-precipitation coefficient tables (raw and z-scored) to CSV.
# col.names=NA with row.names=T writes a blank corner cell above the row names.
# NOTE(review): the INTNS and N50 tables are computed but never written out --
# confirm whether additional write.table calls were intended.
write.table(all.Total.coef.t,file="precip_Total_coef.csv", sep=",",
            quote=F,col.names=NA,row.names=T,na="") # Set NA to blank
write.table(all.Total.z.coef.t,file="precip_Total_z.csv", sep=",",
            quote=F,col.names=NA,row.names=T,na="") # Set NA to blank


The following figure shows the standardized regression coefficients from a Universal Kriging analysis of the precip_Total_z.csv file produced by the script above (the values are in fact correlation coefficients, since a regression slope between two standardized variables equals Pearson's r). The time period spans 1975 – 2009.