A basic transformation and regression example using Census data

9 April, 2012 (12:00) | R | By: Manuel Gimond

The data used in this example are 2010 ACS Census data of per capita income and fraction of population with a Bachelor’s degree or greater. The data file can be downloaded here.


# Before running the following lines, the user shoud run all
# functions found at the bottom of the page
#
# Read datafile
dat = read.csv("inc_ed.csv", header=T)
attach(dat)

# Generate a scatter plot of Y vs X where x and y scales
# are one SD unit in length. Plot the 45° SD line along
# with the mean vertical and horizontal lines
SDplot(percE_E,B19301_001E)

# Compute OLS
M1 = lm(B19301_001E ~ percE_E)

# Plot regression line
abline(M1,col="blue")

# Plot OLS results
OP = par(mfrow=c(2, 2))
plot(M1)
par(OP)

# The above assumes normally distributed data when in fact
# the data are not perfectly normal.
# Create a 'ladder' of transformations. The trans() function creates png files,
# so check the directory for image files after running trans().
trans(c("B19301_001E","percE_E"))

# Based on a visual analysis of the transformations,
# re-express the data as follows:
tY = BoxCox(B19301_001E,0.33)
tX = BoxCox(percE_E,0.5)

# Generate scatter plot of transformed data
SDplot(tX,tY)

# Rerun lm model with transformed data
M2 = lm(tY ~ tX)

# Plot regression line
abline(M2,col="blue")

# Plot lm diagnostic plots
OP = par(mfrow=c(2, 2))
plot(M2)
par(OP)

detach(dat)

######################################################################
##############
############## FUNCTIONS
##############
######################################################################
#
# ===================================================================
# ===== Standardization function
# ===================================================================
#
std = function(x,y) {
 (x - mean(x,na.rm=T)) / sd(x,na.rm=T)
}
#
# ===================================================================
# ===== Create an SD plot
# ===================================================================
#
SDplot = function(X,Y) {
 sdX = sd(X,na.rm=T) # Compute x SD
 sdY = sd(Y, na.rm=T) # Compute y SD
 muX = mean(X,na.rm=T)
 muY = mean(Y,na.rm=T)
 a = muY - sdY/sdX * muX # Get y-intercept for SD line
 #
 par(pty="s")  # Keep box square
 plot(Y ~ X, asp = (sdX/sdY),pch=16,cex=0.7,col="bisque")
 abline(a = a, b = sdY/sdX, lty=4) # Plot SD line
 abline(v = muX,lty = 3, col="grey") # Plot SDx = 0
 abline(h = muY,lty = 3, col="grey") # Plot SDy = 0
}
#
# ===================================================================
# ===== Box Cox transformation
# ===================================================================
#
BoxCox = function(x, p) {
 #
 # x: Number (or vector) to be re-expressed.
 # p: Power.
 #
 if(p == 0) {
 log(x)
 } else {
 # (x^p - 1)/p, up to a recentering and rescaling:
 if (p < 0) {
 -(x^p)
 } else {
 x^p
 }
 }
}

# ===================================================================
# ===== Transformation function
# ===================================================================
trans = function(asFields){
 # The powers to examine.
 pp = c(2, 1, 1/2, 0.33, 0, -0.33, -1/2, -1, -2)
 #
 # Output parameters
 sColor = "bisque" # Histogram bar color (RGB in hexadecimal)
 xLwd = 2 # Weight of the smoothed lines
 nBreaks = 40 # Number of cells for the histograms
 sPrefix = "transf_" # Prefix for output file names
 sMetaFormat = "png" # Metafile format specifier
 # ("wmf", "emf", "png", "jpg", "jpeg", "bmp",
 # "tif", "tiff", "ps", "eps", "pdf")
 #
 # Compute dimensions of the array of histograms.
 nRows = max(1,floor(length(pp)^(1/2)))
 nCols = ceiling(length(pp)/nRows)
 win.graph(width=1000,height=900)
 mfrowOld = par(mfrow=c(nRows, nCols)) # Saves the setting
 #
 # Create a "ladder" for each specified field.
 for (sField in asFields) {
 #
 # Extract non-missing values into the vector x.
 x = eval(as.name(sField))
 x = x[!is.na(x)]
 #
 # Create a "ladder" of histograms.
 for (p in pp) {
 z = BoxCox(x, p)
 hist(z, breaks=nBreaks, probability=TRUE, col=sColor, main=paste(sField, ": ", p))
 lines(density(z), lwd=xLwd)
 }
 #
 # Save it to a file.
 sFileName = paste(sPrefix, sField, ".", sMetaFormat, sep="")
 savePlot(filename=sFileName, type=sMetaFormat)
 }
 par(mfrow=mfrowOld)
}