A basic transformation and regression example using Census data
The data used in this example are 2010 ACS Census data of per capita income and fraction of population with a Bachelor’s degree or greater. The data file can be downloaded here.
# Before running the following lines, the user shoud run all
# functions found at the bottom of the page
#
# Read datafile
dat = read.csv("inc_ed.csv", header=T)
attach(dat)
# Generate a scatter plot of Y vs X where x and y scales
# are one SD unit in length. Plot the 45° SD line along
# with the mean vertical and horizontal lines
SDplot(percE_E,B19301_001E)
# Compute OLS
M1 = lm(B19301_001E ~ percE_E)
# Plot regression line
abline(M1,col="blue")
# Plot OLS results
OP = par(mfrow=c(2, 2))
plot(M1)
par(OP)
# The above assumes normally distributed data when in fact
# the data are not perfectly normal.
# Create a 'ladder' of transformations. The trans() function creates png files,
# so check the directory for image files after running trans().
trans(c("B19301_001E","percE_E"))
# Based on a visual analysis of the transformations,
# re-express the data as follows:
tY = BoxCox(B19301_001E,0.33)
tX = BoxCox(percE_E,0.5)
# Generate scatter plot of transformed data
SDplot(tX,tY)
# Rerun lm model with transformed data
M2 = lm(tY ~ tX)
# Plot regression line
abline(M2,col="blue")
# Plot lm diagnostic plots
OP = par(mfrow=c(2, 2))
plot(M2)
par(OP)
detach(dat)
######################################################################
##############
############## FUNCTIONS
##############
######################################################################
#
# ===================================================================
# ===== Standardization function
# ===================================================================
#
std = function(x,y) {
(x - mean(x,na.rm=T)) / sd(x,na.rm=T)
}
#
# ===================================================================
# ===== Create an SD plot
# ===================================================================
#
SDplot = function(X,Y) {
sdX = sd(X,na.rm=T) # Compute x SD
sdY = sd(Y, na.rm=T) # Compute y SD
muX = mean(X,na.rm=T)
muY = mean(Y,na.rm=T)
a = muY - sdY/sdX * muX # Get y-intercept for SD line
#
par(pty="s") # Keep box square
plot(Y ~ X, asp = (sdX/sdY),pch=16,cex=0.7,col="bisque")
abline(a = a, b = sdY/sdX, lty=4) # Plot SD line
abline(v = muX,lty = 3, col="grey") # Plot SDx = 0
abline(h = muY,lty = 3, col="grey") # Plot SDy = 0
}
#
# ===================================================================
# ===== Box Cox transformation
# ===================================================================
#
BoxCox = function(x, p) {
#
# x: Number (or vector) to be re-expressed.
# p: Power.
#
if(p == 0) {
log(x)
} else {
# (x^p - 1)/p, up to a recentering and rescaling:
if (p < 0) {
-(x^p)
} else {
x^p
}
}
}
# ===================================================================
# ===== Transformation function
# ===================================================================
trans = function(asFields){
# The powers to examine.
pp = c(2, 1, 1/2, 0.33, 0, -0.33, -1/2, -1, -2)
#
# Output parameters
sColor = "bisque" # Histogram bar color (RGB in hexadecimal)
xLwd = 2 # Weight of the smoothed lines
nBreaks = 40 # Number of cells for the histograms
sPrefix = "transf_" # Prefix for output file names
sMetaFormat = "png" # Metafile format specifier
# ("wmf", "emf", "png", "jpg", "jpeg", "bmp",
# "tif", "tiff", "ps", "eps", "pdf")
#
# Compute dimensions of the array of histograms.
nRows = max(1,floor(length(pp)^(1/2)))
nCols = ceiling(length(pp)/nRows)
win.graph(width=1000,height=900)
mfrowOld = par(mfrow=c(nRows, nCols)) # Saves the setting
#
# Create a "ladder" for each specified field.
for (sField in asFields) {
#
# Extract non-missing values into the vector x.
x = eval(as.name(sField))
x = x[!is.na(x)]
#
# Create a "ladder" of histograms.
for (p in pp) {
z = BoxCox(x, p)
hist(z, breaks=nBreaks, probability=TRUE, col=sColor, main=paste(sField, ": ", p))
lines(density(z), lwd=xLwd)
}
#
# Save it to a file.
sFileName = paste(sPrefix, sField, ".", sMetaFormat, sep="")
savePlot(filename=sFileName, type=sMetaFormat)
}
par(mfrow=mfrowOld)
}