# Distribution of mortgage loan amounts in 2015

IN THIS POST I WANT TO CREATE some data visualizations with R using the recently released Home Mortgage Disclosure Act (HMDA) data.

For this post I’m going to return to the 2015 HMDA that you can get from the Consumer Financial Protection Bureau (CFPB) webpage and I discussed earlier.

Check out my prior post for more discussion of how we build these data visualizations.

R code for graphs posted below

## Mortgage loan sizes in California

First, we’ll update my graph from the earlier post to show the distribution of loan sizes in California in 2015. Many loans in California are over $1 million due to high home values, particularly on the coast. The graph below looks at the share of loans with a loan amount greater than$1 million. st.list<-unique(mydata[st.fips !=“72”,]$st.fips) # The next step is to make a function that generates the composite plot based on a state FIPS number: myplot<-function(i){ c.list<-unique(mydata[st.fips ==st.list[i]]$fips) # all counties within selected state [i] smap<-subset(smap.all, id %in% st.list[i]) # subset state map cmap<-subset(cmap.all, fips %in% c.list) # subset county map

#state label st.label<-unique(fips.look[st.fips==as.numeric(st.list[i])]$state_abbr) # graph 1: map (as above, but only including subset) g1<- ggplot() + geom_map(data = cmap, map = cmap, aes(x = long, y = lat, map_id = id), color = “#2b2b2b”, size = 0.05, fill = NA) + geom_map(data = counties@data, map = cmap, aes(fill =log(upb.med), map_id = fips), color = NA) + geom_map(data = smap, map = smap, aes(x = long, y = lat, map_id = id), color = “black”, size = 1.05, fill = NA) + theme_map( base_size = 12) + theme(plot.title=element_text( size = 16, margin=margin(b=10))) + theme(plot.subtitle=element_text(size = 14, margin=margin(b=-20))) + theme(plot.caption=element_text(size = 9, margin=margin(t=-15),hjust=0)) + coord_proj(us_laea_proj) + labs(y=“Loan Amount,$”,x=“Loan Purpose”, title=paste(“Median loan amount by county in”, unique(fips.look[st.fips==as.numeric(st.list[i])]$state_abbr)))+ scale_fill_viridis(name=“Median Loan Amount\n$, log scale\n”, discrete=F,option=“D”,end=0.95,direction=-1,limits=c(log(10000),log(1.4e6)), breaks=c(log(10000),log(100000),log(1e6)), labels=c(“$10,000”,“$100,000”,“$1,000,000”) )+ theme(legend.position = “right”) #plot data: # Prepare data: select only data in the state (derived from c.list) pdata<-county.sum[fips %in% c.list] pdata2<-mydata[fips %in% c.list,.SD[sample(.N,min(.N,1000))],by = msamd_name ] #subsample metro data # See note: sample by groups # http://stackoverflow.com/questions/27325656/how-do-you-sample-groups-in-a-data-table-with-a-caveat pdata2[msamd_name==“”,msamd_name:=“Non-metro”] #rename missing metros to “Non-Metro” # pdata2[,.N,by=msamd_name] # Can run this to check how many obs we have per metro pdata3<-mydata[fips %in% c.list][sample(.N,1000)] #subsample state data # graph 2: upb distribution for entire state g2<- ggplot(data=pdata2,aes(y=“”,x=upb,color=log(upb)))+ geom_quasirandom(alpha=0.5,size=0.35)+ theme_minimal()+ scale_color_viridis(name=“Loan Amount\n$,log scale\n”,discrete=F,option=“D”,end=0.95,direction=-1, limits=c(log(10000),log(1.4e6)), breaks=c(log(10000),log(100000),log(1e6)), labels=c(“$10k”,“$100k”,“$1,000k”) ) + scale_x_log10(limits=c(10000,1.4e6),breaks=c(10000,100000,1000000), labels=c(“$10k”,“$100k”,“$1,000k”) )+ theme(plot.title=element_text(size=14))+ theme(plot.caption=element_text(hjust=0,vjust=1,margin=margin(t=10)))+ theme(plot.margin=unit(c(0.25,0.25,0.25,0.25),“cm”))+ theme(legend.position = “none”)+ labs(y=“”,x=“Loan Amount ($, log scale)”, caption=”\n@lenkiefer Source: CFPB, FFIEC, Home Mortgage Disclosure Act (HMDA) data\nIncludes all home purchase, home improvement, and refinance loans for 1-4 family dwellings originated in 2015.“, title=paste(“Loan size distribution by Metro in”, unique(fips.look[st.fips==as.numeric(st.list[i])]$state_abbr)))+ theme(axis.text.x = element_text(size=6))+ facet_wrap(~msamd_name)+theme(strip.text.x = element_text(size = 6))

# graph 3: upb distribution by county (using a subsample of 1000 obs) g3<- ggplot(data=pdata3,aes(y=“”,x=upb,color=log(upb)))+geom_quasirandom(alpha=0.5,size=0.75)+ theme_minimal()+ scale_color_viridis(name=“Loan Amount\n$,log scale\n”,discrete=F,option=“D”,end=0.95,direction=-1, limits=c(log(10000),log(1.4e6)), breaks=c(log(10000),log(100000),log(1e6)), labels=c(“$10k”,“$100k”,“$1,000k”) ) + scale_x_log10(limits=c(10000,1.4e6),breaks=c(10000,100000,1000000), labels=c(“$10k”,“$100k”,“$1,000k”) )+ theme(plot.title=element_text(size=14))+theme(plot.caption=element_text(hjust=0,vjust=1,margin=margin(t=10)))+ theme(plot.margin=unit(c(0.25,0.25,0.25,0.25),“cm”))+ theme(legend.position = “none”)+ labs(y=“”,x=“Loan Amount ($, log scale)”, title=paste(“Mortgage loan size distribution in”, unique(fips.look[st.fips==as.numeric(st.list[i])]$state_abbr)))+ facet_wrap(~state_name) m<-multiplot(g1,g2,g3,layout=matrix(c(1,3,2,2,2,2), nrow=3, byrow=TRUE)) # I’ve source the multiplot function above, and you can find it at: # http://www.cookbook-r.com/Graphs/Multiple_graphs_on_one_page_(ggplot2)/ } # Try it out for California: myplot(5) # compute the number of loans with loan amount greater than$1 Million

mydata[,upb.gt1m:=ifelse(upb>1000000,1,0)] ca.sums<-mydata[state_abbr==“CA”,list(.N,gt1m=sum(upb.gt1m)),by=msamd_name]

# make bar chart:

ggplot(data=ca.sums,aes(y=gt1m/N,x=reorder(msamd_name,gt1m/N)))+ theme_minimal()+theme(legend.position=“none”)+ geom_bar(stat=“identity”,aes(fill=gt1m/N))+ theme(axis.text.y = element_text(size=7))+ scale_y_continuous(label=percent)+theme(plot.caption=element_text(hjust=0))+ scale_fill_viridis(direction=-1,end=0.95)+coord_flip()+ labs(y=“Share of loans”,x=“”, title=“Share of mortgage loans with loan amount greater than \$1 Million”, subtitle=“Home purchase, home improvement and refinance loans on 1-4 family dwellings”, caption=“@lenkiefer Source: CFPB, FFIEC, Home Mortgage Disclosure Act (HMDA) data\nIncludes all home purchase, home improvement, and refinance loans for\n1-4 family dwellings originated in 2015.“)

# Try out composite plot for Texas:

myplot(48) {% endhighlight