UPDATE 10/18/2011:
Thanks to some of the comments, I was able to pare this down using R’s read.fwf() function. Here’s the new version.
# makeData_campaignFinance_v1_1.R -- copyright 10.18.2011, christopher compeau (email: my last name aht gmail dot com)
# thanks to the commentors on swordofcrom.wordpress.com for their help with read.fwf()
# use as you please but please attribute credit to christopher compeau if you publish anything
# the use of the FEC campaign finance data is subject to the rules on the FEC website
# have fun my babies. bonus points if you get yourself on some conrgessional campaign's shit list.
# this uses the 2011-2012 detailed discoloser data files at http://www.fec.gov/finance/disclosure/ftpdet.shtml
# still to be done: write tools for amended individual contributions files and other stuff as yet undiscovered.
# overpunch tool
overpunch = function(x) {
# remove leading zeroes
amount = sub("^0+","",x)
sign = rep(1,length(x))
changeChar = c(
expression(sub("\\[$","0",amount)),
expression(sub("\\]$","0",amount)),
expression(sub("[{}]$","0",amount)),
expression(sub("[AJ]$","1",amount)),
expression(sub("[BK]$","2",amount)),
expression(sub("[CL]$","3",amount)),
expression(sub("[DM]$","4",amount)),
expression(sub("[EN]$","5",amount)),
expression(sub("[FO]$","6",amount)),
expression(sub("[GP]$","7",amount)),
expression(sub("[HQ]$","8",amount)),
expression(sub("[IR]$","9",amount))
)
changes1 = grep("\\]$",amount)
changes2 = grep("[JKLMNOPQR}]$",amount)
sign[c(changes1,changes2)] = -1
for (i in 1:length(changeChar)) {
amount = eval(changeChar[i])
}
holder = as.numeric(sign) * as.numeric(amount)
return(holder)
}
# Committee Master File
writeLines(iconv(sub("\t","/t",readLines("~/Projects/campaign_finance/data/raw/committeeMaster_2011_2012.dta")),from="ASCII",to="UTF8"),"~/Projects/campaign_finance/data/preprocessed/committeeMaster_2011_2012_UTF8.dta")
cmteeMasterNames = c("cmID","cmNAME","treasurer","streetOne","streetTwo","cityTown","state","zip","cmDESIG","cmTYPE","cmPARTY","fileFreq","groupCategory","orgName","candidateID")
cmteeMaster = read.fwf("~/Projects/campaign_finance/data/preprocessed/committeeMaster_2011_2012_UTF8.dta",c(9,90,38,34,34,18,2,5,1,1,3,1,1,38,9),comment.char="",strip.white=TRUE,col.names=cmteeMasterNames)
# Candidate Master File
writeLines(iconv(sub("\t","/t",readLines("~/Projects/campaign_finance/data/raw/candidateMaster_2011_2012.dta")),from="ASCII",to="UTF8"),"~/Projects/campaign_finance/data/preprocessed/candidateMaster_2011_2012_UTF8.dta")
candMasterNames = c('cndID','cndName','partyDesig1','filler1','partyDesig3','seatStatus','filler2','candidateStatus','streetOne','streetTwo','cityTown','state','zip','principalCommID','electionYear','currentDistrict')
candMaster = read.fwf(file="~/Projects/campaign_finance/data/preprocessed/candidateMaster_2011_2012_UTF8.dta",c(9,38,3,3,3,1,1,1,34,34,18,2,5,9,2,2),comment.char="",strip.white=TRUE,col.names=candMasterNames)
# Individual Contributions
writeLines(iconv(sub("\t","/t",readLines("~/Projects/campaign_finance/data/raw/individualContributions_2011_2012.dta")),from="ASCII",to="UTF8"),"~/Projects/campaign_finance/data/preprocessed/individualContributions_2011_2012_UTF8.dta")
individualNames = c('filerID','amendIndicator','reportType','primaryGeneral','microfilmLocation','transactionType','contributorName','cityTown','state','zip','occupation','month','transactionDay','transactionCentury','transactionYear','amount','otherID','fecRecord')
individual = read.fwf(file="~/Projects/campaign_finance/data/preprocessed/individualContributions_2011_2012_UTF8.dta",c(9,1,3,1,11,3,34,18,2,5,35,2,2,2,2,7,9,7),comment.char="",strip.white=TRUE,col.names=individualNames)
individual$amount = overpunch(individual$amount)
# Contributions from Committees
writeLines(iconv(sub("\t","/t",readLines("~/Projects/campaign_finance/data/raw/candidatesFromCommittees_2011_2012.dta")),from="ASCII",to="UTF8"),"~/Projects/campaign_finance/data/preprocessed/candidatesFromCommittees_2011_2012_UTF8.dta")
candFromCommitteesNames = c('filerID','amendIndicator','reportType','primaryGeneral','microfilmLocation','transactionType','transactionMonth','transactionDay','transactionCentury','transactionYear','amount','otherID','candidateID','fecRecord')
candFromCommittees = read.fwf(file="~/Projects/campaign_finance/data/preprocessed/candidatesFromCommittees_2011_2012_UTF8.dta",c(9,1,3,1,11,3,2,2,2,2,7,9,9,7),comment.char="", strip.white=TRUE, col.names=candFromCommitteesNames)
candFromCommittees$amount = overpunch(candFromCommittees$amount)
# Transaction from committee to another
writeLines(iconv(sub("\t","/t",readLines("~/Projects/campaign_finance/data/raw/committeeToCommittee_2011_2012.dta")),from="ASCII",to="UTF8"),"~/Projects/campaign_finance/data/preprocessed/committeeToCommittee_2011_2012_UTF8.dta")
commToCommNames = c('filerID','amendIndicator','reportType','primaryGeneral','microfilmLocation','transactionType','contributorName','cityTown','state','zip','occupation','month','transactionDay','transactionCentury','transactionYear','amount','otherID','fecRecord')
commToComm = read.fwf(file="~/Projects/campaign_finance/data/preprocessed/committeeToCommittee_2011_2012_UTF8.dta",c(9,1,3,1,11,3,34,18,2,5,35,2,2,2,2,7,9,7),comment.char="", strip.white=TRUE, col.names=commToCommNames)
commToComm$amount = overpunch(commToComm$amount)
ORIGINAL POST 10/17/2011:
For my first contribution to the blog, I wanted to make some kind of enlightening visualization of campaign finance disclosure data from the Federal Election Commission’s website. It looks like they’re working on some new, easy-to-use data dumps here, but I decided to try to use the more detailed data files here because I couldn’t really tell the difference between the two data pages, and as a rule I always of for the most granular unaggregated data when I have a choice.
Anyway, the FEC dumps the data in some weird fixed-width COBOL format that kept me from using any of the read.delim functions to get the data into R, so I had to write a bunch of little parsing functions for each data file. I spent all day yesterday on these little helpers and I haven’t yet had the opportunity to do anything interesting with the data, so I decided that I would just post the code and work on some visualizations later this week.
So in summary, this code makes each of the FEC data dump file into R data frames:
- Committee Master File: cmteeMaster
- Candidate Master File: candMaster
- Individual Contributions: individuals
- Contributions to Candidates from Committees: candFromCommittees
- Transactions between Committees: commToComm
# makeData_campaignFinance_v1_0.R -- copyright 10.17.2011, christopher compeau (email: my last name aht gmail dot com)
# use as you please but please attribute credit to christopher compeau if you publish anything
# the use of the FEC campaign finance data is subject to the rules on the FEC website
# have fun my babies. bonus points if you get yourself on some conrgessional campaign's shit list.
# this uses the 2011-2012 detailed discoloser data files at http://www.fec.gov/finance/disclosure/ftpdet.shtml
# still to be done: write tools for amended individual contributions files and other stuff as yet undiscovered.
# RAW DATA FILE PARSING TOOLS
trim.trailing <- function (x) {sub("\\s+$", "", x)}
# committee master file
cmMaster = function(line) {
cmID = substr(line,1,9)
cmNAME = substr(line,10,99)
treasurer = substr(line,100,137)
streetOne = substr(line,138,171)
streetTwo = substr(line,172,205)
cityTown = substr(line,206,223)
state = substr(line,224,225)
zip = substr(line,226,230)
cmDESIG = substr(line,231,231)
cmTYPE = substr(line,232,232)
cmPARTY = substr(line,233,235)
fileFreq = substr(line,236,236)
groupCategory = substr(line,237,237)
orgName = substr(line,238,275)
candidateID = substr(line,276,284)
record = c(cmID,cmNAME,treasurer,streetOne,streetTwo,cityTown,state,zip,cmDESIG,cmTYPE,cmPARTY,fileFreq,groupCategory,orgName,candidateID)
for (i in 1:length(record)) {
record[i] = trim.trailing(record[i])
}
return(record)
}
# candidate master file
candMaster = function(line) {
cndID = substr(line,1,9)
cndName = substr(line,10,47)
partyDesig1 = substr(line,48,50)
filler1 = substr(line,51,53)
partyDesig3 = substr(line,54,56)
seatStatus = substr(line,57,57)
filler2 = substr(line,58,58)
candidateStatus = substr(line,59,59)
streetOne = substr(line,60,93)
streetTwo = substr(line,94,127)
cityTown = substr(line,128,145)
state = substr(line,146,147)
zip = substr(line,148,152)
principalCommID = substr(line,153,161)
electionYear = substr(line,162,163)
currentDistrict = substr(line,164,165)
record = c(cndID,cndName,partyDesig1,filler1,seatStatus,filler2,candidateStatus,streetOne,streetTwo,cityTown,state,zip,principalCommID,electionYear,currentDistrict)
for (i in 1:length(record)) {
record[i] = trim.trailing(record[i])
}
return(record)
}
# indivudual candidate contributions, committee to committe transactions
indAndComContribution = function(line) {
filerID = substr(line,1,9)
amendIndicator = substr(line,10,10)
reportType = substr(line,11,13)
primaryGeneral = substr(line,14,14)
microfilmLocation = substr(line,15,25)
transactionType = substr(line,26,28)
contributorName = substr(line,29,62)
cityTown = substr(line,63,80)
state = substr(line,81,82)
zip = substr(line,83,87)
occupation = substr(line,88,122)
month = substr(line,123,124)
transactionDay = substr(line,125,126)
transactionCentury = substr(line,127,128)
transactionYear = substr(line,129,130)
amount = substr(line,131,137)
otherID = substr(line,138,146)
fecRecord = substr(line,147,153)
record = c(filerID,amendIndicator,reportType,primaryGeneral,microfilmLocation,transactionType,contributorName,cityTown,state,zip,occupation,month,transactionDay,transactionCentury,transactionYear,amount,otherID,fecRecord)
for (i in 1:length(record)) {
record[i] = trim.trailing(record[i])
}
return(record)
}
# contributions to candidate from committees
candComContibution = function(line) {
filerID = substr(line,1,9)
amendIndicator = substr(line,10,10)
reportType = substr(line,11,13)
primaryGeneral = substr(line,14,14)
microfilmLocation = substr(line,15,25)
transactionType = substr(line,26,28)
transactionMonth = substr(line,29,30)
transactionDay = substr(line,31,32)
transactionCentury = substr(line,33,34)
transactionYear = substr(line,35,36)
amount = substr(line,37,43)
otherID = substr(line,44,52)
candidateID = substr(line,53,61)
fecRecord = substr(line,62,68)
record = c(filerID,amendIndicator,reportType,primaryGeneral,microfilmLocation,transactionType,transactionMonth,transactionDay,transactionCentury,transactionYear,amount,otherID,candidateID,fecRecord)
for (i in 1:length(record)) {
record[i] = trim.trailing(record[i])
}
return(record)
}
# overpunch tool
overpunch = function(x) {
# remove leading zeroes
amount = sub("^0+","",x)
sign = rep(1,length(x))
changeChar = c(
expression(sub("\\[$","0",amount)),
expression(sub("\\]$","0",amount)),
expression(sub("[{}]$","0",amount)),
expression(sub("[AJ]$","1",amount)),
expression(sub("[BK]$","2",amount)),
expression(sub("[CL]$","3",amount)),
expression(sub("[DM]$","4",amount)),
expression(sub("[EN]$","5",amount)),
expression(sub("[FO]$","6",amount)),
expression(sub("[GP]$","7",amount)),
expression(sub("[HQ]$","8",amount)),
expression(sub("[IR]$","9",amount))
)
changes1 = grep("\\]$",amount)
changes2 = grep("[JKLMNOPQR}]$",amount)
sign[c(changes1,changes2)] = -1
for (i in 1:length(changeChar)) {
amount = eval(changeChar[i])
}
holder = as.numeric(sign) * as.numeric(amount)
return(holder)
}
# function using parsing tools to make data frames
# 'expsn' is an unevaluated expression for each parsing tool
# some raw data records are not the length stated in data docs
mkDataFrame = function(data,lineLength,columnNames,expsn) {
properData = data[nchar(data, allowNA=TRUE)==lineLength]
nRecords = length(properData)
finalMatrix = matrix(nrow=length(properData),ncol=length(columnNames))
for (i in 1:nRecords) {
result = eval(expsn)
finalMatrix[i,] = result
}
finalDF = as.data.frame(finalMatrix)
names(finalDF) = columnNames
return(finalDF)
}
# Now use parsing tools to read data into dataframes
# Committee Master File
cmteeMasterRaw = read.delim(file="~/Projects/campaign_finance/data/committeeMaster_2011_2012.dta", header=FALSE, sep="\n")
cmteeMasterRaw = as.character(cmteeMasterRaw[,1])
cmteeMasterNames = c("cmID","cmNAME","treasurer","streetOne","streetTwo","cityTown","state","zip","cmDESIG","cmTYPE","cmPARTY","fileFreq","groupCategory","orgName","candidateID")
cmteeMaster = mkDataFrame(cmteeMasterRaw,284,cmteeMasterNames,expression(cmMaster(properData[i])))
# Candidate Master File
candMasterRaw = read.delim(file="~/Projects/campaign_finance/data/candidateMaster_2011_2012.dta", header=FALSE, sep="\n")
candMasterRaw = as.character(candMasterRaw[,1])
candMasterNames = c('cndID','cndName','partyDesig1','filler1','seatStatus','filler2','candidateStatus','streetOne','streetTwo','cityTown','state','zip','principalCommID','electionYear','currentDistrict')
candMaster = mkDataFrame(candMasterRaw,165,candMasterNames,expression(candMaster(properData[i])))
# Individual Contributions
individualRaw = read.delim(file="~/Projects/campaign_finance/data/individualContributions_2011_2012.dta", header=FALSE,sep="\n")
individualRaw = as.character(individualRaw[,1])
individualNames = c('filerID','amendIndicator','reportType','primaryGeneral','microfilmLocation','transactionType','contributorName','cityTown','state','zip','occupation','month','transactionDay','transactionCentury','transactionYear','amount','otherID','fecRecord')
individuals = mkDataFrame(individualRaw,153,individualNames,expression(indAndComContribution(properData[i])))
individuals$amount = overpunch(individuals$amount)
# Contributions from Committees
candFromCommitteesRaw = read.delim(file="~/Projects/campaign_finance/data/candidatesFromCommittees_2011_2012.dta", header=FALSE, sep="\n")
candFromCommitteesRaw = as.character(candFromCommitteesRaw[,1])
candFromCommitteesNames = c('filerID','amendIndicator','reportType','primaryGeneral','microfilmLocation','transactionType','transactionMonth','transactionDay','transactionCentury','transactionYear','amount','otherID','candidateID','fecRecord')
candFromCommittees = mkDataFrame(candFromCommitteesRaw,68,candFromCommitteesNames,expression(candComContibution(properData[i])))
candFromCommittees$amount = overpunch(candFromCommittees$amount)
# Transaction from committee to another
commToCommRaw = read.delim(file="~/Projects/campaign_finance/data/comitteeToCommittee_2011_2012.dta", header=FALSE, sep="\n")
commToCommRaw = as.character(commToCommRaw[,1])
commToCommNames = c('filerID','amendIndicator','reportType','primaryGeneral','microfilmLocation','transactionType','contributorName','cityTown','state','zip','occupation','month','transactionDay','transactionCentury','transactionYear','amount','otherID','fecRecord')
commToComm = mkDataFrame(commToCommRaw,153,commToCommNames,expression(indAndComContribution(properData[i])))
commToComm$amount = overpunch(commToComm$amount)