Show me your WAR face!

Below is a chart of the top 20 offensive players based on FanGraphs WAR for the 2011 season.  The various features and their corresponding metrics are clear in the image. I’ve also included the leader and last place for each metric to get an idea of what the extremes would look like, as it’s all normalized.  For example, Jose Reyes’s 7 home runs this season give him a very narrow face compared to Jose Bautista’s double wide.  (This is highly derivative and I’m painfully aware of this, but I really wanted to play with the Chernoff faces function available in the aplpack R library.)

Advertisements

R Tools for FEC Campaign Finance Disclosure Data

UPDATE 10/18/2011:

Thanks to some of the comments, I was able to pare this down using R’s read.fwf() function. Here’s the new version.


# makeData_campaignFinance_v1_1.R -- copyright 10.18.2011, christopher compeau (email: my last name aht gmail dot com)
# thanks to the commenters on swordofcrom.wordpress.com for their help with read.fwf()

# use as you please but please attribute credit to christopher compeau if you publish anything
# the use of the FEC campaign finance data is subject to the rules on the FEC website
# have fun my babies. bonus points if you get yourself on some congressional campaign's shit list.

# this uses the 2011-2012 detailed disclosure data files at http://www.fec.gov/finance/disclosure/ftpdet.shtml
# still to be done: write tools for amended individual contributions files and other stuff as yet undiscovered.

# overpunch tool
#
# Decode fixed-width amount fields whose last character carries both the
# final digit and the sign of the number ("overpunch" notation).
#
# x: vector of raw amount strings (anything as.character() can coerce).
# Returns a numeric vector the same length as x.
#
# Trailing-character mapping implemented here:
#   "[" "{"  ->  0          "]" "}"  -> -0
#   A .. I   ->  1 .. 9     J .. R   -> -1 .. -9
# A plain trailing digit is left as is and treated as non-negative.
overpunch = function(x) {
  amount = as.character(x)

  # the sign has to be read before the trailing character is rewritten
  sign = rep(1, length(amount))
  sign[grepl("\\]$", amount) | grepl("[JKLMNOPQR}]$", amount)] = -1

  # regex for the trailing character -> the digit it stands for
  trailing = c("\\[$", "\\]$", "[{}]$", "[AJ]$", "[BK]$", "[CL]$", "[DM]$",
               "[EN]$", "[FO]$", "[GP]$", "[HQ]$", "[IR]$")
  digit = c("0", "0", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9")
  for (k in seq_along(trailing)) {
    amount = sub(trailing[k], digit[k], amount)
  }

  # as.numeric() copes with leading zeroes by itself; stripping them first
  # turned an all-zero field such as "0000000" into "" and therefore into NA
  sign * as.numeric(amount)
}

# Committee Master File
# read.fwf() uses a tab as its internal field separator, so literal tabs in the
# raw file have to go. Every tab is swapped for ONE space so that each
# fixed-width column stays at its documented position (a two-character
# replacement would push all later columns one place to the right).
cmteeMasterLines = readLines("~/Projects/campaign_finance/data/raw/committeeMaster_2011_2012.dta")
cmteeMasterLines = gsub("\t", " ", cmteeMasterLines, fixed = TRUE)
# sub = "?" keeps a record that contains non-ASCII bytes (one "?" per byte)
# instead of letting iconv() turn the whole line into NA
cmteeMasterLines = iconv(cmteeMasterLines, from = "ASCII", to = "UTF-8", sub = "?")
writeLines(cmteeMasterLines, "~/Projects/campaign_finance/data/preprocessed/committeeMaster_2011_2012_UTF8.dta")
cmteeMasterNames = c("cmID","cmNAME","treasurer","streetOne","streetTwo","cityTown","state","zip","cmDESIG","cmTYPE","cmPARTY","fileFreq","groupCategory","orgName","candidateID")
cmteeMaster = read.fwf(
  "~/Projects/campaign_finance/data/preprocessed/committeeMaster_2011_2012_UTF8.dta",
  widths = c(9,90,38,34,34,18,2,5,1,1,3,1,1,38,9),
  comment.char = "", strip.white = TRUE, col.names = cmteeMasterNames
)
  
# Candidate Master File
# read.fwf() uses a tab as its internal field separator, so literal tabs in the
# raw file have to go. Every tab is swapped for ONE space so that each
# fixed-width column stays at its documented position.
candMasterLines = readLines("~/Projects/campaign_finance/data/raw/candidateMaster_2011_2012.dta")
candMasterLines = gsub("\t", " ", candMasterLines, fixed = TRUE)
# sub = "?" keeps a record that contains non-ASCII bytes (one "?" per byte)
# instead of letting iconv() turn the whole line into NA
candMasterLines = iconv(candMasterLines, from = "ASCII", to = "UTF-8", sub = "?")
writeLines(candMasterLines, "~/Projects/campaign_finance/data/preprocessed/candidateMaster_2011_2012_UTF8.dta")
candMasterNames = c('cndID','cndName','partyDesig1','filler1','partyDesig3','seatStatus','filler2','candidateStatus','streetOne','streetTwo','cityTown','state','zip','principalCommID','electionYear','currentDistrict')
candMaster = read.fwf(
  file = "~/Projects/campaign_finance/data/preprocessed/candidateMaster_2011_2012_UTF8.dta",
  widths = c(9,38,3,3,3,1,1,1,34,34,18,2,5,9,2,2),
  comment.char = "", strip.white = TRUE, col.names = candMasterNames
)
  
# Individual Contributions
# read.fwf() uses a tab as its internal field separator, so literal tabs in the
# raw file have to go. Every tab is swapped for ONE space so that each
# fixed-width column stays at its documented position.
individualLines = readLines("~/Projects/campaign_finance/data/raw/individualContributions_2011_2012.dta")
individualLines = gsub("\t", " ", individualLines, fixed = TRUE)
# sub = "?" keeps a record that contains non-ASCII bytes (one "?" per byte)
# instead of letting iconv() turn the whole line into NA
individualLines = iconv(individualLines, from = "ASCII", to = "UTF-8", sub = "?")
writeLines(individualLines, "~/Projects/campaign_finance/data/preprocessed/individualContributions_2011_2012_UTF8.dta")
individualNames = c('filerID','amendIndicator','reportType','primaryGeneral','microfilmLocation','transactionType','contributorName','cityTown','state','zip','occupation','month','transactionDay','transactionCentury','transactionYear','amount','otherID','fecRecord')
individual = read.fwf(
  file = "~/Projects/campaign_finance/data/preprocessed/individualContributions_2011_2012_UTF8.dta",
  widths = c(9,1,3,1,11,3,34,18,2,5,35,2,2,2,2,7,9,7),
  comment.char = "", strip.white = TRUE, col.names = individualNames
)
# the amount column is still in overpunch notation at this point
individual$amount = overpunch(individual$amount)
  
# Contributions from Committees
# read.fwf() uses a tab as its internal field separator, so literal tabs in the
# raw file have to go. Every tab is swapped for ONE space so that each
# fixed-width column stays at its documented position.
candFromCommitteesLines = readLines("~/Projects/campaign_finance/data/raw/candidatesFromCommittees_2011_2012.dta")
candFromCommitteesLines = gsub("\t", " ", candFromCommitteesLines, fixed = TRUE)
# sub = "?" keeps a record that contains non-ASCII bytes (one "?" per byte)
# instead of letting iconv() turn the whole line into NA
candFromCommitteesLines = iconv(candFromCommitteesLines, from = "ASCII", to = "UTF-8", sub = "?")
writeLines(candFromCommitteesLines, "~/Projects/campaign_finance/data/preprocessed/candidatesFromCommittees_2011_2012_UTF8.dta")
candFromCommitteesNames = c('filerID','amendIndicator','reportType','primaryGeneral','microfilmLocation','transactionType','transactionMonth','transactionDay','transactionCentury','transactionYear','amount','otherID','candidateID','fecRecord')
candFromCommittees = read.fwf(
  file = "~/Projects/campaign_finance/data/preprocessed/candidatesFromCommittees_2011_2012_UTF8.dta",
  widths = c(9,1,3,1,11,3,2,2,2,2,7,9,9,7),
  comment.char = "", strip.white = TRUE, col.names = candFromCommitteesNames
)
# the amount column is still in overpunch notation at this point
candFromCommittees$amount = overpunch(candFromCommittees$amount)

# Transactions from one committee to another
# read.fwf() uses a tab as its internal field separator, so literal tabs in the
# raw file have to go. Every tab is swapped for ONE space so that each
# fixed-width column stays at its documented position.
commToCommLines = readLines("~/Projects/campaign_finance/data/raw/committeeToCommittee_2011_2012.dta")
commToCommLines = gsub("\t", " ", commToCommLines, fixed = TRUE)
# sub = "?" keeps a record that contains non-ASCII bytes (one "?" per byte)
# instead of letting iconv() turn the whole line into NA
commToCommLines = iconv(commToCommLines, from = "ASCII", to = "UTF-8", sub = "?")
writeLines(commToCommLines, "~/Projects/campaign_finance/data/preprocessed/committeeToCommittee_2011_2012_UTF8.dta")
commToCommNames = c('filerID','amendIndicator','reportType','primaryGeneral','microfilmLocation','transactionType','contributorName','cityTown','state','zip','occupation','month','transactionDay','transactionCentury','transactionYear','amount','otherID','fecRecord')
commToComm = read.fwf(
  file = "~/Projects/campaign_finance/data/preprocessed/committeeToCommittee_2011_2012_UTF8.dta",
  widths = c(9,1,3,1,11,3,34,18,2,5,35,2,2,2,2,7,9,7),
  comment.char = "", strip.white = TRUE, col.names = commToCommNames
)
# the amount column is still in overpunch notation at this point
commToComm$amount = overpunch(commToComm$amount)

ORIGINAL POST 10/17/2011:

For my first contribution to the blog, I wanted to make some kind of enlightening visualization of campaign finance disclosure data from the Federal Election Commission’s website. It looks like they’re working on some new, easy-to-use data dumps here, but I decided to try to use the more detailed data files here because I couldn’t really tell the difference between the two data pages, and as a rule I always go for the most granular unaggregated data when I have a choice.

Anyway, the FEC dumps the data in some weird fixed-width COBOL format that kept me from using any of the read.delim functions to get the data into R, so I had to write a bunch of little parsing functions for each data file. I spent all day yesterday on these little helpers and I haven’t yet had the opportunity to do anything interesting with the data, so I decided that I would just post the code and work on some visualizations later this week.

So in summary, this code turns each of the FEC data dump files into an R data frame:

  • Committee Master File: cmteeMaster
  • Candidate Master File: candMaster
  • Individual Contributions: individuals
  • Contributions to Candidates from Committees: candFromCommittees
  • Transactions between Committees: commToComm
This data is DIRTY, and it still needs a lot of work… this code just gets it into data frames. More to come.

# makeData_campaignFinance_v1_0.R -- copyright 10.17.2011, christopher compeau (email: my last name aht gmail dot com)

# use as you please but please attribute credit to christopher compeau if you publish anything
# the use of the FEC campaign finance data is subject to the rules on the FEC website
# have fun my babies. bonus points if you get yourself on some congressional campaign's shit list.

# this uses the 2011-2012 detailed disclosure data files at http://www.fec.gov/finance/disclosure/ftpdet.shtml
# still to be done: write tools for amended individual contributions files and other stuff as yet undiscovered.  

# RAW DATA FILE PARSING TOOLS

# Remove trailing whitespace from every element of x and return the result.
trim.trailing <- function(x) {
  trimmed <- sub(pattern = "\\s+$", replacement = "", x = x)
  return(trimmed)
}

# committee master file
#
# Split one fixed-width committee master record into its 15 fields.
#
# line: a single raw record; character positions 1-284 are read.
# Returns a character vector with the 15 fields in file order, each with its
# trailing whitespace removed by trim.trailing().
cmMaster = function(line) {
  cmID = substr(line, 1, 9)
  cmNAME = substr(line, 10, 99)
  treasurer = substr(line, 100, 137)
  streetOne = substr(line, 138, 171)
  streetTwo = substr(line, 172, 205)
  cityTown = substr(line, 206, 223)
  state = substr(line, 224, 225)
  zip = substr(line, 226, 230)
  cmDESIG = substr(line, 231, 231)
  cmTYPE = substr(line, 232, 232)
  cmPARTY = substr(line, 233, 235)
  fileFreq = substr(line, 236, 236)
  groupCategory = substr(line, 237, 237)
  orgName = substr(line, 238, 275)
  candidateID = substr(line, 276, 284)
  record = c(cmID, cmNAME, treasurer, streetOne, streetTwo, cityTown, state, zip,
             cmDESIG, cmTYPE, cmPARTY, fileFreq, groupCategory, orgName, candidateID)
  # trim.trailing() is vectorised, so one call trims every field; unlike a
  # 1:length(record) loop it also leaves an empty record empty
  trim.trailing(record)
}


# candidate master file
#
# Split one fixed-width candidate master record into 15 fields.
#
# line: a single raw record; character positions 1-165 are read.
# Returns a character vector with the fields in file order, each with its
# trailing whitespace removed by trim.trailing().
#
# NOTE(review): positions 54-56 are not part of the returned record, so the
# result has 15 elements rather than 16. Returning that field as well needs a
# matching extra column name wherever this parser is used with mkDataFrame().
candMaster = function(line) {
  cndID = substr(line, 1, 9)
  cndName = substr(line, 10, 47)
  partyDesig1 = substr(line, 48, 50)
  filler1 = substr(line, 51, 53)
  seatStatus = substr(line, 57, 57)
  filler2 = substr(line, 58, 58)
  candidateStatus = substr(line, 59, 59)
  streetOne = substr(line, 60, 93)
  streetTwo = substr(line, 94, 127)
  cityTown = substr(line, 128, 145)
  state = substr(line, 146, 147)
  zip = substr(line, 148, 152)
  principalCommID = substr(line, 153, 161)
  electionYear = substr(line, 162, 163)
  currentDistrict = substr(line, 164, 165)
  record = c(cndID, cndName, partyDesig1, filler1, seatStatus, filler2,
             candidateStatus, streetOne, streetTwo, cityTown, state, zip,
             principalCommID, electionYear, currentDistrict)
  # trim.trailing() is vectorised, so one call trims every field; unlike a
  # 1:length(record) loop it also leaves an empty record empty
  trim.trailing(record)
}

# individual candidate contributions, committee to committee transactions
#
# Split one fixed-width contribution/transaction record into its 18 fields.
#
# line: a single raw record; character positions 1-153 are read.
# Returns a character vector with the 18 fields in file order, each with its
# trailing whitespace removed by trim.trailing(). The amount field is returned
# as the raw string taken from positions 131-137.
indAndComContribution = function(line) {
  filerID = substr(line, 1, 9)
  amendIndicator = substr(line, 10, 10)
  reportType = substr(line, 11, 13)
  primaryGeneral = substr(line, 14, 14)
  microfilmLocation = substr(line, 15, 25)
  transactionType = substr(line, 26, 28)
  contributorName = substr(line, 29, 62)
  cityTown = substr(line, 63, 80)
  state = substr(line, 81, 82)
  zip = substr(line, 83, 87)
  occupation = substr(line, 88, 122)
  month = substr(line, 123, 124)
  transactionDay = substr(line, 125, 126)
  transactionCentury = substr(line, 127, 128)
  transactionYear = substr(line, 129, 130)
  amount = substr(line, 131, 137)
  otherID = substr(line, 138, 146)
  fecRecord = substr(line, 147, 153)
  record = c(filerID, amendIndicator, reportType, primaryGeneral,
             microfilmLocation, transactionType, contributorName, cityTown,
             state, zip, occupation, month, transactionDay, transactionCentury,
             transactionYear, amount, otherID, fecRecord)
  # trim.trailing() is vectorised, so one call trims every field; unlike a
  # 1:length(record) loop it also leaves an empty record empty
  trim.trailing(record)
}

# contributions to candidate from committees
#
# Split one fixed-width committee-to-candidate record into its 14 fields.
#
# line: a single raw record; character positions 1-68 are read.
# Returns a character vector with the 14 fields in file order, each with its
# trailing whitespace removed by trim.trailing(). The amount field is returned
# as the raw string taken from positions 37-43.
candComContibution = function(line) {
  filerID = substr(line, 1, 9)
  amendIndicator = substr(line, 10, 10)
  reportType = substr(line, 11, 13)
  primaryGeneral = substr(line, 14, 14)
  microfilmLocation = substr(line, 15, 25)
  transactionType = substr(line, 26, 28)
  transactionMonth = substr(line, 29, 30)
  transactionDay = substr(line, 31, 32)
  transactionCentury = substr(line, 33, 34)
  transactionYear = substr(line, 35, 36)
  amount = substr(line, 37, 43)
  otherID = substr(line, 44, 52)
  candidateID = substr(line, 53, 61)
  fecRecord = substr(line, 62, 68)
  record = c(filerID, amendIndicator, reportType, primaryGeneral,
             microfilmLocation, transactionType, transactionMonth,
             transactionDay, transactionCentury, transactionYear, amount,
             otherID, candidateID, fecRecord)
  # trim.trailing() is vectorised, so one call trims every field; unlike a
  # 1:length(record) loop it also leaves an empty record empty
  trim.trailing(record)
}


# overpunch tool
#
# Decode fixed-width amount fields whose last character carries both the
# final digit and the sign of the number ("overpunch" notation).
#
# x: vector of raw amount strings (anything as.character() can coerce).
# Returns a numeric vector the same length as x.
#
# Trailing-character mapping implemented here:
#   "[" "{"  ->  0          "]" "}"  -> -0
#   A .. I   ->  1 .. 9     J .. R   -> -1 .. -9
# A plain trailing digit is left as is and treated as non-negative.
overpunch = function(x) {
  amount = as.character(x)

  # the sign has to be read before the trailing character is rewritten
  sign = rep(1, length(amount))
  sign[grepl("\\]$", amount) | grepl("[JKLMNOPQR}]$", amount)] = -1

  # regex for the trailing character -> the digit it stands for
  trailing = c("\\[$", "\\]$", "[{}]$", "[AJ]$", "[BK]$", "[CL]$", "[DM]$",
               "[EN]$", "[FO]$", "[GP]$", "[HQ]$", "[IR]$")
  digit = c("0", "0", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9")
  for (k in seq_along(trailing)) {
    amount = sub(trailing[k], digit[k], amount)
  }

  # as.numeric() copes with leading zeroes by itself; stripping them first
  # turned an all-zero field such as "0000000" into "" and therefore into NA
  sign * as.numeric(amount)
}

# function using parsing tools to make data frames
#
# data:        character vector of raw records, one record per element
# lineLength:  exact number of characters a record must have to be parsed;
#              some raw data records are not the length stated in the data
#              docs, and those are dropped
# columnNames: names of the resulting columns; its length fixes the number of
#              fields every parsed record must supply
# expsn:       unevaluated expression for one of the parsing tools. It is
#              evaluated inside this function once per record, so it may refer
#              to the local variables `properData` (the kept records) and `i`
#              (the index of the current record), e.g.
#              expression(cmMaster(properData[i]))
#
# Returns a data frame with one row per kept record and one column per name.
mkDataFrame = function(data,lineLength,columnNames,expsn) {
  # nchar(allowNA=TRUE) can yield NA; which() drops those records instead of
  # letting an NA index insert NA records into properData
  properData = data[which(nchar(data, allowNA=TRUE)==lineLength)]
  nRecords = length(properData)
  finalMatrix = matrix(nrow=nRecords,ncol=length(columnNames))
  # `properData` and `i` must keep exactly these names: callers' expressions
  # reference them. seq_len() makes the loop a no-op when nothing was kept.
  for (i in seq_len(nRecords)) {
    result = eval(expsn)
    finalMatrix[i,] = result
  }
  finalDF = as.data.frame(finalMatrix)
  names(finalDF) = columnNames
  finalDF
}

# Now use parsing tools to read data into dataframes    
    
# Committee Master File
# readLines() returns every record verbatim, one string per line.
# read.delim(sep="\n") applies quote processing, so a double quote inside a
# record could swallow or merge lines.
cmteeMasterRaw = readLines("~/Projects/campaign_finance/data/committeeMaster_2011_2012.dta")
cmteeMasterNames = c("cmID","cmNAME","treasurer","streetOne","streetTwo","cityTown","state","zip","cmDESIG","cmTYPE","cmPARTY","fileFreq","groupCategory","orgName","candidateID")
# only records that are exactly 284 characters long are parsed
cmteeMaster = mkDataFrame(cmteeMasterRaw,284,cmteeMasterNames,expression(cmMaster(properData[i])))
  
# Candidate Master File
# readLines() returns every record verbatim, one string per line.
# read.delim(sep="\n") applies quote processing, so a double quote inside a
# record could swallow or merge lines.
candMasterRaw = readLines("~/Projects/campaign_finance/data/candidateMaster_2011_2012.dta")
# 15 names, matching the 15 fields the candMaster() parser returns
candMasterNames = c('cndID','cndName','partyDesig1','filler1','seatStatus','filler2','candidateStatus','streetOne','streetTwo','cityTown','state','zip','principalCommID','electionYear','currentDistrict')
# only records that are exactly 165 characters long are parsed
# NOTE(review): this assignment replaces the candMaster() parser function with
# the data frame of the same name, so this section cannot be run a second time
# without defining the function again.
candMaster = mkDataFrame(candMasterRaw,165,candMasterNames,expression(candMaster(properData[i])))
  
# Individual Contributions
# readLines() returns every record verbatim, one string per line.
# read.delim(sep="\n") applies quote processing, so a double quote inside a
# record could swallow or merge lines.
individualRaw = readLines("~/Projects/campaign_finance/data/individualContributions_2011_2012.dta")
individualNames = c('filerID','amendIndicator','reportType','primaryGeneral','microfilmLocation','transactionType','contributorName','cityTown','state','zip','occupation','month','transactionDay','transactionCentury','transactionYear','amount','otherID','fecRecord')
# only records that are exactly 153 characters long are parsed
individuals = mkDataFrame(individualRaw,153,individualNames,expression(indAndComContribution(properData[i])))
# the amount column is still in overpunch notation at this point
individuals$amount = overpunch(individuals$amount)
  
# Contributions from Committees
# readLines() returns every record verbatim, one string per line.
# read.delim(sep="\n") applies quote processing, so a double quote inside a
# record could swallow or merge lines.
candFromCommitteesRaw = readLines("~/Projects/campaign_finance/data/candidatesFromCommittees_2011_2012.dta")
candFromCommitteesNames = c('filerID','amendIndicator','reportType','primaryGeneral','microfilmLocation','transactionType','transactionMonth','transactionDay','transactionCentury','transactionYear','amount','otherID','candidateID','fecRecord')
# only records that are exactly 68 characters long are parsed
candFromCommittees = mkDataFrame(candFromCommitteesRaw,68,candFromCommitteesNames,expression(candComContibution(properData[i])))
# the amount column is still in overpunch notation at this point
candFromCommittees$amount = overpunch(candFromCommittees$amount)

# Transactions from one committee to another
# readLines() returns every record verbatim, one string per line.
# read.delim(sep="\n") applies quote processing, so a double quote inside a
# record could swallow or merge lines.
# NOTE(review): "comitteeToCommittee" is spelled with a single "m" in this
# path, unlike the other file names -- confirm against the file on disk.
commToCommRaw = readLines("~/Projects/campaign_finance/data/comitteeToCommittee_2011_2012.dta")
commToCommNames = c('filerID','amendIndicator','reportType','primaryGeneral','microfilmLocation','transactionType','contributorName','cityTown','state','zip','occupation','month','transactionDay','transactionCentury','transactionYear','amount','otherID','fecRecord')
# only records that are exactly 153 characters long are parsed
commToComm = mkDataFrame(commToCommRaw,153,commToCommNames,expression(indAndComContribution(properData[i])))
# the amount column is still in overpunch notation at this point
commToComm$amount = overpunch(commToComm$amount)