########################################################################################################## # Generating Gross Output and Wages by Size and Legal Form of Organization # Code used for estimates in the article, "A New Understanding of Small Businesses: Wages and Gross Output # by Industry and Legal Form of Organization Using Firm Receipts to Classify Business Size, 1998-2003" # May 2021 Survey of Current Business: https://apps.bea.gov/scb/2021/05-may/0521-small-business.htm # Created by: Richard Cao # Last Updated: October 18, 2021 ########################################################################################################## # Load all packages that will be used # If you do not have any of these packages, you will need to use the install.packages() function first. # For example, executing install.packages("sqldf") will install the sqldf package which you can then load library(readxl) library(sqldf) library(openxlsx) library(reshape2) # Import data by business size from the IRS SOI website for 1998 - 2003 # All years then get combined into one large dataset temp <- tempfile() download.file("https://www.irs.gov/pub/irs-soi/98ot2busbr.xls", temp, mode ="wb") data98 <- read_excel(temp, sheet = "All industries", skip = 3) data98 <- as.data.frame(t(data98)) data98$year <- 1998 download.file("https://www.irs.gov/pub/irs-soi/99ot2busbr.xls", temp, mode ="wb") data99 <- read_excel(temp, sheet = "All industries", skip = 3) data99 <- as.data.frame(t(data99[,-1])) data99$year <- 1999 download.file("https://www.irs.gov/pub/irs-soi/00ot2busbr.xls", temp, mode ="wb") data00 <- read_excel(temp, sheet = "All industries", skip = 3) data00 <- as.data.frame(t(data00[,-1])) data00$year <- 2000 download.file("https://www.irs.gov/pub/irs-soi/01ot2busbr.xls", temp, mode ="wb") data01 <- read_excel(temp, sheet = "All industries", skip = 3) data01 <- as.data.frame(t(data01[,-1])) data01$year <- 2001 download.file("https://www.irs.gov/pub/irs-soi/02ot2busbr.xls", temp, mode ="wb") data02 <- read_excel(temp, sheet = "All industries", skip = 3) data02 <- as.data.frame(t(data02[,-1])) data02$year <- 2002 download.file("https://www.irs.gov/pub/irs-soi/03ot2busbr.xls", temp, mode ="wb") data03 <- read_excel(temp, sheet = "All industries", skip = 3) data03 <- as.data.frame(t(data03[,-1])) data03$year <- 2003 data <- rbind(data98,data99,data00,data01,data02,data03) # This part of the code separates out the Corporations, Partnerships, and Sole Proprietorships data # It also separates out the relevant financial variables: Receipts, Cost of Goods Sold (COGS), and Wages # The Corporations, Partnerships, and Sole Proprietorships data are then combined sector <- c("--","11","21","22","23","31-33","42-45","48-49","51","52","53","54","55", "56","61","62","71","72","81","81","99") data_all <- data[-1,c(1:3,8,11:12,91)] data_all$sector <- rep(rep(sector, each = 11), 6) data_all$LFO <- "All businesses" names(data_all)[4:6] <- c("Receipts","COGS","Wages") data_corp <- data[-1,c(1:3,21,24:25,91)] data_corp$sector <- rep(rep(sector, each = 11), 6) data_corp$LFO <- "Corporations" names(data_corp)[4:6] <- c("Receipts","COGS","Wages") data_part <- data[-1,c(1:3,60,63:64,91)] data_part$sector <- rep(rep(sector, each = 11), 6) data_part$LFO <- "Partnerships" names(data_part)[4:6] <- c("Receipts","COGS","Wages") data_sp <- data[-1,c(1:3,73,76:77,91)] data_sp$sector <- rep(rep(sector, each = 11), 6) data_sp$LFO <- "Sole Props" names(data_sp)[4:6] <- c("Receipts","COGS","Wages") full_data <- rbind(data_all, data_corp, data_part, data_sp) # Assign size classes (Small, Medium, Large) # Replace missing values with zeroes # If receipts are negative, set equal to zero cleanQry <- "select year, sector, LFO, case when V1 = 'Under' then 'Vsmall_250k' when V1 = '25000' then 'Vsmall_250k' when V1 = '100000' then 'Vsmall_250k' when V1 = '250000' then 'Vsmall_500k' when V1 = '500000' then 'Vsmall_1m' when V1 = '1000000' then 'Vsmall_2m' when V1 = '2500000' then 'Small_5m' when V1 = '5000000' then 'MSmall_10m' when V1 = '10000000' then 'Medium_50m' when V1 = '50000000' then 'Large_50plus' else null end as [size], case when Wages = 'N/A' then 0 else Wages end as [Wages], case when Receipts < 0 then 0 when Receipts = 'N/A' then 0 else Receipts end as [Receipts], case when COGS = 'N/A' then 0 else COGS end as [COGS] from full_data where V1 is not null and sector not in ('--','99')" clean <- sqldf(cleanQry) # Special query to handle margin industries, where gross output is receipts less COGS finalQry <- "select year, sector, LFO, size, sum(Wages) as [WS_SOI], sum(case when sector = '42-45' then Receipts - COGS else Receipts end) as [Receipts_SOI] from clean group by year, sector, LFO, size" complete <- sqldf(finalQry) ########################################################################################################### # Import the BEA wage and gross output totals by industry and year and then proportionally # scale the SOI receipts and wages to BEA's GO and Wages totals, pulled from the BEA website # NOTE: SOI data for sector 11 excludes farms (111CA) and sector 53 excludes Owner-occupied housing for GO ########################################################################################################### # Pull data from NIPA 6.3D (Wages and Salaries by Industry) # NOTES: The vintage of BEA data is from the 2020 Annual Update # The wages total for sector 48-49 used in the article erroneously included NAICS 4911 (US Postal Service) # which is reallocated to federal enterprises in BEA's industry data # Therefore the 48-49 wage value imported below is slightly different from the calculated # values in the Survey of Current Business article. download.file("https://apps.bea.gov/histdata/Releases/GDP_and_PI/2020/Q2/Third_September-30-2020/Section6all_xls.xlsx", temp, mode ="wb") Wages <- read_excel(temp, sheet = "T60300D-A", skip = 7) Wages <- Wages[c(6,7,11,12,13,35,38,43,52,57,62,65,69,70,73,74,79,82,85),c(5:10)] Wages$sector <- c("11","21","22","23","31-33","42-45","42-45","48-49","51","52","53","54","55", "56","61","62","71","72","81") Wages <- melt(Wages, variable.name = "year", value.name = "WS", id.vars = "sector") Wages <- sqldf("select sector, year, sum(WS) as [WS] from Wages group by sector, year") # Pulls Gross Output by Industry # NOTE: The vintage used is from the 2020 Annual Update download.file("https://apps.bea.gov/histdata/Releases/Industry/2020/GDP_by_Industry/Q2/Annual_September-30-2020/UGdpByInd.zip", temp, mode ="wb") GrossOutput <- read_excel(unzip(temp, "GrossOutput.xlsx"), sheet = "UGO205-A", skip = 7) GrossOutput <- GrossOutput[c(7,8,12,15,24,72,84,95,106,120,128,131,136,146,147,154,155,166,169,172),c(5:10)] GrossOutput[12,] <- -GrossOutput[12,] USPS <- read_excel(unzip(temp, "GrossOutput.xlsx"), sheet = "UGO305-A", skip = 7) USPS <- USPS[409,c(5:10)] GrossOutput <- rbind(GrossOutput, USPS) GrossOutput$sector <- c("11","21","22","23","31-33","42-45","42-45","48-49","51","52","53","53","54","55", "56","61","62","71","72","81","48-49") GrossOutput <- melt(GrossOutput, variable.name = "year", value.name = "GO", id.vars = "sector") GrossOutput <- sqldf("select sector, year, sum(GO) as [GO] from GrossOutput group by sector, year") # Combine Wage and Gross Output Controls into one data frame bea_control <- sqldf("select n.*, g.GO from Wages n join Wages g on n.sector = g.sector and n.year = g.year") # Calculates total receipts and wages by industry from SOI dataset # This is used to calculate scaling factors sums <- sqldf("select year, sector, sum(Receipts_SOI) as [tot_receipts], sum(WS_SOI) as [tot_WS] from complete where LFO = 'All businesses' group by year, sector order by year, sector") ###################################################################### # STEP 1: Split BEA's Wages and GO by Industry to the size categories ###################################################################### # Calculate the ratios used to allocate the BEA industry controls to the different size categories using the SOI distribution ratio_qry <- "select c.*, b.GO, s.tot_receipts, b.GO/s.tot_receipts as [go_factor], b.WS, b.WS/s.tot_WS as [ws_factor] from complete c left join bea_control b on c.sector = b.sector and c.year = b.year left join sums s on c.sector = s.sector and c.year = s.year where c.LFO = 'All businesses' " ratios <- sqldf(ratio_qry) # This query uses the ratios calculated above and perform the splits. final_qry <- "select year, sector, size, sum(Receipts_SOI) as [Receipts_SOI], sum(Receipts_SOI*go_factor) as [GO], sum(WS_SOI) as [WS_SOI], sum(WS_SOI*ws_factor) as [Payroll] from ratios group by year, sector, size order by year, sector" final <- sqldf(final_qry) ####################################################################################### # STEP 2: Take the Wages and GO by size from STEP 1 and split them again by legal form ####################################################################################### sums2 <- sqldf("select year, sector, size, sum(Receipts_SOI) as [tot_receipts], sum(WS_SOI) as [tot_WS] from complete where LFO <> 'All businesses' group by year, sector, size order by year, sector") ratio_qry2 <- "select c.*, b.GO, b.GO/s.tot_receipts as [go_factor], b.Payroll, b.Payroll/s.tot_WS as [ws_factor] from complete c left join final b on c.sector = b.sector and c.year = b.year and c.size = b.size left join sums2 s on c.sector = s.sector and c.year = s.year and c.size = s.size where c.LFO <> 'All businesses' " ratios2 <- sqldf(ratio_qry2) final_qry2 <- "select year, sector, size, LFO, sum(Receipts_SOI*go_factor) as [GO], sum(WS_SOI*ws_factor) as [Payroll] from ratios2 group by year, sector, size, LFO order by year, sector" final2 <- sqldf(final_qry2) # Export the final data to Excel # The data is exported in a vertical format # You will need to pivot directly in Excel to organize in a horizontal table # The file will be saved in "C:/Program Files/RStudio/" unlesss you specify a # different file directory with the setwd() command. wb1 <- createWorkbook() addWorksheet(wb1, "Data") writeData(wb1, "Data", final) saveWorkbook(wb1, file = "GO and Wages by Size and LFO (1998 - 2003).xlsx", overwrite = TRUE)