clear clear matrix set matsize 800 set mem 500m cd your directior use lahman pitching drop if yearID <1900 // rename variables rename (B S) (doub trip) // repace missing hbp with individual batting data merge 1:1 yearID lgID teamID using "lahman_hbp_sf_sh_patch.dta", update drop _merge // merge 2024 season data from bb_reference merge m:m yearID teamID using bb ref 2024 team totals drop _merge // runs per game & runs allowed per game gen rpg=(R/IPouts)*27 gen rapg=(RA/IPouts)*27 // merge bbref team rfield merge m:m yearID teamID using "bb_ref_team_rfield" drop _merge replace rfield=rfield_br if rfield==. // merge UZR merge m:m yearID teamID using UZR data drop _merge /// merge smith oaa rename (teamID yearID) (teamid yearid) merge m:m yearid teamid using OAA tfr team data rename tot_fld_r tfr rename (teamid yearid) (teamID yearID) drop _merge /// merge fielding bible drs rename teamID teamid merge m:m yearID teamid using fielding bible DRS data drop _merge // merge fangraphs defense coded merge m:m yearID teamid using fg fielding data drop _merge rename defense fgfield rename teamid teamID // merge BBREF FIP merge m:m yearID teamID using bb ref FIP data drop _merge drop if year <1900 // standardize rfield bysort yearID: egen mean_rfield = mean(rfield) bysort yearID: egen sd_rfield = sd(rfield) gen z_rfield = (rfield - mean_rfield) / sd_rfield // standardize drs and fgfield uzr tfr bysort yearID: egen mean_fgfield = mean(fgfield) bysort yearID: egen sd_fgfield = sd(fgfield) gen z_fgfield = (fgfield - mean_fgfield) / sd_fgfield bysort yearID: egen mean_fb_drs = mean(fb_drs) bysort yearID: egen sd_fb_drs = sd(fb_drs) gen z_fb_drs = (fb_drs - mean_fb_drs) / sd_fb_drs bysort yearID: egen mean_uzr = mean(uzr) bysort yearID: egen sd_uzr = sd(uzr) gen z_uzr = (uzr - mean_uzr) / sd_uzr bysort yearID: egen mean_tfr = mean(tfr) bysort yearID: egen sd_tfr = sd(tfr) gen z_tfr = (tfr - mean_tfr) / sd_tfr // generate composite smith fielding score: frs, which uses TZR pre 2003, tfr after gen frs=. replace frs = rfield if year <1990 | year >1999 & year < 2003 replace frs=tfr if year >1989 & year <2000 replace frs=rfield if year >1999 & year <2003 replace frs=tfr if year >2002 /// standardize frs bysort yearID: egen mean_frs = mean(frs) bysort yearID: egen sd_frs = sd(frs) gen z_frs = (frs - mean_frs) / sd_frs // standardize FIP bysort yearID: egen mean_FIP = mean(FIP) bysort yearID: egen sd_FIP = sd(FIP) gen z_FIP = (FIP - mean_FIP) / sd_FIP // standardize runs against per game and runs scored per game bysort yearID: egen mean_rapg = mean(rapg) bysort yearID: egen sd_rapg = sd(rapg) gen z_rapg = (rapg - mean_rapg) / sd_rapg bysort yearID: egen mean_rpg = mean(rpg) bysort yearID: egen sd_rpg = sd(rpg) gen z_rpg = (rpg - mean_rpg) / sd_rpg // multi-season regression models for TZR and DRS, fgfield hireg z_rapg (z_FIP) (z_rfield) if year < 2003 hireg z_rapg (z_FIP) (z_fgfield) if year <2003 hireg z_rapg (z_FIP) (z_rfield) if year > 2002 hireg z_rapg (z_FIP) (z_fgfield) if year > 2002 regress z_rapg z_FIP if year <1990 regress z_rapg z_FIP if year >2020 regress z_rapg z_FIP z_uzr if year > 2002 regress z_rapg z_FIP fb_drs if year > 2002 regress z_rapg z_FIP z_fgfield if year > 2002 regress z_rapg z_FIP rfield if year > 2002 hireg z_rapg (z_FIP) (z_uzr) if year > 2003 hireg z_rapg (z_FIP) (z_fb_drs) if year > 2003 hireg z_rapg (z_FIP) (rfield ) if year > 2002 / individual season models ** season R^2s using BB ref gen R21 = . gen R22 = . foreach yr of numlist 1900/2024 { // Adjust the range to your data // Run the regression with z_FIP only for the current year regress RA FIP if year == `yr' // Store R2 from the first model in R21 for the current year local r2_1 = e(r2) replace R21 = `r2_1' if year == `yr' // Run the regression with z_FIP and z_rfield for the current year regress RA FIP rfield if year == `yr' // Store R2 from the second model in R22 for the current year local r2_2 = e(r2) replace R22 = `r2_2' if year == `yr' } preserve * Collapse to get mean R2 by year collapse (mean) R21 R22, by(year) * convert dataset to time series based on year sort year tsset year * * Multiply the y-axis variables by 100 for % scaling gen R21_scaled=R21*100 gen R22_scaled=R22*100 * Plot figure 1 * Generate smoothed values for R22 using lpoly with specified points lpoly R22_scaled year, gen(R22_smooth) bw(2) at(year) * Plot with gray-filled area for R22 and dashed line for R21 twoway (area R22_smooth year, color(gs12%50)) /// // Gray-filled area for smoothed R22 (lpoly R21_scaled year, lcolor(black) lpattern(longdash) lwidth(medium) bw(5)), /// // Black dashed lpoly line for smoothed R21 ytitle("R-squared (scaled x100)") /// xtitle("Year") /// ylabel(0(10)100, labsize(medium) nogrid) /// xlabel(1900(10)2023, angle(45) labsize(medium) nogrid) /// legend(off) /// graphregion(color(white)) /// plotregion(margin(zero)) restore drop R21 R22 ** season R^2s using frs drop R21 R22 gen R21 = . gen R22 = . foreach yr of numlist 1900/2024 { // // Run the regression with z_FIP only for the current year regress RA FIP if year == `yr' // Store R2 from the first model in R21 for the current year local r2_1 = e(r2) replace R21 = `r2_1' if year == `yr' // Run the regression with z_FIP and z_fgfield for the current year regress RA FIP frs if year == `yr' // Store R2 from the second model in R22 for the current year local r2_2 = e(r2) replace R22 = `r2_2' if year == `yr' di `yr' } preserve * Collapse to get mean R2 by year collapse (mean) R21 R22, by(year) * convert dataset to time series based on year sort year tsset year * * Multiply the y-axis variables by 100 for % scaling gen R21_scaled=R21*100 gen R22_scaled=R22*100 * Plot figure 1 * Generate smoothed values for R22 using lpoly with specified points lpoly R22_scaled year, gen(R22_smooth) bw(2) at(year) * Plot with gray-filled area for R22 and dashed line for R21 twoway (area R22_smooth year, color(gs12%50)) /// // Gray-filled area for smoothed R22 (lpoly R21_scaled year, lcolor(black) lpattern(longdash) lwidth(medium) bw(5)), /// // Black dashed lpoly line for smoothed R21 ytitle("R-squared (scaled x100)") /// xtitle("Year") /// ylabel(0(10)100, labsize(medium) nogrid) /// xlabel(1900(10)2023, angle(45) labsize(medium) nogrid) /// legend(off) /// graphregion(color(white)) /// plotregion(margin(zero)) restore *** marginal fip UZR gen R21 = . gen R22 = . replace rfield=uzr if year>2001 foreach yr of numlist 1900/2024 { // // Run the regression with z_FIP only for the current year regress RA FIP if year == `yr' // Store R2 from the first model in R21 for the current year local r2_1 = e(r2) replace R21 = `r2_1' if year == `yr' // Run the regression with z_FIP and z_rfield for the current year regress RA FIP rfield if year == `yr' // Store R2 from the second model in R22 for the current year local r2_2 = e(r2) replace R22 = `r2_2' if year == `yr' } preserve * Collapse to get mean R2 by year collapse (mean) R21 R22, by(year) * convert dataset to time series based on year sort year tsset year * * Multiply the y-axis variables by 100 for % scaling gen R21_scaled=R21*100 gen R22_scaled=R22*100 * Plot figure 1 * Generate smoothed values for R22 using lpoly with specified points lpoly R22_scaled year, gen(R22_smooth) bw(2) at(year) * Plot with gray-filled area for R22 and dashed line for R21 twoway (area R22_smooth year, color(gs12%50)) /// // Gray-filled area for smoothed R22 (lpoly R21_scaled year, lcolor(black) lpattern(longdash) lwidth(medium) bw(5)), /// // Black dashed lpoly line for smoothed R21 ytitle("R-squared (scaled x100)") /// xtitle("Year") /// ylabel(0(10)100, labsize(medium) nogrid) /// xlabel(1900(10)2023, angle(45) labsize(medium) nogrid) /// legend(off) /// graphregion(color(white)) /// plotregion(margin(zero)) restore replace rfield=rfield_br if year >2002 ** season R^2s using fg replace rfield=fb_drs if year >2002 drop R21 R22 ** season R^2s using BB ref pre 2003 & DRS post replace rfield=fb_drs if year >2002 gen R21 = . gen R22 = . foreach yr of numlist 1900/2024 { // // Run the regression with z_FIP only for the current year regress RA FIP if year == `yr' // Store R2 from the first model in R21 for the current year local r2_1 = e(r2) replace R21 = `r2_1' if year == `yr' // Run the regression with z_FIP and z_rfield for the current year regress RA FIP rfield if year == `yr' // Store R2 from the second model in R22 for the current year local r2_2 = e(r2) replace R22 = `r2_2' if year == `yr' } preserve * Collapse to get mean R2 by year collapse (mean) R21 R22, by(year) * convert dataset to time series based on year sort year tsset year * * Multiply the y-axis variables by 100 for % scaling gen R21_scaled=R21*100 gen R22_scaled=R22*100 * Plot figure 1 * Generate smoothed values for R22 using lpoly with specified points lpoly R22_scaled year, gen(R22_smooth) bw(2) at(year) * Plot with gray-filled area for R22 and dashed line for R21 twoway (area R22_smooth year, color(gs12%50)) /// // Gray-filled area for smoothed R22 (lpoly R21_scaled year, lcolor(black) lpattern(longdash) lwidth(medium) bw(5)), /// // Black dashed lpoly line for smoothed R21 ytitle("R-squared (scaled x100)") /// xtitle("Year") /// ylabel(0(10)100, labsize(medium) nogrid) /// xlabel(1900(10)2023, angle(45) labsize(medium) nogrid) /// legend(off) /// graphregion(color(white)) /// plotregion(margin(zero)) restore replace rfield=rfield_br if year >2002 ** season R^2s using fg replace rfield=fb_drs if year >2002 drop R21 R22 gen R21 = . gen R22 = . foreach yr of numlist 1900/2024 { // // Run the regression with z_FIP only for the current year regress RA FIP if year == `yr' // Store R2 from the first model in R21 for the current year local r2_1 = e(r2) replace R21 = `r2_1' if year == `yr' // Run the regression with z_FIP and z_fgfield for the current year regress RA FIP fgfield if year == `yr' // Store R2 from the second model in R22 for the current year local r2_2 = e(r2) replace R22 = `r2_2' if year == `yr' } preserve * Collapse to get mean R2 by year collapse (mean) R21 R22, by(year) * convert dataset to time series based on year sort year tsset year * * Multiply the y-axis variables by 100 for % scaling gen R21_scaled=R21*100 gen R22_scaled=R22*100 * Plot figure 1 * Generate smoothed values for R22 using lpoly with specified points lpoly R22_scaled year, gen(R22_smooth) bw(2) at(year) * Plot with gray-filled area for R22 and dashed line for R21 twoway (area R22_smooth year, color(gs12%50)) /// // Gray-filled area for smoothed R22 (lpoly R21_scaled year, lcolor(black) lpattern(longdash) lwidth(medium) bw(5)), /// // Black dashed lpoly line for smoothed R21 ytitle("R-squared (scaled x100)") /// xtitle("Year") /// ylabel(0(10)100, labsize(medium) nogrid) /// xlabel(1900(10)2023, angle(45) labsize(medium) nogrid) /// legend(off) /// graphregion(color(white)) /// plotregion(margin(zero)) restore replace rfield=rfield_br if year >2002 /// relative post 2002/03 R2 gen Rfip = . gen Ri_fb = . gen Ri_fg = . gen Ri_uzr= . gen Ri_tfr =. foreach yr of numlist 2003/2024 { // Adjust the range to your data // Run the regression with FIP only for the current year regress RA FIP if year == `yr' // Store R2 from the first model in R21 for the current year local r2_1 = e(r2) replace Rfip = `r2_1' if year == `yr' // Run the regression with FIP and fb for the current year regress RA FIP fb_drs if year == `yr' // Store incremental R2 from the second model local r2_2 = e(r2) replace Ri_fb = `r2_2'-Rfip if year == `yr' // Run the regression with FIP and fg for the current year regress RA FIP fgfield if year == `yr' // Store incemental R2 from the third model for the current year local r2_3 = e(r2) replace Ri_fg = `r2_3'-Rfip if year == `yr' // Run the regression with FIP and fg for the current year regress RA FIP uzr if year == `yr' // Store incemental R2 from the third model for the current year local r2_4 = e(r2) replace Ri_uzr = `r2_4'-Rfip if year == `yr' // Run the regression with FIP and fg for the current year regress RA FIP tfr if year == `yr' // Store incemental R2 from the third model for the current year local r2_5 = e(r2) replace Ri_tfr = `r2_5'-Rfip if year == `yr' } preserve collapse (mean) Rfip Ri_fb Ri_fg Ri_uzr Ri_tfr,by(yearID) drop if year <2003 * Step 2: Create cumulative variables for ribbon placement gen cum_fip = Rfip*100 gen cum_drs = (cum_fip)+Ri_fb*100 gen cum_fg = (cum_fip) + Ri_fg*100 gen cum_uzr = (cum_fip) + Ri_uzr*100 gen cum_tfr = (cum_fip) + Ri_tfr*100 * Smooth the data for each cumulative variable foreach lname in fip drs fg uzr tfr { lpoly cum_`lname' year, gen(smooth_`lname') bw(2) at(year) } * Create a variable for the lower bound gen zero = 0 * Create a variable for the lower bound gen zero = 0 twoway /// (lpolyci beta_tfr_neg year if year >1950, fcolor(dkorange%50) alcolor(dkorange%1) clstyle(none)) /// (lpoly beta_tfr_neg year if year >1950, lcolor(dkorange) lpattern(dash)) /// (lpolyci beta_fgrf_neg year if year >1950, fcolor(ltblue%50) alcolor(ltblue%1) clstyle(none)) /// (lpoly beta_fgrf_neg year if year >1950, lcolor(ltblue) lpattern(dash)) /// (lpolyci beta_rfield_neg year if year >1950, alcolor(gs12%50) clstyle(none)) /// (lpoly beta_rfield_neg year if year >1950, lcolor(black) lpattern(dash)), /// ylabel(, nogrid) xlabel(1950(10)2020, angle(45) nogrid) /// title("") xtitle("") /// legend(off) twoway /// rarea smooth_fip zero year, color(gs12%60) alcolor(gs12%01) || /// Smoothed Base Rrf rarea smooth_tfr smooth_fip year, color(yellow%80) alcolor(yellow%01) || /// Smoothed Increment K rarea smooth_drs smooth_fip year, color(blue%45) alcolor(blue%01) || /// Smoothed Increment BB rarea smooth_uzr smooth_fip year, color(red%40) alcolor(red%01) || /// Smoothed Increment BB , legend(order(1 "FIP" 2 "Increment tfr" 3 "Increment drs" 4 "Increment uzr" )) /// title(" ") /// xlabel(2003(5)2024, nogrid angle(45)format(%ty)) /// ylabel(0(10)100, nogrid angle(0)) /// ytitle("R² Value") /// xtitle("Year") restore // models for ballpark estimates of diminution of fielding importance bef after K rates // increase hireg z_rapg (z_FIP) (z_rfield) if year >1954 & year < 2000 hireg z_rapg (z_FIP) (z_rfield) if year > 2002 // illustrate shrinking value of rfield_br *** save unit value of refield for each season gen beta_rfield = . gen beta_drs = . gen beta_fgrf=. gen beta_uzr=. gen beta_frs=. gen beta_tfr=. gen beta_FIP=. foreach yr of numlist 1900/2024 { // // Run the regression with FIP and rfield only for the current year quietly regress RA FIP rfield if year == `yr' // Store the betas in the current year local b_rfield = _b[rfield] local b_FIP = _b[FIP] // Replace stored betas with betas for observations in the current year replace beta_rfield = `b_rfield' if year == `yr' replace beta_FIP = `b_FIP' if year == `yr' // Run the regression with FIP and fgfield only for the current year quietly regress RA FIP fgfield if year == `yr' // Replace beta_ for observations in the current year replace beta_fgrf = _b[fgfield] if year == `yr' } *** save unit value of frs for each season foreach yr of numlist 1920/2024 { // Run the regression with FIP and rfield only for the current year quietly regress RA FIP frs if year == `yr' // Replace beta_ for observations in the current year replace beta_frs = _b[frs] if year == `yr' } *** save unit value of tfr for each season foreach yr of numlist 1920/2024 { // Run the regression with FIP and rfield only for the current year quietly regress RA FIP tfr if year == `yr' // Replace beta_ for observations in the current year replace beta_tfr = _b[tfr] if year == `yr' } *** save unit value of uzr for each season foreach yr of numlist 2002/2024 { // Run the regression with FIP and rfield only for the current year quietly regress RA FIP uzr if year == `yr' // Replace beta_ for observations in the current year replace beta_uzr = _b[uzr] if year == `yr' } *** save unit value of drs for each season foreach yr of numlist 2003/2024 { // Run the regression with FIP and rfield only for the current year quietly regress RA FIP fb_drs if year == `yr' // Replace beta_ for observations in the current year replace beta_drs = _b[fb_drs] if year == `yr' } *** plot ** change sign of betas to reflect runs saved gen beta_rfield_neg= beta_rfield*-1 gen beta_fgrf_neg= beta_fgrf*-1 gen beta_drs_neg= beta_drs*-1 gen beta_uzr_neg= beta_uzr*-1 gen beta_frs_neg=beta_frs*-1 gen beta_tfr_neg=beta_tfr*-1 * adjust for 2021 short season replace beta_FIP=beta_FIP*2.7 if year ==2020 *** plot * Create a variable for the lower bound gen zero = 0 twoway /// rarea smooth_fip zero year, color(gs12%60) alcolor(gs12%01) || /// Smoothed Base Rrf rarea smooth_tfr smooth_fip year, color(yellow%80) alcolor(yellow%01) || /// Smoothed Increment K rarea smooth_drs smooth_fip year, color(red%55) alcolor(blue%01) || /// Smoothed Increment BB rarea smooth_uzr smooth_fip year, color(blue%40) alcolor(red%01) || /// Smoothed Increment BB , legend(order(1 "FIP" 2 "Increment tfr" 3 "Increment drs" 4 "Increment uzr" )) /// title(" ") /// xlabel(2003(5)2024, nogrid angle(45)format(%ty)) /// ylabel(0(10)100, nogrid angle(0)) /// ytitle("R² Value") /// xtitle("Year") twoway /// rarea smooth_fip zero year, color(gs12%60) alcolor(gs12%00) || /// Smoothed Base Rrf rarea smooth_drs smooth_fip year, color(yellow%70) alcolor(red%00) || /// Smoothed Increment K rarea smooth_uzr smooth_fip year, color(red%40) alcolor(blue%00) || /// Smoothed Increment BB , legend(order(1 "FIP" 2 "DRS increment" 3 "UZR increment")) /// title(" ") /// xlabel(2003(5)2024, nogrid angle(45)format(%ty)) /// ylabel(0(10)100, nogrid angle(0)) /// ytitle("R² Value") /// xtitle("Year") twoway /// (lpolyci beta_tfr_neg year if year >1950, fcolor(dkorange%50) alcolor(dkorange%1) clstyle(none)) /// (lpoly beta_tfr_neg year if year >1950, lcolor(dkorange) lpattern(dash)) /// (lpolyci beta_fgrf_neg year if year >1950, fcolor(ltblue%50) alcolor(ltblue%1) clstyle(none)) /// (lpoly beta_fgrf_neg year if year >1950, lcolor(ltblue) lpattern(dash)) /// (lpolyci beta_rfield_neg year if year >1950, alcolor(gs12%50) clstyle(none)) /// (lpoly beta_rfield_neg year if year >1950, lcolor(black) lpattern(dash)), /// ylabel(, nogrid) xlabel(1950(10)2020, angle(45) nogrid) /// title("") xtitle("") /// legend(off) /// assess rinflation on Boyer, Beltre & areanado preserve collapse (mean) beta_rfield beta_drs ,by (year) drop if year <1955 merge m:m yearID using third base rfield data // calculate adjusted rfs (taking into account need to reverse signs of betas, which are valanced toward runs allowed, not saved) gen adj_br= beta_rfield*rfield*-1 gen adj_drs=beta_drs*drs*-1 collapse (sum) (rfield drs adj_br adj_drs), by(nameLast) gen br_inflation=adj_br-rfield gen drs_inflation=adj_drs-drs export excel using "[your directory]third_base_rfield_inflation.xls", firstrow(varlabels) replace restore *** how bad was Derek Jeter? preserve collapse (mean) beta_rfield beta_drs ,by (year) drop if year <2000 merge m:m yearID using jeter_rfield data gen dj_dra=0 forval yr = 2000/2014 { replace dj_dra = djrf * beta_rfield*-1 if year == `yr' } gen dj_br_inflation=dj_dra-djrf collapse (sum) djrf dj_dra dj_br_inflation export excel using "[your directory]\jeter_rfield_inflation.xls", firstrow(varlabels) replace restore /// Illustrate impact of shfting value of fielding and pitching for 1967 pennant race ** use clarify MC to compute difference in runs avoided for Red Sox and Twins '67 vs. '24 estsimp regress RA rfield FIP if year >1962 & year == 1967 setx mean simqi, fd(ev) changex(rfield 37 -28 FIP 3.5 2.90) drop b* estsimp regress RA rfield FIP if year >1962 & year == 2024 setx mean simqi, fd(ev) changex(rfield 37 -28 FIP 3.5 2.90) drop b* // compute contributions to team runs scored associated with OPSs *** calculate OPS *** generate SLG *** gen slg=((H-(doub+trip))+2*doub+3*trip+4*HR)/AB *** generate opb *** gen obp =(H+BB+HBP)/(AB+BB+HBP+SF) *** add for OPS gen ops=obp+slg *** use clarify MC to estimate differences in runs scored '67 vs '24 estsimp regress R ops if year ==1967 setx mean simqi, fd(ev) changex(ops .716 .695) drop b* estsimp regress R ops if year ==2024 setx mean simqi, fd(ev) changex(ops .716 .695) drop b*