* ==========================================================================
* Part 1: team-level data assembly and batting metrics.
*
* Loads the team hitting/pitching file, applies the HBP/SF/SH patch, restricts
* to AL/NL seasons from 1900 on, appends the 2024 patch, and builds runs per
* game, SLG, OBP, OPS and a fixed-weight wOBA, each standardized within
* season. Finally merges the BBREF rbat file and creates empty variables that
* the per-season regression further down fills with coefficients.
* ==========================================================================
clear
clear matrix
set matsize 800
set mem 500m
cd "[your directory]"

* NOTE(review): file name is spelled "Lanham" here but "Lahman" everywhere
* else in this script -- confirm the name on disk.
use Lanham_team_hitting_pitching

* merge Lahman HBP/SF/SH patch
merge 1:1 yearID teamID lgID using "lahman_hbp_sf_sh_patch.dta", update
drop _merge

* drop non AL/NL
drop if lgID != "AL" & lgID != "NL"

* drop pre-modern seasons
drop if yearID < 1900

* rename hit variables (B -> doubles, S -> triples)
rename B doub
rename S trip

* add 2024 bb_ref team patch
* NOTE(review): m:m merges pair rows by position within each key group;
* if yearID teamID is unique on both sides this should be 1:1 -- confirm.
merge m:m yearID teamID using "bb_ref_2024_patch.dta"
drop _merge

* generate singles: hits minus ALL extra-base hits.
* (Home runs must be removed too; otherwise every HR is also counted as a
* single in SLG and wOBA below.)
gen sing = H - (doub + trip + HR)

* runs per game: runs per 27 outs
gen rpg = (R/IPouts)*27

* standardize runs per game by season
bysort yearID: egen mean_rpg = mean(rpg)
bysort yearID: egen sd_rpg = sd(rpg)
gen z_rpg = (rpg - mean_rpg) / sd_rpg

* generate SLG
gen slg = (sing + 2*doub + 3*trip + 4*HR)/AB

* generate OBP
gen obp = (H + BB + HBP)/(AB + BB + HBP + SF)

* generate OPS
gen ops = obp + slg

* generate TLD wOBA.
* Denominator is AB+BB+HBP+SF -- hits are already part of AB, so adding H
* again would double-count them; this also matches the denominator used for
* the player-level rates later in this script.
gen wOBA = (HR*1.7 + 1.37*trip + 1.08*doub + .77*sing + .62*BB + .65*HBP)/(AB + BB + HBP + SF)

* standardize ops and wOBA by season
bysort yearID: egen mean_ops = mean(ops)
bysort yearID: egen sd_ops = sd(ops)
gen z_ops = (ops - mean_ops) / sd_ops

bysort yearID: egen mean_wOBA = mean(wOBA)
bysort yearID: egen sd_wOBA = sd(wOBA)
gen z_woba = (wOBA - mean_wOBA) / sd_wOBA

* merge BBREF rbat (that file keys on lower-case teamid)
rename teamID teamid
* NOTE(review): m:m merge -- see note on the 2024 patch merge above.
merge m:m yearID teamid using "bb_ref_team_rbat_coded.dta"
drop _merge

* standardize rbat by season
bysort yearID: egen mean_rbat = mean(rbat)
bysort yearID: egen sd_rbat = sd(rbat)
gen z_rbat = (rbat - mean_rbat) / sd_rbat

* create swoba -- regression-derived wOBA
* Generate variables to store the per-season regression coefficients
generate b_hrPA = .
generate b_sPA = .
generate b_dPA = .
generate b_tPA = .
generate b_bbPA = .
generate b_hbpPA = .
generate b_cons = .
* ==========================================================================
* Part 2: regression-derived wOBA (swoba) at the team level, comparison with
* OPS / fixed-weight wOBA / FanGraphs wOBA / rbat, diagnostic graphs, and
* loading + cleaning of the individual-hitter file.
* ==========================================================================

* Per-PA event rates used as regressors for swoba.
* The denominator is AB+BB+HBP+SF (hits are already inside AB, so H must not
* be added again); this is the same denominator the player-level rates use
* when the coefficients saved below are applied to individual hitters.
* Singles are computed directly as H minus all extra-base hits so that this
* rate does not depend on how `sing' was built upstream.
gen sPA   = (H - doub - trip - HR)/(AB + BB + HBP + SF)
gen dPA   = doub/(AB + BB + HBP + SF)
gen tPA   = trip/(AB + BB + HBP + SF)
gen hrPA  = HR/(AB + BB + HBP + SF)
gen bbPA  = BB/(AB + BB + HBP + SF)
gen hbpPA = HBP/(AB + BB + HBP + SF)

* Variable to store regression-derived woba
generate swoba = .

* One regression of runs per game on the event rates for every season
forvalues y = 1900/2024 {
    regress rpg hrPA sPA dPA tPA bbPA hbpPA if yearID == `y'

    * Store this season's coefficients on this season's rows
    replace b_hrPA  = _b[hrPA]  if yearID == `y'
    replace b_sPA   = _b[sPA]   if yearID == `y'
    replace b_dPA   = _b[dPA]   if yearID == `y'
    replace b_tPA   = _b[tPA]   if yearID == `y'
    replace b_bbPA  = _b[bbPA]  if yearID == `y'
    replace b_hbpPA = _b[hbpPA] if yearID == `y'
    replace b_cons  = _b[_cons] if yearID == `y'

    * Team swoba = fitted value of that season's regression
    tempvar temp_pred
    predict `temp_pred' if yearID == `y', xb
    replace swoba = `temp_pred' if yearID == `y'
    drop `temp_pred'    // do not accumulate one temporary variable per season
}

* Save swoba coefficients for predictive testing.
* Every row of a season carries the same coefficients, so keep one row per
* distinct combination; the file then has a single row per yearID.
preserve
keep yearID b_hrPA b_sPA b_dPA b_tPA b_bbPA b_hbpPA b_cons
duplicates drop
save "swoba_coefficients.dta", replace
restore

* standardizing swoba
bysort yearID: egen mean_swoba = mean(swoba)
bysort yearID: egen sd_swoba = sd(swoba)
gen z_swoba = (swoba - mean_swoba) / sd_swoba

* create fangraphs woba: fwoba
* NOTE(review): m:m merge; if fg_woba_wts.dta has one row per yearID this
* should be m:1 -- confirm.
merge m:m yearID using "fg_woba_wts.dta"
drop _merge
gen fwoba = (hrPA*fwhr + tPA*fw3b + dPA*fw2b + sPA*fw1b + bbPA*fwbb + hbpPA*fwhbp)
bysort yearID: egen mean_fwoba = mean(fwoba)
bysort yearID: egen sd_fwoba = sd(fwoba)
gen z_fwoba = (fwoba - mean_fwoba) / sd_fwoba

* perform regressions of runs per game explained
regress z_rpg z_swoba
regress z_rpg z_ops
regress z_rpg z_woba
regress z_rpg z_fwoba
regress z_rpg z_rbat

su rpg ops swoba, d

* Scatter of standardized runs per game against standardized OPS.
* Axis ticks are placed on the z scale but labelled with raw-scale values.
twoway (scatter z_rpg z_ops, mcolor(ltblue)) ///
       (lfit z_rpg z_ops, lcolor(black)) ///
       , xlabel(-2.5 ".580" -1.5 ".640" -.5 ".700" .5 ".760" 1.5 ".820" 2.5 ".880", nogrid) ///
       ylabel(-2.8 "3" -1.3 "4" .2 "5" 1.7 "6" 3.2 "7", nogrid) ///
       xtitle("z_ops") ///
       graphregion(color(white)) ///
       plotregion(style(none)) ///
       xscale(r(-3 3)) ///
       legend(off)

* Same plot for standardized swoba
twoway (scatter z_rpg z_swoba, mcolor(ltblue)) ///
       (lfit z_rpg z_swoba, lcolor(black)) ///
       , xlabel(-2.3 "3" -.6 "4" 1 "5" 2.6 "6" 4.2 "7", nogrid) ///
       ylabel(-2.8 "3" -1.3 "4" .2 "5" 1.7 "6" 3.2 "7", nogrid) ///
       xtitle("swoba") ///
       graphregion(color(white)) ///
       plotregion(style(none)) ///
       xscale(r(-3.8 3.2)) ///
       legend(off)

* Season-by-season R-squared of runs per game on each metric:
*   Rops <- ops, Rfg <- fwoba, Rwr <- swoba, Rbrb <- rbat
gen Rfg = .
gen Rwr = .
gen Rops = .
gen Rbrb = .

local metrics ops  fwoba swoba rbat
local targets Rops Rfg   Rwr   Rbrb
forvalues yr = 1900/2024 {      // adjust the range to your data
    forvalues i = 1/4 {
        local x : word `i' of `metrics'
        local r : word `i' of `targets'
        * yearID is spelled out in full rather than relying on `year'
        * being resolved through variable-name abbreviation
        regress rpg `x' if yearID == `yr'
        replace `r' = e(r2) if yearID == `yr'
    }
}

* Smoothed R-squared (x100) over time for the four metrics
preserve
keep yearID Rfg Rwr Rops Rbrb
gen Rfg_100  = Rfg*100
gen Rwr_100  = Rwr*100
gen Rops_100 = Rops*100
gen Rbrb_100 = Rbrb*100

twoway (lpolyci Rfg_100 yearID, lwidth(none) bwidth(10)) /// CI band for smoothed Rfg
       (lpoly Rfg_100 yearID, lcolor(black) lpattern(shortdash) lwidth(thin) bwidth(10)) /// smoothed Rfg
       (lpolyci Rops_100 yearID, lwidth(none) bwidth(10)) /// CI band for smoothed Rops
       (lpoly Rops_100 yearID, lcolor(black) lpattern(shortdash) lwidth(thin) bwidth(10)) /// smoothed Rops
       (lpolyci Rwr_100 yearID, lwidth(none) bwidth(10)) /// CI band for smoothed Rwr
       (lpoly Rwr_100 yearID, lcolor(black) lpattern(shortdash) lwidth(thin) bwidth(10)) /// smoothed Rwr
       (lpolyci Rbrb_100 yearID, lwidth(none) bwidth(10)) /// CI band for smoothed Rbrb
       (lpoly Rbrb_100 yearID, lcolor(black) lpattern(shortdash) lwidth(thin) bwidth(10)), /// smoothed Rbrb
       ytitle("R-squared (scaled x100)") ///
       xtitle("Year") ///
       ylabel(50(10)100, labsize(medium) nogrid) ///
       xlabel(1900(10)2024, angle(45) labsize(medium) nogrid) ///
       legend(off) ///
       graphregion(color(white)) ///
       plotregion(margin(zero))
restore

* --------------------------------------------------------------------------
* Now examine individual hitter swoba
* --------------------------------------------------------------------------
clear
* File name contains spaces, so it must be quoted.
* NOTE(review): confirm this is the actual file name on disk.
use "Lahman hitting data"

* rename some variables
rename AB ab
rename R r
rename H h
rename B b2
rename L b3
rename HR hr
rename RBI rbi
rename SB sb
rename CS cs
rename BB bb
rename IBB ibb
rename HBP hbp
rename SH sh
rename SF sf
rename G g
rename SO so

* drop pitchers: keep rows that found no match in the pitcher-name file,
* plus Ruth
merge m:1 playerID using "bb_ref_lahman_pitcher_names_match.dta", update replace
keep if _merge==1 | playerID=="ruthba01"
drop _merge

* drop seasons prior to 1900
drop if yearID < 1900

* drop non AL/NL
drop if lgID != "AL" & lgID != "NL"

* aggregate each player's season over multiple stints
collapse (sum) g ab r h b2 b3 hr rbi sb cs bb so ibb hbp sh sf, by(playerID yearID lgID)
drop if ab==0

* rename for consistency
rename yearID yearid
rename playerID playerid

* eliminate select steroid users
drop if (playerid=="bondsba01" & yearid > 1996)
drop if inlist(playerid, "mcgwima01", "sosasa01", "cansejo01", "giambja01", "palmera01", "rodrial01")

* add 2024 patch
* NOTE(review): m:m merge; a player can have one row per league within a
* season here, so yearid playerid need not be unique in the master -- verify
* that the pairing is what is intended.
merge m:m yearid playerid using "bb_ref_2024_season_hitting_coded"
drop if ab ==.
* ==========================================================================
* Part 3: player-level OPS and swoba, plus PA-weighted season mean and SD of
* swoba, and the PA figure used for batting-title eligibility.
* ==========================================================================
drop if ab==0
drop _merge

* Merge swoba coefficients (one set per season).
* The file is read from the working directory, which is where this script
* saves it, instead of a machine-specific absolute path. It is reduced to one
* row per yearID first so the merge is a well-defined many-to-one lookup.
rename yearid yearID
preserve
use "swoba_coefficients.dta", clear
duplicates drop yearID, force
tempfile swoba_coefs
save `swoba_coefs'
restore
merge m:1 yearID using `swoba_coefs'
drop _merge
rename yearID yearid

* Calculate ops for each player-year combination
gen slg = (((h - (b2 + b3 + hr)) + 2*b2 + 3*b3 + 4*hr))/ab
gen obp = (h + bb + hbp)/(ab + bb + hbp + sf)
gen ops = slg + obp

* rows that came only from the coefficient file have no batting data
drop if ab ==.
drop if ab==0

* calculate swoba for each player-year combo: per-PA event rates times the
* season's regression coefficients, plus the season's constant
gen doub = b2
gen trip = b3
gen sing = h - (hr + doub + trip)
gen hrPA  = hr/(ab + bb + hbp + sf)
gen tPA   = trip/(ab + bb + hbp + sf)
gen dPA   = doub/(ab + bb + hbp + sf)
gen sPA   = sing/(ab + bb + hbp + sf)
gen bbPA  = bb/(ab + bb + hbp + sf)
gen hbpPA = hbp/(ab + bb + hbp + sf)
gen swoba = (hrPA*b_hrPA + tPA*b_tPA + dPA*b_dPA + sPA*b_sPA + bbPA*b_bbPA + hbpPA*b_hbpPA + b_cons)

* weighted mean and SD of each season's swoba irrespective of league,
* weighting by ab+bb+hbp+sf
egen total_pa = total(ab + bb + hbp + sf), by(yearid)
generate weighted_swoba_product = swoba * (ab + bb + hbp + sf)
egen total_weighted_swoba = total(weighted_swoba_product), by(yearid)
generate weighted_mean_swoba = total_weighted_swoba / total_pa
generate squared_diff_swoba = (swoba - weighted_mean_swoba)^2
egen total_squared_diff_swoba = total(squared_diff_swoba*(ab + bb + hbp + sf)), by(yearid)
generate weighted_variance_swoba = (total_squared_diff_swoba / (total_pa - 1))
generate weighted_sd_swoba = sqrt(weighted_variance_swoba)

* Real PA (plate appearances) as the sum of the relevant totals
* (unlike the rate denominator above, this one includes sacrifice hits)
gen PA = ab + bb + sh + sf + hbp

* adjusted Gwynn 96 (applied after PA and the rates above were computed)
replace ab = 495 if (yearid==1996 & playerid=="gwynnto01")

* Determine batting title eligibility including ad hoc adjustments made by MLB...
* ==========================================================================
* Part 4: batting-title eligibility flag, z-scores for eligible hitters,
* scaled swoba measures, name lookup, and export of the swoba report.
* ==========================================================================
gen ba_elig = 0

* General rules by era plus individual exceptions.
* The first rule covers every season before 1950, so that 1949 is not left
* without any applicable rule.
replace ba_elig = 1 if (yearid < 1950 & g > 99) | ///
    (yearid >= 1950 & yearid <= 1956 & ab > 399) | ///
    (yearid >= 1957 & yearid <= 1960 & PA > 477) | ///
    (yearid > 1960 & PA > 502) | ///
    (playerid == "hargrbu01" & yearid == 1926) | ///
    (playerid == "lombaer01" & yearid == 1942) | ///
    (playerid == "madlobi01" & yearid == 1981) | ///
    (yearid==1996 & playerid=="gwynnto01") | ///
    (yearid==1969 & playerid=="carewro01")

* Season-specific thresholds
replace ba_elig = 1 if (yearid == 1918 & PA > 419 | yearid == 1919 & PA > 419)
replace ba_elig = 1 if (yearid == 1972 & PA > 483)
replace ba_elig = 1 if (yearid == 1981 & PA > 320)
* NOTE(review): the next rule is already implied by the 1981 rule above
* (PA > 356 implies PA > 320); kept as written -- check whether a different
* season or threshold was intended.
replace ba_elig = 1 if (yearid == 1981 & PA > 356)
replace ba_elig = 1 if (yearid == 1994 & PA > 356)
replace ba_elig = 1 if (yearid == 2020 & PA > 185)

* PA-weighted season mean and SD of ops, built the same way as the swoba
* statistics (weights ab+bb+hbp+sf, variance divided by total weight - 1).
* Only computed if these variables do not already exist.
capture confirm variable weighted_mean_ops weighted_sd_ops
if _rc {
    tempvar w sum_w sum_wx sum_wdev
    gen `w' = ab + bb + hbp + sf
    egen `sum_w' = total(`w'), by(yearid)
    egen `sum_wx' = total(ops*`w'), by(yearid)
    gen weighted_mean_ops = `sum_wx' / `sum_w'
    egen `sum_wdev' = total(`w'*(ops - weighted_mean_ops)^2), by(yearid)
    gen weighted_sd_ops = sqrt(`sum_wdev' / (`sum_w' - 1))
    drop `w' `sum_w' `sum_wx' `sum_wdev'
}

* Calculate z-scores (eligible hitters only; missing otherwise)
gen z_ops = (ops - weighted_mean_ops) / weighted_sd_ops if ba_elig == 1
gen z_swoba = (swoba - weighted_mean_swoba) / weighted_sd_swoba if ba_elig == 1

* delete batters not eligible for batting title
drop if ba_elig == 0

* center z-scores at 0
su z_swoba
replace z_swoba = z_swoba - r(mean)

* create a scaled measure of swoba: min-max rescaled to 0-100
su z_swoba, d
gen normal_swoba = ((z_swoba - r(min))/(r(max) - r(min)))*100

* swoba+: scaled swoba relative to its mean, times 100
su normal_swoba
gen swoba_plus = 100*(normal_swoba/r(mean))

* generate swoba performance report
* fix names
rename (playerid yearid) (playerID yearID)
* NOTE(review): m:m merge; if lahman_names.dta has one row per playerID this
* should be m:1 -- confirm. Rows that exist only in the names file carry no
* swoba and are removed before the export below.
merge m:m playerID using "lahman_names.dta"

* Flag 2024 rows whose first and last names are both missing after the merge
gen byte update_needed = (yearID == 2024 & missing(nameFirst) & missing(nameLast))

* Split the playerID variable into words (names are taken from up to 3 words)
split playerID, parse(" ") gen(word)

* split only creates as many word# variables as it finds pieces; make sure
* the three used below exist
forvalues k = 1/3 {
    capture confirm variable word`k'
    if _rc {
        gen word`k' = ""
    }
}

* Fill nameFirst and nameLast from the split pieces for the flagged rows:
* first word -> first name; second (and third, when present) -> last name
replace nameFirst = word1 if update_needed
replace nameLast = word2 if missing(word3) & update_needed
replace nameLast = word2 + " " + word3 if !missing(word3) & update_needed

* dump extraneous variables
keep nameFirst nameLast yearID lgID swoba z_swoba swoba_plus
drop if swoba == .

* NOTE(review): machine-specific absolute output path, left unchanged.
export excel using "C:\Users\dmk38\Documents\x5\swoba_report.xls", firstrow(variables) replace