* =============================================================================
* PART 1 - Pitcher-season level: regression-based FIP ("fipr"), IP-weighted
*          season z-scores, rankings, and export.
*
* Placeholders in square brackets / plain words (cd target, file names) are
* kept exactly as the author left them and must be replaced before running.
* NOTE(review): every merge below is m:m. The Stata manual advises against
* m:m merges; confirm the key structure of each using file and switch to
* 1:1 / m:1 where possible.
* =============================================================================
clear
clear matrix
set matsize 800
set mem 500m
cd [your directory partition]
use lahman pitching

** back out hitter ABs
* drop if year <1911
* 9.99 is treated as a "no value" sentinel for BAOpp. Compare at float
* precision: if BAOpp is stored as float, a bare `== 9.99` never matches.
replace BAOpp = . if float(BAOpp) == float(9.99)
gen abf = H/BAOpp
drop if abf == .

// sum data per pitcher (one row per player / season / team / league)
collapse (sum) W L G GS CG SHO SV H ER HR BB SO IBB WP HBP BK BFP GF R GIDP IPouts abf, ///
    by(playerID yearID teamID lgID)

// IP
gen IP = IPouts/3
drop if IP < 100

// merge player names
merge m:m playerID using [lahman names], update
drop _merge

// merge 2024 pitcher data; Player is created empty so that -update- can fill it
gen Player = ""
merge m:m bbrefID yearID using bb_ref_2024_FIP_elements, update
drop _merge
drop if yearID == .
rename Player player

// merge bb_ref all-time WAR
merge m:m bbrefID yearID using bbr_single_season_pitcher_war_100IP, update
drop _merge
drop if lgID != "AL" & lgID != "NL"

// fix names
* Flag the observations where nameFirst and nameLast are both missing
gen byte update_needed = (missing(nameFirst) & missing(nameLast))
* Split the player variable into words (assuming a maximum of 3 words)
split player, parse(" ") gen(word)
* -split- only creates as many variables as the longest value needs; make sure
* word1-word3 all exist so the statements below cannot fail on a missing variable
forvalues j = 1/3 {
    capture confirm variable word`j', exact
    if _rc {
        gen str1 word`j' = ""
    }
}
* Fill nameFirst / nameLast from the split components for the flagged rows
replace nameFirst = word1 if update_needed
replace nameLast  = word2 + " " + word3 if !missing(word3) & update_needed
replace nameLast  = word2               if  missing(word3) & update_needed
drop word1 word2 word3 update_needed

// author's deliberate exclusion of one player
drop if playerID == "clemero02"
drop if yearID < 1911

// condense to one row per player-season-league
collapse (sum) SO HR IP R BB IPouts HBP war (mean) bbr_fip, ///
    by(nameFirst nameLast bbrefID playerID yearID lgID)

*** determine performance values as rates
gen k_pip   = SO/IP
gen bb_pip  = BB/IP
gen hr_pip  = HR/IP
gen hbp_pip = HBP/IP
gen rapg    = R/IP * 9
gen hr9     = 9*HR/IP
gen bb9     = 9*BB/IP
gen k9      = 9*SO/IP

// generate FIPr: per-season IP-weighted regression of runs allowed per 9 on the
// component rates (exclude 2020 to avoid distortion of SD due to short season)
gen fipr = .
forvalues season = 1911/2024 {
    if `season' == 2020 {
        continue
    }
    di `season'
    regress rapg k_pip bb_pip hr_pip hbp_pip [pweight=IP] if yearID == `season'
    predict pfipr if yearID == `season'
    replace fipr = pfipr if yearID == `season'
    drop pfipr
}
drop if yearID == 2020

***********************************
* Population-Weighted Variance Code
***********************************
* For each metric v, within each season:
*   wvar_v   = IP-weighted mean of v   (name kept from the original script,
*              although it holds a mean, not a variance)
*   popvar_v = IP-weighted population variance of v
*   wsd_v    = sqrt(popvar_v)
* The weight total only counts rows where v is non-missing, so a season with
* missing bbr_fip values is not biased toward zero.
foreach v in fipr bbr_fip {
    tempvar w wsum wx wxsum dev2 dev2sum
    generate double `w'         = IP if !missing(`v')
    egen double `wsum'          = total(`w'), by(yearID)
    generate double `wx'        = `v' * IP
    egen double `wxsum'         = total(`wx'), by(yearID)
    generate double wvar_`v'    = `wxsum' / `wsum'
    generate double `dev2'      = (`v' - wvar_`v')^2 * IP
    egen double `dev2sum'       = total(`dev2'), by(yearID)
    generate double popvar_`v'  = `dev2sum' / `wsum'
    generate double wsd_`v'     = sqrt(popvar_`v')
    drop `w' `wsum' `wx' `wxsum' `dev2' `dev2sum'
}

// Calculate z scores for fipr
gen z_fipr = (fipr - wvar_fipr)/wsd_fipr
// Quartiles and IQR fences (only used by the disabled rule below)
summarize z_fipr, detail
scalar iqr = r(p75) - r(p25)
scalar lower_bound = r(p25) - 1.5 * iqr
scalar upper_bound = r(p75) + 1.5 * iqr
// Replace outliers with missing values
// replace z_fipr = . if z_fipr < lower_bound | z_fipr > upper_bound
replace z_fipr = . if z_fipr < -4 | z_fipr > 4
// NOTE(review): -zscore- is not a built-in command; presumably the
// community-contributed package, which creates z_<varname>. Confirm it is installed.
zscore z_fipr
// flip the sign so that higher = better (lower runs allowed)
replace z_fipr = z_z_fipr * -1

// Calculate z scores for Baseball-Reference FIP
gen z_bbref_fip = (bbr_fip - wvar_bbr_fip)/wsd_bbr_fip
// Quartiles and IQR fences (only used by the disabled rule below)
summarize z_bbref_fip, detail
scalar iqr = r(p75) - r(p25)
scalar lower_bound = r(p25) - 1.5 * iqr
scalar upper_bound = r(p75) + 1.5 * iqr
// Replace outliers with missing values
// replace z_bbref_fip = . if z_bbref_fip < lower_bound | z_bbref_fip > upper_bound
replace z_bbref_fip = . if z_bbref_fip < -4.2 | z_bbref_fip > 4.2
zscore z_bbref_fip
replace z_bbref_fip = z_z_bbref_fip * -1

// plot weighted mean & sd of fipr over time
twoway ///
    (scatter wvar_fipr yearID, mcolor(ltblue)) ///
    (lpoly wvar_fipr yearID, ///
        bw(3) /// <-- bandwidth (adjust as needed)
        lcolor(black) lwidth(medthick)) ///
    , ///
    xlabel(1910(10)2024, nogrid format(%02.0f) angle(45)) ///
    ylabel(3(.5)5, nogrid) ///
    ytitle("") ///
    xtitle("Season") ///
    graphregion(color(white)) ///
    plotregion(style(none)) ///
    xscale(r(1910 2024)) ///
    legend(off)

twoway ///
    (scatter wsd_fipr yearID, mcolor(ltblue)) ///
    (lpoly wsd_fipr yearID, ///
        bw(3) /// <-- bandwidth (adjust as needed)
        lcolor(black) lwidth(medthick)) ///
    , ///
    xlabel(1910(10)2024, nogrid format(%02.0f) angle(45)) ///
    ylabel(, nogrid) ///
    ytitle("") ///
    xtitle("Season") ///
    graphregion(color(white)) ///
    plotregion(style(none)) ///
    xscale(r(1910 2024)) ///
    legend(off)

// merge fg war
rename bbrefID bbrefid
merge m:m bbrefid yearID using fg_pitcher_war_100IP, update
drop _merge

// author's deliberate exclusion of one player (repeated because the merge can re-add rows)
drop if bbrefid == "clemero02"
drop if yearID < 1911

// filter to ERA eligible: 154 IP through 1960, 162 IP from 1961 on
drop if IP < 154 & yearID < 1961
drop if IP < 162 & yearID > 1960
drop if lgID == ""

// condense (default statistic: mean)
collapse IP fipr z_fipr z_bbref_fip war bbr_fip fg_war, ///
    by(nameFirst nameLast bbrefid playerID yearID lgID fg_id)
rename war bbr_war
keep nameFirst nameLast playerID yearID lgID IP fipr z_fipr z_bbref_fip bbrefid bbr_war bbr_fip fg_war fg_id

// gen ranking codes (negated so that rank 1 = largest value)
* Rank z_fipr in descending order
egen rank_z_fipr = rank(-z_fipr)
* Rank fg_war in descending order
egen rank_fg_war = rank(-fg_war)
* Rank z_bbref_fip in descending order
egen rank_z_bbref_fip = rank(-z_bbref_fip)
* Rank bbr_war in descending order
egen rank_bbr_war = rank(-bbr_war)
sort rank_z_fipr

// export file
export excel rank_z_fipr rank_z_bbref_fip rank_bbr_war rank_fg_war nameFirst nameLast yearID lgID IP fipr z_fipr z_bbref_fip bbr_fip bbr_war fg_war bbrefid fg_id playerID using chosen file name, ///
    firstrow(variables) replace

pwcorr z_fipr fg_war bbr_war z_bbref_fip

* =============================================================================
* PART 2 - Team-season level: team FIPr, within-season standardisation, and
*          year-by-year R-squared of runs allowed per game on each metric.
* =============================================================================
clear
clear matrix
set matsize 800
set mem 500m
cd your directory
use Lahaman team data

// add opponent hit by pitch data extracted from individ pitcher totals
merge m:m yearID teamID using Lahman_opp_hbp
// drop non AL/NL
drop if lgID != "AL" & lgID != "NL"
rename oHBP HBPA
drop _merge

// merge FIP & 2024 team data
merge m:m yearID teamID using bb_ref_2024_patch, update
drop _merge

// runs per game & runs allowed per game (27 outs per game)
gen rpg  = (R/IPouts)*27
gen rapg = (RA/IPouts)*27

** standardize within season
foreach v in rapg RA {
    bysort yearID: egen mean_`v' = mean(`v')
    bysort yearID: egen sd_`v'   = sd(`v')
    gen z_`v' = (`v' - mean_`v') / sd_`v'
}

*** fipr
replace IP = IPouts/3 if IP == .
gen k_pip   = SOA/IP
gen fipr    = .
gen bb_pip  = BBA/IP
gen hr_pip  = HRA/IP
gen hbp_pip = HBPA/IP
// Per-season regression of runs allowed per game on the component rates.
// bb_pip is included so that the team model uses the same four components as
// the pitcher-level model in Part 1 (it was generated but left out originally).
foreach i of numlist 1911/2024 {
    di `i'
    regress rapg k_pip bb_pip hr_pip hbp_pip if yearID == `i'
    predict prfipr
    replace fipr = prfipr if yearID == `i'
    drop prfipr
}

** standardize within season
foreach v in fipr k_pip bb_pip hr_pip hbp_pip {
    bysort yearID: egen mean_`v' = mean(`v')
    bysort yearID: egen sd_`v'   = sd(`v')
    gen z_`v' = (`v' - mean_`v') / sd_`v'
}

// merge fg wars
rename teamID teamid
merge m:m yearID teamid using fg_team_wars
drop _merge
drop if lgID == ""
drop if yearID < 1900

// standardize fgpwar
bysort yearID: egen mean_fgpwar = mean(fgpwar)
bysort yearID: egen sd_fgpwar   = sd(fgpwar)
gen z_fgpwar = (fgpwar - mean_fgpwar) / sd_fgpwar

// merge BBREF pwar
merge m:m yearID teamid using bbref_team_pitching_wars
drop _merge

// standardize pwar
bysort yearID: egen mean_pwar = mean(pwar)
bysort yearID: egen sd_pwar   = sd(pwar)
gen z_pwar = (pwar - mean_pwar) / sd_pwar

// pwar per game
gen pwpg = pwar/G
bysort yearID: egen mean_pwpg = mean(pwpg)
bysort yearID: egen sd_pwpg   = sd(pwpg)
gen z_pwpg = (pwpg - mean_pwpg) / sd_pwpg

// fg war per game
gen fgpwpg = fgpwar/G

// standardize FIP
bysort yearID: egen mean_FIP = mean(FIP)
bysort yearID: egen sd_FIP   = sd(FIP)
gen z_FIP = (FIP - mean_FIP) / sd_FIP

// examine relative explanatory power of pitching proficiency metrics over modern era:
// for every season, store the R-squared of rapg regressed on each metric
gen Ri_FIP    = .
gen Ri_pwpg   = .
gen Ri_fgpwpg = .
gen Ri_fipr   = .
foreach yr of numlist 1911/2024 {
    foreach m in FIP pwpg fgpwpg fipr {
        regress rapg `m' if yearID == `yr'
        replace Ri_`m' = e(r2) if yearID == `yr'
    }
}

// scale for % variance explained
gen Ri_FIP_100    = Ri_FIP*100
gen Ri_pwpg_100   = Ri_pwpg*100
gen Ri_fgpwpg_100 = Ri_fgpwpg*100
gen Ri_fipr_100   = Ri_fipr*100

// smoothed R-squared over time: shaded CI band plus dashed line per metric
twoway ///
    (lpolyci Ri_fgpwpg_100 yearID, fcolor(gs12%35) alcolor(gs12%1) lwidth(none) bwidth(10)) /// gray CI band: fg WAR per game
    (lpoly Ri_fgpwpg_100 yearID, lcolor(black) lpattern(shortdash) lwidth(thin) bwidth(10)) /// black dashed line: fg WAR per game
    (lpolyci Ri_FIP_100 yearID, fcolor(green%35) alcolor(green%1) lwidth(none) bwidth(10)) /// green CI band: FIP
    (lpoly Ri_FIP_100 yearID, lcolor(green) lpattern(shortdash) lwidth(thin) bwidth(10)) /// green dashed line: FIP
    (lpolyci Ri_fipr_100 yearID, fcolor(blue%35) alcolor(blue%1) lwidth(none) bwidth(10)) /// blue CI band: fipr
    (lpoly Ri_fipr_100 yearID, lcolor(blue) lpattern(shortdash) lwidth(thin) bwidth(10)) /// blue dashed line: fipr
    (lpolyci Ri_pwpg_100 yearID, fcolor(red%35) alcolor(red%1) lwidth(none) bwidth(10)) /// red CI band: bbref pitching WAR per game
    (lpoly Ri_pwpg_100 yearID, lcolor(red) lpattern(shortdash) lwidth(thin) bwidth(10)) /// red dashed line: bbref pitching WAR per game
    , ///
    ytitle("R-squared (scaled x100)") ///
    xtitle("Year") ///
    ylabel(0 "0%" 10 "10%" 20 "20%" 30 "30%" 40 "40%" 50 "50%" 60 "60%" 70 "70%" 80 "80%" 90 "90%" 100 "100%", labsize(medium) nogrid) ///
    xlabel(1910(10)2024, angle(45) labsize(medium) nogrid) ///
    legend(off) ///
    graphregion(color(white)) ///
    plotregion(margin(zero))

// pooled regressions of standardized runs allowed on each standardized metric
regress z_rapg z_fipr
regress z_rapg z_FIP
regress z_rapg z_fgpwar
regress z_rapg z_pwar