clear clear matrix set matsize 800 set mem 500m cd [your drive partition] use lanham_pitching.dta drop if year <1911 collapse (sum) W L G GS CG SHO SV H ER HR BB SO IBB WP HBP BK BFP GF R GIDP IPouts , by(playerID yearID teamID lgID) // IP gen IP=IPouts/3 drop if IP <100 // merge 2024 pitcher merge m:m playerID using "lanham_names.dta",update drop _merge gen Player="" merge m:m bbrefID yearID using "bb_ref_2024_fipr_essentials.dta",update replace drop _merge drop if yearID==. rename Player player // drop steroid freak drop if playerID =="clemero02" drop if yearID <1911 collapse (sum) SO HR IP R BB IPouts HBP ,by (nameFirst nameLast bbrefID playerID yearID lgID) *** determine performance values as rates gen k_pip=SO/IP gen bb_pip=BB/IP gen hr_pip=HR/IP gen hbp_pip=HBP/IP gen rapg=R/IP *9 gen hr9=9*HR/IP gen bb9=9*BB/IP gen k9=9*SO/IP gen fipr=. local season=1911 while `season' <2020 { di `season' regress rapg k_pip bb_pip hr_pip hbp_pip [pweight=IP] if `season'==year predict pfipr if `season'==year replace fipr=pfipr if `season'==year drop pfipr local season = `season'+1 } local season=2021 while `season' <2025 { di `season' regress rapg k_pip bb_pip hr_pip hbp_pip [pweight=IP] if `season'==year predict pfipr if `season'==year replace fipr=pfipr if `season'==year drop pfipr local season = `season'+1 } drop if year ==2020 *********************************** * Population-Weighted Variance Code *********************************** * 1. Summarize ip within each year egen total_ip = sum(IP), by(yearID) * 2. Compute weighted mean of fipr generate double weighted_fipr_product = fipr * IP egen double total_weighted_fipr = sum(weighted_fipr_product), by(yearID) generate double weighted_mean_fipr = total_weighted_fipr / total_ip * 3. Compute weighted sum of squared deviations * (fipr - weighted_mean_fipr)^2 * ip generate double wssd = (fipr - weighted_mean_fipr)^2 * IP egen double total_wssd = sum(wssd), by(yearID) * 4. population variance: divide by total weight generate double popvar_fipr = total_wssd / total_ip * 5. Population weighted standard deviation generate double pfiprd_fipr = sqrt(popvar_fipr) drop wssd weighted_fipr_product total_weighted_fipr rename weighted_mean_fipr wvar_fipr rename pfiprd_fipr wsd_fipr drop total_wssd gen z_fipr = (fipr- wvar_fipr)/wsd_fipr // Compute quartiles and IQR summarize z_fipr, detail scalar iqr = r(p75) - r(p25) scalar lower_bound = r(p25) - 1.5 * iqr scalar upper_bound = r(p75) + 1.5 * iqr // Replace outliers with missing values // replace z_fipr = . if z_fipr < lower_bound | z_fipr > upper_bound replace z_fipr=. if z_fipr<-3.25 | z_fipr >3.25 zscore z_fipr replace z_fipr=z_z_fipr*-1 twoway (scatter wvar_fipr year, mcolor(blue)) /// (lfit wvar_fipr year, lcolor(black)) /// , xlabel(1910(10)2020, nogrid format(%02.0f) angle(45)) /// ylabel(3(.5)6, nogrid) /// ytitle("") xtitle("Season") /// graphregion(color(white)) /// plotregion(style(none)) /// xscale(r(1910 2020)) /// legend(off) twoway (scatter wsd_fipr year, mcolor(blue)) /// (lfit wsd_fipr year, lcolor(black)) /// , xlabel(1910(10)2020, nogrid format(%02.0f) angle(45)) /// ylabel(, nogrid) /// ytitle("") xtitle("Season") /// graphregion(color(white)) /// plotregion(style(none)) /// xscale(r(1910 2020)) /// legend(off) su z_fipr,d pwcorr year wsd_fipr wvar_fipr ***************************************************** * FIR_R2 ***************************************************** clear clear matrix set matsize 800 set mem 500m cd [your drive partition] use lahman_team_hitting_pitching.dta drop if yearID <1900 // merge 2024 season data from bb_reference merge m:m yearID teamID using "bb_ref_2024_patch.dta" drop _merge // runs per game & runs allowed per game gen rpg=(R/IPouts)*27 gen rapg=(RA/IPouts)*27 ***************************************************** * Single-season regression models ***************************************************** gen R21=. foreach yr of numlist 1900/2024 { // Adjust the range to your data di `yr' regress RA FIP if year == `yr' // Store R2 from the first model in R21 for the current year local r2_1 = e(r2) replace R21 = `r2_1' if year == `yr' } * Collapse to get mean R2 by year collapse (mean) R21 , by(year) * convert dataset to time series based on year sort year tsset year * * Multiply the y-axis variables by 100 for % scaling gen R21_scaled=R21*100 * Plot figure 1 * Plot with gray-filled area for R22 and dashed line for R21 twoway (lpoly R21_scaled year, lcolor(black) lpattern(longdash) lwidth(medium) bw(5)), /// // Black dashed lpoly line for smoothed R21 ytitle("R-squared (scaled x100)") /// xtitle("Year") /// ylabel(0(10)100, labsize(medium) nogrid) /// xlabel(1900(10)2024, angle(45) labsize(medium) nogrid) /// legend(off) /// graphregion(color(white)) /// plotregion(margin(zero))