clear clear matrix set matsize 800 set mem 500m cd [your disk partion] use lanham_pitching drop if year <1900 // trim data to necessaries collapse (sum) SO IPouts, by(playerID yearID teamID) // IP gen IP=IPouts/3 /// get Lahman names file for ... names and also integration of 2024 data merge m:m playerID using [lahman names file],update drop _merge /// bbref 2024 data merge m:m bbrefID yearID using [bbref 2024 data patch],update drop _merge drop if yearID==. /// threshold for calculating weighted means/sds drop if IP<100 drop if SO ==. *** determine performance values as rates gen k_9=9*SO/IP *********************************** * Population-Weighted Variance *********************************** * 1. Summarize IP within each year egen total_IP = sum(IP), by(year) * * 2. Compute weighted mean of k_9 generate double weighted_k_9_product = k_9 * IP egen double total_weighted_k_9 = sum(weighted_k_9_product), by(year) generate double weighted_mean_k_9 = total_weighted_k_9 / total_IP * 3. Compute weighted sum of squared deviations * (k_9 - weighted_mean_k_9)^2 * IP generate double wssd = (k_9 - weighted_mean_k_9)^2 * IP egen double total_wssd = sum(wssd), by(year) * 4. population variance: divide by total weight generate double popvar_k_9 = total_wssd / total_IP * 5. Population weighted standard deviation generate double pk_9d_k_9 = sqrt(popvar_k_9) drop wssd weighted_k_9_product total_weighted_k_9 rename weighted_mean_k_9 wvar_k_9 rename pk_9d_k_9 wsd_k_9 drop total_wssd *** zscore gen z_k_9 = (k_9-wvar_k_9)/wsd_k_9 *** remove outliers if you want to seriously evaluate weighted means & sds! // Compute quartiles and IQR * summarize z_k_9, detail * scalar iqr = r(p75) - r(p25) * scalar lower_bound = r(p25) - 1.5 * iqr * scalar upper_bound = r(p75) + 1.5 * iqr // Replace outliers with missing values * replace z_k_9 = . if z_k_9 < lower_bound | z_k_9 > upper_bound // restandardize weighted scores zscore z_k_9 replace z_k_9=z_z_k_9 // put names in for 2024 players * Split Player into first and last names split Player, parse(" ") gen(name_) * For two-word names, first is first name, last is last name replace nameFirst = name_1 if missing(nameFirst) * For last name, take everything after first word gen temp_last = subinstr(Player, name_1, "", 1) replace temp_last = trim(temp_last) // Remove leading/trailing spaces replace nameLast = temp_last if missing(nameLast) drop name_* temp_last /// no new variables /// drop if year == 2020 /// avoid biasing based on shortened season if looking at means & sd twoway /// (scatter wsd_k_9 year, mcolor(blue)) (lfit wsd_k_9 year, /// /// <-- bandwidth (adjust as needed) lcolor(black) lwidth(medthick)) /// , /// xlabel(1900(10)2024, nogrid format(%02.0f) angle(45)) /// ylabel(, nogrid) /// ytitle("weighted SD") /// xtitle("year") /// graphregion(color(white)) /// plotregion(style(none)) /// xscale(r(1900 2024)) /// legend(off) title("Pitcher Ks per IP") twoway /// (scatter wvar_k_9 year, mcolor(blue)) (lfit wvar_k_9 year, /// /// <-- bandwidth (adjust as needed) lcolor(black) lwidth(medthick)) /// , /// xlabel(1900(10)2024, nogrid format(%02.0f) angle(45)) /// ylabel(, nogrid) /// ytitle("weighted mean") /// xtitle("year") /// graphregion(color(white)) /// plotregion(style(none)) /// xscale(r(1900 2024)) /// legend(off) title("Pitcher Ks per IP") pwcorr year wsd_k_9 wvar_k_9 // limit to ERA qualifiers drop if IP <154 & year <1961 drop if IP < 162 & year > 1960 // create sk9plus score from 0 to 100 su z_k_9,d gen sk9_plus =(( z_k_9-r(min))/(r(max)-r(min)))*100 egen sk9_rank = rank(-z_k_9) /// generate a nice output file export excel sk9_rank nameFirst nameLast yearID SO k_9 sk9_plus z_k_9 using "[file location name]xls", /// firstrow(variables) replace