* ===========================================================================
* Career FIPR+ for pitchers
*
* Pipeline:
*   1. Load season pitching lines, sum them per pitcher/season/team/league.
*   2. Attach player names and the 2024 pitcher file.
*   3. Per season, regress runs allowed per 9 IP on K, BB, HR and HBP per
*      inning (weighted by IP); the fitted value is the season "fipr".
*   4. Standardize fipr within season (IP-weighted mean and population SD),
*      trim |z| > 3.25, re-standardize and flip the sign so that fewer
*      fitted runs gives a higher score.
*   5. Build IP-weighted career averages per pitcher, keep pitchers with at
*      least 1000 career IP, rescale to a mean-100 index and export a ranked
*      table to "standardized_career_fipr.xls".
* ===========================================================================

clear
clear matrix
* Legacy resource settings, retained for older Stata versions.
set matsize 800
set mem 500m

* Placeholder: edit to point at the working directory before running.
cd [your partition

use lanham_pitching.dta
drop if yearID < 1900

* --- back out hitter ABs ---------------------------------------------------
* 9.99 is treated as a "not available" code for opponents' batting average.
* Compare through float() so the test also matches when BAOpp is stored as
* float (a float 9.99 is not equal to the double literal 9.99).
replace BAOpp = . if float(BAOpp) == float(9.99)
gen abf = H/BAOpp

* --- sum data per pitcher / season / team / league -------------------------
collapse (sum) W L G GS CG SHO SV H ER HR BB SO IBB WP HBP BK BFP GF R GIDP IPouts abf, ///
    by(playerID yearID teamID lgID)

* Innings pitched from recorded outs.
gen IP = IPouts/3

* --- attach player names ---------------------------------------------------
* m:1 instead of m:m: every pitching row should pick up exactly one name row.
* NOTE(review): assumes the names file has one row per playerID; if it does
* not, this merge now stops with an error instead of pairing rows arbitrarily.
merge m:1 playerID using "lahman names data"
drop _merge
drop if yearID < 1900

* --- 2024 pitcher data -----------------------------------------------------
* NOTE(review): left as m:m because the uniqueness of (bbrefID, yearID) on
* either side cannot be confirmed from this script; tighten to 1:1 / m:1 /
* 1:m once the key structure of both files has been checked.
merge m:m bbrefID yearID using "bbr 2024 pitcher data", update
drop _merge

drop if missing(yearID)
drop if missing(IP)
drop if missing(SO)

* --- performance values as per-inning rates --------------------------------
gen hr_pip  = HR/IP
gen k_pip   = SO/IP
gen bb_pip  = BB/IP
gen hbp_pip = HBP/IP
gen rapg    = 9*R/IP

* --- calculate fipr: one IP-weighted regression per season -----------------
gen fipr = .
* Loop over the seasons actually present so that an empty year cannot abort
* the regression and later seasons are handled without editing the script.
levelsof yearID, local(seasons)
foreach y of local seasons {
    di "year `y'"
    regress rapg k_pip bb_pip hr_pip hbp_pip [iweight=IP] if yearID == `y'
    predict yhat if yearID == `y'
    replace fipr = yhat if yearID == `y'
    drop yhat
}

* ===========================================================================
* Standardize season fiprs: IP-weighted mean and population SD by season
* ===========================================================================

* 1. Season IP total, counting only rows that have a fipr, so the denominator
*    matches the rows that contribute to the numerators below.
egen double yr_ip = total(cond(missing(fipr), 0, IP)), by(yearID)

* 2. IP-weighted mean of fipr.
egen double yr_fipr_x_ip = total(fipr * IP), by(yearID)
gen double yr_wmean_fipr = yr_fipr_x_ip / yr_ip

* 3. IP-weighted sum of squared deviations.
egen double yr_wssd = total((fipr - yr_wmean_fipr)^2 * IP), by(yearID)

* 4-5. Population variance (divide by total weight) and its square root.
gen double yr_wsd_fipr = sqrt(yr_wssd / yr_ip)

* 6. Season z-score, then clean up intermediate variables.
gen z_fipr = (fipr - yr_wmean_fipr) / yr_wsd_fipr
drop yr_ip yr_fipr_x_ip yr_wssd

* --- drop outliers ---------------------------------------------------------
* Quartiles and IQR fences are computed for reference; the active rule is the
* fixed +/-3.25 cut below.
summarize z_fipr, detail
scalar iqr = r(p75) - r(p25)
scalar lower_bound = r(p25) - 1.5 * iqr
scalar upper_bound = r(p75) + 1.5 * iqr
// replace z_fipr = . if z_fipr < lower_bound | z_fipr > upper_bound
replace z_fipr = . if z_fipr < -3.25 | z_fipr > 3.25

* Re-standardize after trimming and flip the sign so that lower fitted runs
* allowed maps to a higher score.
* NOTE(review): zscore is not a built-in command; it must be installed and is
* presumed to create z_z_fipr as (x - mean)/sd - confirm.
zscore z_fipr
replace z_fipr = z_z_fipr * -1
summarize z_fipr, detail

* ===========================================================================
* Career fipr plus: weight each season by season IP / career IP
* ===========================================================================

* 1. Career IP (all seasons) - used for the 1000 IP cutoff and the report.
egen double total_ip = total(IP), by(playerID)

* 2. IP-weighted career mean of z_fipr. The denominator only counts IP from
*    seasons whose z_fipr survived trimming, matching the numerator.
egen double career_ip_z   = total(cond(missing(z_fipr), 0, IP)), by(playerID)
egen double career_z_x_ip = total(z_fipr * IP), by(playerID)
gen double z_fipr_avg = career_z_x_ip / career_ip_z

* 3. IP-weighted career mean of fipr (this is the exported career_fipr).
egen double career_ip_f   = total(cond(missing(fipr), 0, IP)), by(playerID)
egen double career_f_x_ip = total(fipr * IP), by(playerID)
gen double career_fipr = career_f_x_ip / career_ip_f

drop if total_ip < 1000

* One row per pitcher. playerID is part of the key so that two different
* pitchers who share a first and last name are not averaged together; the
* collapsed variables are constant within playerID, so the mean is a no-op.
* NOTE(review): rows with an empty playerID are still grouped as one pitcher
* by the career sums above - confirm the 2024 file carries playerID.
collapse (mean) career_fipr z_fipr_avg total_ip, by(playerID nameLast nameFirst)

gsort -z_fipr_avg

* --- normalize to a mean-100 "plus" scale ----------------------------------
* Min-max rescale to 0-100, then divide by the mean so the average is 100.
summarize z_fipr_avg, detail
gen fipr_plus_norm = 100 * (z_fipr_avg - r(min)) / (r(max) - r(min))
summarize fipr_plus_norm, detail
gen fipr_plus = (fipr_plus_norm / r(mean)) * 100

egen rank_fipr_plus_desc = rank(-fipr_plus)
sort rank_fipr_plus_desc

format career_fipr %9.2f
format total_ip %9.0f
format fipr_plus %9.0f
rename total_ip IP

export excel rank_fipr_plus_desc nameFirst nameLast IP career_fipr fipr_plus ///
    using "standardized_career_fipr.xls", firstrow(variables) replace