clear clear matrix set matsize 800 set mem 500m cd <<< enter your working directory>>> use <<< lahman_batting_2023 excel file reformatted as .dta>>> ///rename some variables rename AB ab rename R r rename H h rename B b2 rename L b3 rename HR hr rename RBI rbi rename SB sb rename CS cs rename BB bb rename IBB ibb rename HBP hbp rename SH sh rename SF sf rename G g rename SO so * drop pitchers merge m:1 playerID using <> converted to .dta, update replace keep if _merge==1 | playerID=="ruthba01" // drop all pitchers except Ruth // drop seasons prior to 1900 drop if yearID<1900 //drop non AL/NL drop if lgID != "AL" & lgID != "NL" // aggregate players season performances variables over multiple stints collapse (sum) g ab r h b2 b3 hr rbi sb cs bb so ibb hbp sh sf, by(playerID yearID lgID) drop if ab==0 *** rename for consistency rename yearID yearid rename playerID playerid ** eliminate select steroid users drop if (playerid=="bondsba01" & yearid > 1996) drop if (playerid == "mcgwima01") drop if (playerid == "sosasa01") drop if (playerid == "cansejo01") drop if (playerid == "giambja01") drop if (playerid == "palmera01") drop if (playerid == "rodrial01") *** // Calculate the slg obp & ops for each player-year combination gen slg=(((h-(b2+b3+hr))+2*b2+3*b3+4*hr))/ab gen obp=(h+bb+hbp)/(ab+bb+hbp+sf) gen ops=slg+obp /// add 2024 patch merge m:m yearid playerid lgID slg ops obp using <<< 2024_standardized_prod_patch.xlsx converted to .dta>>> drop if ab ==. drop if ab==0 drop _merge /// calculate weighted mean and SD for each season's slg opb ops irrespective of league egen total_ab = sum(ab), by(yearid) generate weighted_slg_product = slg * ab egen total_weighted_slg = sum(weighted_slg_product), by(yearid) generate weighted_mean_slg = total_weighted_slg / total_ab generate squared_diff_slg = (slg - weighted_mean_slg)^2 egen total_squared_diff_slg = sum(squared_diff_slg * ab), by(yearid) generate weighted_variance_slg = (total_squared_diff_slg / (total_ab - 1)) generate weighted_sd_slg = sqrt(weighted_variance_slg) generate weighted_obp_product = obp * ab egen total_weighted_obp = sum(weighted_obp_product), by(yearid) generate weighted_mean_obp = total_weighted_obp / total_ab generate squared_diff_obp = (obp - weighted_mean_obp)^2 egen total_squared_diff_obp = sum(squared_diff_obp * ab), by(yearid) generate weighted_variance_obp = (total_squared_diff_obp / (total_ab - 1)) generate weighted_sd_obp = sqrt(weighted_variance_obp) generate weighted_ops_product = ops * ab egen total_weighted_ops = sum(weighted_ops_product), by(yearid) generate weighted_mean_ops = total_weighted_ops / total_ab generate squared_diff_ops = (ops - weighted_mean_ops)^2 egen total_squared_diff_ops = sum(squared_diff_ops * ab), by(yearid) generate weighted_variance_ops = (total_squared_diff_ops / (total_ab - 1)) generate weighted_sd_ops = sqrt(weighted_variance_ops) // Calculate the PA (Plate Appearances) as the sum of the relevant totals gen PA = ab+bb+sh+sf+hbp // adjusted Gwynn 96 replace ab=495 if (yearid==1996 & playerid=="gwynnto01") // Determine batting title eligibility including ad hoc adjustments made by MLB... gen ba_elig = 0 replace ba_elig = 1 if (yearid < 1949 & g > 99) | /// (yearid >= 1950 & yearid <= 1956 & ab > 399) | /// (yearid >= 1957 & yearid <= 1960 & PA > 477) | /// (yearid > 1960 & PA > 502) | /// (playerid == "hargrbu01" & yearid == 1926) | /// (playerid == "lombaer01" & yearid == 1942) | /// (playerid == "madlobi01" & yearid == 1981) | /// (yearid==1996 & playerid=="gwynnto01") | /// (yearid==1969 & playerid=="carewro01") replace ba_elig = 1 if (yearid == 1918 & PA > 419 | yearid ==1919 & PA > 419) replace ba_elig = 1 if (yearid == 1972 & PA > 483) replace ba_elig = 1 if (yearid == 1981 & PA > 320 ) replace ba_elig = 1 if (yearid == 1981 & PA > 356 ) replace ba_elig=1 if (yearid==1994 & PA >356) replace ba_elig = 1 if (yearid== 2020 & PA > 185) // Calculate z-scores gen slg_zscore = . gen ops_zscore = . gen obp_zscore = . replace slg_zscore = (slg - weighted_mean_slg) / weighted_sd_slg if ba_elig == 1 replace obp_zscore = (obp - weighted_mean_obp) / weighted_sd_obp if ba_elig == 1 replace ops_zscore = (ops - weighted_mean_ops) / weighted_sd_ops if ba_elig == 1 //delete batters not eligible for batting title drop if ba_elig == 0 /// center z_scores at 0 su slg_zscore replace slg_zscore=slg_zscore - r(mean) su obp_zscore replace obp_zscore=obp_zscore - r(mean) su ops_zscore replace ops_zscore=ops_zscore - r(mean) // rescale values from 0 to 100 su ops_zscore,d gen normal_zops=((ops_zscore-r(min))/(r(max)-r(min)))*100 /// sops+ su normal_zops gen sops_plus = 100*(normal_zops/r(mean)) rename (playerid yearid) (playerID yearID) merge m:m playerID using <> /// use bbref code for player names * Keep only the relevant observations where yearID is 2024 and nameFirst and nameLast are missing gen byte update_needed = (yearID == 2024 & missing(nameFirst) & missing(nameLast)) * Split the playerID variable into words (assuming a maximum of 3 words) split playerID, parse(" ") gen(word) * Update nameFirst and nameLast based on the split components for the relevant cases replace nameFirst = word1 if update_needed replace nameLast = word2 if !missing(word3) & update_needed replace nameLast = word2 + " " + word3 if !missing(word3) & update_needed replace nameLast = word2 if missing(word3) & update_needed * Clean up temporary variables drop word1 word2 word3 update_needed *** drop extraneous variables keep yearID lgID slg obp ops ops_zscore sops_plus nameFirst nameLast drop if yearID ==. // export export excel using "[choose file name]", firstrow(variables) replace