* ---------------------------------------------------------------------------
* Part 1: load the Lahman batting table, remove pitchers, aggregate stints to
* one row per player-season-league, and compute each player-season batting
* average (BA) together with the AB-weighted season mean and SD of BA.
* ---------------------------------------------------------------------------
clear
clear matrix
// NOTE(review): legacy resource settings kept from the original script;
// recent Stata releases reportedly ignore them -- confirm before removing.
set matsize 800
set mem 500m

// Data directory. The original used "cd ///" followed by a note; "///" is
// Stata's line-join marker, so the next command (use ...) was pulled into the
// cd argument. Set the path here instead.
local data_dir "."    // <-- your data directory here
cd "`data_dir'"
use lahman_batting_2023.dta    // assumes file converted to .dta

* drop pitchers
// _merge==1 keeps rows found only in the batting file, i.e. players that are
// absent from the pitcher list; Ruth is retained explicitly.
merge m:1 playerID using "bb_ref_lahman_pitcher_names_match.dta", update replace    // assumes file converted to .dta
keep if _merge==1 | playerID=="ruthba01"    // drop all pitchers except Ruth

// drop seasons prior to 1900
drop if yearID < 1900

// drop non AL/NL
keep if inlist(lgID, "AL", "NL")

// rename doubles and triples
// NOTE(review): assumes the converted .dta stores doubles as B and triples as L -- confirm.
rename B b2
rename L b3

// rename other variables to conform to previous version of database
rename (G AB R H HR RBI SB CS BB SO IBB HBP SH SF) (g ab r h hr rbi sb cs bb so ibb hbp sh sf)
rename yearID yearid
rename playerID playerid
rename lgID lgid

// aggregate players' season performance variables over multiple stints
// (collapse (sum) turns all-missing counts into 0)
collapse (sum) g ab r h b2 b3 hr rbi sb cs bb so ibb hbp sh sf, by(playerid yearid lgid)
drop if ab == 0    // BA is undefined without an at-bat

// Calculate the batting average BA for each player-year combination
gen BA = h/ab

// calculate weighted mean and SD for each season's BA irrespective of league
// (weights are at-bats; both leagues are pooled within a season)
generate weighted_BA_product = BA * ab
egen total_weighted_BA = total(weighted_BA_product), by(yearid)
egen total_ab = total(ab), by(yearid)
generate weighted_mean_BA = total_weighted_BA / total_ab
generate squared_diff = (BA - weighted_mean_BA)^2
egen total_squared_diff = total(squared_diff * ab), by(yearid)
generate weighted_variance_BA = (total_squared_diff / (total_ab - 1))
generate weighted_sd_BA = sqrt(weighted_variance_BA)

// Calculate the PA (Plate Appearances) as the sum of the relevant totals.
// Hit-by-pitch counts as a plate appearance; the original sum left hbp out,
// understating PA for the PA-based eligibility thresholds.
gen PA = ab + bb + hbp + sh + sf

// adjusted Gwynn 96: Gwynn was batting title eligible under rule that adds hitless ABs
// NOTE(review): BA and PA are computed above, so this only changes the stored
// ab value for that row, not his BA or PA -- confirm that is intended, and
// confirm the value 495.
replace ab = 495 if (yearid == 1996 & playerid == "gwynnto01")

// Determine batting title eligibility: include title winners recognized on ad hoc criteria....
* ---------------------------------------------------------------------------
* Part 2: flag batting-title-eligible player-seasons, standardize BA against
* the season's weighted mean/SD, recenter, rescale to 0-100 and to the BA+
* scale, then rank hitters by BA within each season-league.
* ---------------------------------------------------------------------------
gen ba_elig = 0

// Era rules plus named exceptions. The first rule now runs through 1949: the
// original tested yearid < 1949 while the next rule starts at 1950, so no
// 1949 player could qualify and both 1949 title winners were dropped.
replace ba_elig = 1 if (yearid < 1950 & g > 99) | ///
    (yearid >= 1950 & yearid <= 1956 & ab > 399) | ///
    (yearid >= 1957 & yearid <= 1960 & PA > 477) | ///
    (yearid > 1960 & PA > 502) | ///
    (playerid == "hargrbu01" & yearid == 1926) | ///
    (playerid == "lombaer01" & yearid == 1942) | ///
    (playerid == "madlobi01" & yearid == 1981) | ///
    (yearid == 1996 & playerid == "gwynnto01") | ///
    (yearid == 1994 & PA > 345)

// Season-specific PA thresholds; these only ever add eligible rows.
replace ba_elig = 1 if inlist(yearid, 1918, 1919) & PA > 419
replace ba_elig = 1 if (yearid == 1972 & PA > 483)
// The original also had a second 1981 rule (PA > 356); every row it matches
// already satisfies PA > 320, so it had no effect and is omitted.
replace ba_elig = 1 if (yearid == 1981 & PA > 320)
replace ba_elig = 1 if (yearid == 2020 & PA > 185)

// Calculate z-score for eligible players
gen ba_zscore = .
replace ba_zscore = (BA - weighted_mean_BA) / weighted_sd_BA if ba_elig == 1

// delete batters not eligible for batting title
drop if ba_elig == 0

// recenter ba_zscore: I think this makes sense since mean z-score derived from weighted data drifts from zero
* Step 1: Calculate the mean of ba_zscore
summarize ba_zscore
local mean_ba_zscore = r(mean)

* Step 2: Center ba_zscore at 0
replace ba_zscore = ba_zscore - `mean_ba_zscore'

// normalize to 0 to 100 scale: an alternative way to normalize data w/ zero as lowest & 100 as highest
// should be used in any event to enable generation of BA+ scale
su ba_zscore
gen normal_z_ba = ((ba_zscore - r(min)) / (r(max) - r(min))) * 100

// normalize to BA+ scale (100 = mean of normal_z_ba among eligible hitters)
su normal_z_ba
gen ba_plus = 100 * (normal_z_ba / r(mean))

// shrink data set to title winners
* Step 1: Sort the data by yearid, lgid, and BA in descending order
gsort yearid lgid -BA

* Step 2: Generate a ranking variable (1 = highest BA in that season-league)
by yearid lgid: gen rank = _n

** Step sqrt(5) substitute Snuffy Stirnweiss for lazorjo01: the criteria for ba title eligibility seemed malleable...
// Johnny Lazor should have won given then-prevailing 100 game played criterion;
// the 1945 title is nevertheless assigned to Stirnweiss and Lazor is removed.
// (The original wrote "year"; spelled out as yearid so it does not depend on
// variable-name abbreviation.)
replace rank = 1 if playerid == "stirnsn01" & yearid == 1945
drop if playerid == "lazorjo01"

* Step 3: Filter the data to keep only the top BA value for each yearid and lgid
keep if rank == 1

// merge 2024 patch
// After the filter above there is one row per season-league, so the master is
// unique on these keys. 1:1 replaces the original m:m.
// NOTE(review): assumes the patch file is also unique on these keys; the merge
// now stops with an error if it is not -- confirm.
merge 1:1 yearid playerid lgid BA using "standardized_ba_2024_patch.dta"
drop _merge

* add lahman names
// Many title rows can share one playerID (repeat winners), so this is a
// many-to-one lookup. m:1 replaces the original m:m.
// NOTE(review): assumes lahman_names.dta has one row per playerID -- confirm.
rename playerid playerID
merge m:1 playerID using "lahman_names.dta"
drop if ba_plus == .    // discards name rows that matched no title winner

// fix 2024 batting title names
* ID observations where the season is 2024 and nameFirst and nameLast are missing
// The season variable is yearid at this point; the original referenced yearID.
gen byte update_needed = (yearid == 2024 & missing(nameFirst) & missing(nameLast))

* Split the playerID variable into words
// presumably the 2024 patch stores "First Last [Suffix]" in playerID -- verify.
split playerID, parse(" ") gen(word)

// split only creates as many word# variables as the longest value needs;
// create empty ones so the statements below never hit a missing variable.
foreach w in word2 word3 {
    capture confirm variable `w'
    if _rc {
        gen str1 `w' = ""
    }
}

* Update nameFirst and nameLast based on the split components
replace nameFirst = word1 if update_needed
replace nameLast = word2 if missing(word3) & update_needed
replace nameLast = word2 + " " + word3 if !missing(word3) & update_needed

* eliminate temporary variables
drop word1 word2 word3 update_needed

// generate probability density plot for title-winners
kdensity ba_plus, yscale(off) recast(area) xtitle("") xlabel(120(10)200, nogrid) scale(1) ///
    title("") graphregion(color(white)) lcolor(black) fcolor(white) ylabel(, nogrid noticks nolabels) xlabel(, nogrid) ///
    lwidth(medium) xscale(range(120 200)) lpattern(solid) ///
    legend(off) note("")

// export results to excel file
// The original had an unquoted placeholder after "using", which is not valid
// syntax; set the desired file name here.
local out_file "ba_plus_title_winners.xlsx"    // <-- desired file name here
export excel using "`out_file'", firstrow(variables) replace