* ---------------------------------------------------------------------------
* Build the pitcher-season working file:
*   1. load the Lahman pitching table and back out batters' at-bats (abf)
*   2. sum the counting stats by player / year / team / league
*   3. merge player names, the batted-ball (BIP) files and the 2024 BBR data
* Bracketed file names and the cd target are placeholders to be filled in.
* ---------------------------------------------------------------------------
clear
clear matrix
* NOTE(review): legacy settings; recent Stata releases reportedly ignore them - confirm
set matsize 800
set mem 500m
cd [your drive partition]
use lahman pitching data

* back out hitter ABs from hits allowed and opponent batting average
* 9.99 is blanked out as a placeholder value. The test uses a tolerance
* because an exact equality against the literal 9.99 never matches when
* BAOpp is stored as float (the literal is evaluated in double precision).
replace BAOpp = . if abs(BAOpp - 9.99) < 0.001
gen abf = H/BAOpp
drop if abf == .
* note: revisit calculation of abf for pitchers with missing opponent
* batting average values

// sum data per pitcher
collapse (sum) W L G GS CG SHO SV H ER HR BB SO IBB WP HBP BK BFP GF R ///
    GIDP IPouts abf, by(playerID yearID teamID lgID)

// IP (innings pitched = outs recorded / 3)
gen IP = IPouts/3

* NOTE(review): every merge below is m:m, which pairs rows by position within
* each key group. If the using files are unique on their keys, m:1 would be
* the safer choice - confirm against the data before changing.
merge m:m playerID using [lahman names], update
drop _merge

// import BIP data
merge m:m playerID yearID using [1930_1979 BIP], update
drop _merge
merge m:m playerID yearID using [1980_1999 BIP], update
drop _merge
merge m:m playerID yearID using [1912_1929 BIP], update
drop _merge
* the 2000-2024 file also overwrites conflicting nonmissing master values
merge m:m playerID yearID using [2000-2024 BIP], update replace
drop _merge

* recode team IDs by season
* (yearID is written out in full rather than relying on the year abbreviation)
replace teamID = "MIL" if teamID == "ML4" & yearID > 1997
replace teamID = "ML4" if teamID == "MIL" & yearID < 1998
replace teamID = "LAA" if teamID == "ANA" & yearID > 2004

// merge 2024 BBR pitcher data
merge m:m bbrefID yearID using [bbr 2024 pitcher], update
drop _merge

* keep only rows that have a season, innings, infield-pop and strikeout data
drop if yearID == . | IP == . | ifpop == . | SO == .

// not necessary but included for alternative model specifications
gen AL = .
* ---------------------------------------------------------------------------
* League indicator AL (1 = AL, 0 = NL), then the SH/SF and knuckleballer merges.
* ---------------------------------------------------------------------------

* primary source: the league code already on the row
replace AL = 1 if lgID == "AL"
replace AL = 0 if lgID == "NL"

* Fallback: infer the league from the team, but only where AL is still
* missing. Applying the team mapping unconditionally would override the
* lgID-based value (e.g. it forces AL = 1 on HOU rows whose lgID is "NL").
local al_teams OAK LAA MIN HOU CLE TBA KCA DET SEA TEX NYA BOS CHA TOR BAL
local nl_teams CHN WAS SLN PIT NYN MIL PHI ATL COL LAN SDN ARI SFN MIA CIN
foreach t of local al_teams {
    replace AL = 1 if teamID == "`t'" & missing(AL)
}
foreach t of local nl_teams {
    replace AL = 0 if teamID == "`t'" & missing(AL)
}

* fill in blank league codes from the indicator
replace lgID = "AL" if AL == 1 & lgID == ""
replace lgID = "NL" if AL == 0 & lgID == ""

// merge bbref SH SF
* NOTE(review): m:m merge kept as written; m:1 is safer if the using file is
* unique on bbrefID yearID - confirm
merge m:m bbrefID yearID using [bbr sf/sh data], update
drop _merge
drop if yearID == .

* for 2024, set abf to hits plus three times innings pitched
replace abf = (H + (IP*3)) if yearID == 2024

* add knuckleballer identifier
* both flags start at 0; the merge uses update replace, so nonmissing values
* from the using file overwrite these defaults on matched rows
gen kb = 0
gen kb_plus = 0
merge m:m playerID using "knuckleballers", update replace
drop _merge

* drop rows left without batted-ball, season or strikeout data
drop if ifpop == . | yearID == . | SO == .
* ---------------------------------------------------------------------------
* Rate statistics, season-specific fipr, team fielding merge, and the period
* variable used to attach the bip_rba parameters.
* ---------------------------------------------------------------------------

* determine performance values as rates
gen gb_pct_rs2 = GB/(GB+FB+LD+POP)

* per nine innings
gen k9   = 9*SO/IP
gen bb9  = 9*BB/IP
gen hr9  = 9*HR/IP
gen rapg = 9*R/IP

* per inning pitched
gen hr_pip  = HR/IP
gen k_pip   = SO/IP
gen bb_pip  = BB/IP
gen hbp_pip = HBP/IP

* opponents' average on balls in play
gen obabip = (H-HR)/(abf-(SO+HR)+SF)

* batted-ball rates, first set of BIP variables
gen gb_pct    = gb/(gb+airb)
gen ifgb_pip  = ifgb/IP
gen ifpop_pip = ifpop/IP
gen airb_pip  = airb/IP
gen ofair_pip = ofair/IP
gen bobw      = ifpop_pip - ofair_pip

* batted-ball rates, second set of BIP variables (suffix 2)
gen gb_pct2    = gb2/(gb2+airb2)
gen ifgb_pip2  = ifgb2/IP
gen ifpop_pip2 = ifpop2/IP
gen airb_pip2  = airb2/IP
gen ofair_pip2 = ofair2/IP
gen bobw2      = ifpop_pip2 - ofair_pip2

gen ifair_pip  = ifair/IP
* NOTE(review): every other *2 rate uses the *2 input; this one reuses ifair.
* Possibly meant ifair2/IP - confirm the variable exists before changing.
gen ifair_pip2 = ifair/IP

// season specific fipr
* one weighted regression per season; fipr holds that season's fitted value
gen fipr = .
forvalues y = 1912/2024 {
    di "year `y'"
    regress rapg k_pip bb_pip hr_pip hbp_pip [iweight=IP] if yearID == `y'
    predict yhat
    replace fipr = yhat if yearID == `y'
    drop yhat
}

// merge field composite
* lower-case copy of the team key to match the using file's key name
gen teamid = teamID
merge m:m teamid yearID using "C:\Users\dmk38\Documents\x5\team_field_composite.dta"
drop _merge

// calculate bib_rba
* Assign each season to a period labelled by its first year. Seasons before
* 1923 go to 1912; each later start year then claims every season from that
* year on, so the bins are contiguous and cannot overlap. (The 1998 bin
* previously began at 1996, which pulled 1996-1997 out of the 1990 bin.)
gen period = 1912 if yearID < 1923
foreach start of numlist 1923 1928 1935 1942 1955 1967 1975 1980 1990 1998 2002 2012 2017 {
    replace period = `start' if yearID >= `start'
}
drop if period == .

* get period parameters
merge m:m period using [bib_rba parameter data]

* placeholders, filled in period by period afterwards
gen bip_rba = .
gen predicted_avg_rapg = .
gen predicted_rapg = .
* ---------------------------------------------------------------------------
* Fill in predicted runs allowed, bip_rba and season runs suppressed, patch
* missing names, then export the season and career spreadsheets.
* ---------------------------------------------------------------------------

* terms shared by both linear predictors (expanded textually, so the
* evaluation order is unchanged)
local base cons + b_fipr*fipr + b_field*field_composite
local periods 1912 1923 1928 1935 1942 1955 1967 1975 1980 1990 1998 2002 2012 2017

foreach p of local periods {
    * prediction using the pitcher's own infield-pop and outfield-air rates
    replace predicted_rapg = `base' + b_ifpop*ifpop_pip + b_ofair*ofair_pip if period==`p'
    * same prediction with wvar_ifpop / wvar_ofair in place of those rates
    replace predicted_avg_rapg = `base' + b_ifpop*wvar_ifpop + b_ofair*wvar_ofair if period==`p'
    * difference between the two predictions
    replace bip_rba = predicted_avg_rapg - predicted_rapg if period==`p'
}

* calculate season bip runs suppressed: per-nine rate scaled by innings
generate season_bip_rs = bip_rba*IP/9

// fix 2024 names w/o Lahman id code
* break Player into words; the first word fills a missing first name
split Player, parse(" ") generate(name_)
replace nameFirst = name_1 if missing(nameFirst)

* everything after the first word fills a missing last name
generate temp_last = subinstr(Player, name_1, "", 1)
replace temp_last = trim(temp_last) // Remove leading/trailing spaces
replace nameLast = temp_last if missing(nameLast)

// save bip_rba season data
export excel nameFirst nameLast year lgID IP bip_rba season_bip_rs ///
    using [season .xls], firstrow(variables) replace

// save career bip runs suppressed
* NOTE(review): careers are grouped by first and last name only, so distinct
* players who share a name are summed together - confirm this is acceptable
preserve
collapse (sum) season_bip_rs, by(nameFirst nameLast )
rename season_bip_rs career_ba
export excel nameFirst nameLast career_ba ///
    using "bip_rbab_hp2b_career.xls", firstrow(variables) replace
restore