* ======================================================================
* Section 1: build the pitcher-season analysis file
*   - load the pitching table and back out at-bats faced (abf)
*   - collapse to one row per playerID / yearID / teamID
*   - merge names, batted-ball (BIP) files, 2024 patches, SH/SF data
*     and knuckleballer flags
*   - keep rows with IP >= 100 and non-missing ifpop and SO
* Section headers use // (not ///): in a do-file /// is the line-join
* indicator, not a plain comment.
* ======================================================================
clear
clear matrix
* NOTE(review): -set mem- / -set matsize- are legacy settings; recent Stata
* releases may ignore them -- confirm against the version in use.
set matsize 800
set mem 500m

* Placeholders: substitute the real working directory and dataset names.
cd your partition
use [lanham pitching data]

** back out hitter ABs
* replace BAOpp = . if BAOpp == 9.99
gen abf = H/BAOpp
drop if abf == .

// sum data per pitcher
collapse (sum) W L G GS CG SHO SV H ER HR BB SO IBB WP HBP BK BFP GF R GIDP IPouts abf, by(playerID yearID teamID)

// IP
gen IP = IPouts/3

* NOTE(review): every merge below is m:m, which pairs rows by sort order
* within the key and is rarely what is intended; confirm key uniqueness and
* consider 1:1 / m:1 instead.

// merge Lahman names to integrate 2024 data
merge m:m playerID using [lahman names], update
drop _merge

// BIP files (file names contain spaces, so they must be quoted)
merge m:m playerID yearID using "1930-79 BIP data", update
drop _merge
merge m:m playerID yearID using "1980-99 BIP data", update
drop _merge
merge m:m playerID yearID using "1912-29 BIP data", update
drop _merge
merge m:m playerID yearID using "2000-2024 BIP data", update replace
drop _merge

// correct a possible glitch in Lahman team-name key for old retrosheet;
// I think this was fixed upstream but let's be sure
* NOTE(review): -year- relies on variable-name abbreviation (presumably
* resolving to yearID); verify no other variable starts with "year".
replace teamID = "MIL" if teamID == "ML4" & year > 1997
replace teamID = "ML4" if teamID == "MIL" & year < 1998
replace teamID = "LAA" if teamID == "ANA" & year > 2004

// merge bbref 2024 pitcher data
merge m:m bbrefID yearID using bb_ref_2024_pitcher_patch, update
drop _merge
drop if yearID == .

// merge bbref SH SF for obabip calculations
merge m:m bbrefID yearID using "bb_ref_pitcher_sh_sf.dta", update
drop _merge
drop if yearID == .

* 2024 rows: abf is rebuilt as hits plus outs recorded (IP*3).
replace abf = (H+(IP*3)) if year == 2024

*** add knuckleballer identifier
** kb is ever relied on knuckler
** kb_plus is principally relied for at least portion of career
* Both flags start at 0; -update replace- lets non-missing values from the
* knuckleballers file overwrite them.
gen kb = 0
gen kb_plus = 0
merge m:m playerID using "knuckleballers", update replace
drop _merge

// narrow to ip 100+
drop if IP < 100
drop if ifpop == .
drop if SO == .
* ======================================================================
* Section 2: per-inning rates, FIPR, team fielding, period models
*   - convert counting stats to per-IP rates
*   - fit rapg on K/BB/HR/HBP rates and store the prediction as fipr
*   - merge the team fielding composite
*   - build previous/next-season copies of bip_comb per pitcher
*   - for each run-environment period, regress rapg on fipr, fielding and
*     the air-ball / infield-pop rates, keeping the two batted-ball
*     coefficients and the IP-weighted means in per-period variables
* Section headers use // (not ///): /// is Stata's line-join indicator.
* ======================================================================

// abf for 2024
replace abf = (H+(IP*3)) if year == 2024

*** determine performance values as rates
gen k_pip   = SO/IP
gen bb_pip  = BB/IP
gen hr_pip  = HR/IP
gen hbp_pip = HBP/IP
drop if IP < 100
gen rapg = 9*R/IP
gen obabip = (H-HR)/(abf-(SO+HR)+SF)
gen gb_pct = gb/(gb+airb)
gen ifgb_pip  = ifgb/IP
gen ifpop_pip = ifpop/IP
gen airb_pip  = airb/IP
gen ofair_pip = ofair/IP

// fipr scores
regress rapg k_pip bb_pip hr_pip hbp_pip [pweight=IP]
predict fipr

// merge team fielding
* The using file keys on lower-case teamid, so mirror teamID into it.
gen teamid = teamID
merge m:m teamid yearID using "C:\Users\dmk38\Documents\x5\team_field_composite.dta"
drop _merge

// variables for bip_rba
* Create the lag/lead holders only if an upstream file has not already
* supplied them; -replace- alone fails on a variable that does not exist.
foreach v in bip_comb_minus1 bip_comb_plus1 {
    capture confirm variable `v', exact
    if _rc {
        gen `v' = .
    }
}
* Under -bysort playerID (yearID)- subscripts are relative to the pitcher's
* own rows, so [_n-1] / [_n+1] are the adjacent rows for that pitcher.
bysort playerID (yearID): replace bip_comb_minus1 = bip_comb[_n-1] if playerID == playerID[_n-1]
* Fix: the lead must be guarded by the NEXT row ([_n+1]); guarding it with
* [_n-1] left every pitcher's first season without a next-season value.
bysort playerID (yearID): replace bip_comb_plus1 = bip_comb[_n+1] if playerID == playerID[_n+1]

* Generate 7 run-environment-period models
gen period = .
replace period = 1912 if yearID < 1920
replace period = 1920 if yearID >= 1920 & yearID < 1942
replace period = 1942 if yearID >= 1942 & yearID < 1963
replace period = 1963 if yearID >= 1963 & yearID < 1977
replace period = 1977 if yearID >= 1977 & yearID < 1993
replace period = 1993 if yearID >= 1993 & yearID < 2010
replace period = 2010 if yearID >= 2010

foreach p in 1912 1920 1942 1963 1977 1993 2010 {
    di "model period" `p'
    reg rapg fipr field_composite ofair_pip ifpop_pip [aweight=IP] if period==`p'
    * Store this period's batted-ball coefficients on its own rows only.
    gen beta_ofair_`p' = _b[ofair_pip] if period==`p'
    gen beta_ifpop_`p' = _b[ifpop_pip] if period==`p'
    * -mean- overwrites e(b): column 1 is ofair_pip, column 2 is ifpop_pip.
    mean ofair_pip ifpop_pip [aweight=IP] if period==`p'
    gen mean_ofair_`p' = e(b)[1,1] if period==`p'
    gen mean_ifpop_`p' = e(b)[1,2] if period==`p'
}

* Empty pooled holders; the per-period values are copied into them later.
gen beta_ofair = .
gen beta_ifpop = .
gen mean_ofair = .
gen mean_ifpop = .
* ======================================================================
* Section 3: bip_rba scores, reliability, percentiles, exports
*   - copy per-period coefficients/means into the pooled variables
*   - compute bip_rba and its per-period coefficient on obabip
*   - restrict to qualifying innings, then compute Cronbach's alpha over
*     previous / current / next season bip_rba
*   - rank pitchers, export scores, and model the chance that a
*     knuckleballer lands in the top 20% / top 10%
* Section headers use // (not ///): /// is Stata's line-join indicator.
* ======================================================================

* Collapse the per-period columns into one column each, then drop them.
foreach p in 1912 1920 1942 1963 1977 1993 2010 {
    replace beta_ofair = beta_ofair_`p' if period==`p'
    replace beta_ifpop = beta_ifpop_`p' if period==`p'
    replace mean_ofair = mean_ofair_`p' if period==`p'
    replace mean_ifpop = mean_ifpop_`p' if period==`p'
    drop beta_ofair_`p' beta_ifpop_`p' mean_ofair_`p' mean_ifpop_`p'
}

// bip_rba
* Coefficient-weighted deviation of the two rates from their period means,
* sign-flipped for both terms.
gen bip_rba = -1*(beta_ofair*(ofair_pip-mean_ofair)) + beta_ifpop*(mean_ifpop-ifpop_pip)

// impact bip_rba on obabip
* NOTE(review): this loop skips period 1912, so obabip_suppress stays
* missing for pre-1920 rows -- confirm that is intentional.
gen obabip_suppress = .
foreach p in 1920 1942 1963 1977 1993 2010 {
    di as text "{yellow}period `p'{reset}"
    regress obabip field_composite bip_rba [aweight =IP] if period==`p'
    replace obabip_suppress = _b[bip_rba] if period==`p'
}

// filter for IP; don't execute if want all at >= 100 IP
drop if IP < 154 & year < 1961
drop if IP < 162 & year > 1960

*** BIP_RBA reliability
* Step 1: Sort the data by playerid and yearID
sort playerID yearID

* Step 2: Generate season -1 & season +1 variables
gen bip_rba_minus1 = .
gen bip_rba_plus1 = .
bysort playerID (yearID): replace bip_rba_minus1 = bip_rba[_n-1] if playerID == playerID[_n-1]
* Fix: guard the lead with the NEXT row ([_n+1]); guarding it with [_n-1]
* left every pitcher's first retained season without a next-season value.
bysort playerID (yearID): replace bip_rba_plus1 = bip_rba[_n+1] if playerID == playerID[_n+1]

* Plain varlist (no surrounding parentheses).
alpha bip_rba_minus1 bip_rba_plus1 bip_rba

** export player scores
drop if nameLast == ""
egen r_bip_rba = rank(bip_rba)
* Scale by the number of ranked rows rather than _N, so rows with a missing
* bip_rba (which receive no rank) do not compress the percentile range.
quietly count if !missing(r_bip_rba)
gen sample_percentile = 100 * (r_bip_rba - 1) / (r(N) - 1)
export excel nameFirst nameLast yearID IP bip_rba kb_plus sample_percentile using "bip_rba_rps_periods2_plusknuckles.xls", firstrow(variables) replace

*** likelihood of knuckleball ps being in 80th and 90th percentiles
* Stata treats missing as larger than any number, so "percentile >= 80" is
* true for a missing percentile. Build the indicators only where the
* percentile exists; other rows stay missing and drop out of the logits.
gen p80 = sample_percentile >= 80 if !missing(sample_percentile)
logit p80 kb_plus [pweight =IP]
predict phat
su phat if kb_plus==1, d

gen p90 = sample_percentile >= 90 if !missing(sample_percentile)
su phat if kb_plus==1
logit p90 kb_plus [pweight=IP]
predict phat2
sum phat2 if kb_plus==1

export excel nameFirst nameLast yearID bip_rba kb_plus sample_percentile obabip_suppress using "bip_rba.xlsx", firstrow(variables) replace