* ---------------------------------------------------------------------------
* Pitcher data build.
*   1. Load Lahman pitching data and back out at-bats faced.
*   2. Collapse to one row per pitcher / year / team and merge the side files.
*   3. Compute per-inning rates and a regression-based FIPr.
*   4. Build IP-weighted, within-year z-scores for each measure.
*   5. Trim IQR outliers (and, for some measures, re-standardize).
*   6. Merge team fielding and create empty season -1 / +1 placeholders.
* ---------------------------------------------------------------------------
clear
clear matrix
set matsize 800
set mem 500m
cd "C:\Users\dmk38\Documents\x5\"

* `zscore' is used below and creates z_<var>. It is not an official Stata
* command (presumably community-contributed), so stop early if it is absent
* instead of failing half-way through the build.
capture which zscore
if _rc {
    display as error "command zscore not found; install it before running this do-file"
    exit 199
}

* Dataset names that contain spaces must be quoted.
use "lahman pitching data"

** back out hitter ABs
* 9.99 in BAOpp is recoded to missing. float() on both sides makes the
* comparison succeed whether BAOpp is stored as float or as double.
replace BAOpp = . if float(BAOpp) == float(9.99)
gen abf = H/BAOpp
drop if abf == .

// sum data per pitcher
collapse (sum) W L G GS CG SHO SV H ER HR BB SO IBB WP HBP BK BFP GF R GIDP IPouts abf, ///
    by(playerID yearID teamID)

// IP
gen IP = IPouts/3

* NOTE(review): every merge below is m:m, which pairs rows by position within
* key groups. It only behaves like m:1 if the using file is unique on the key;
* confirm that for each using dataset.

// add Lahman names key
merge m:m playerID using "lahman names", update
drop _merge

// add bbref data on SH/SF for opponent BABIP--not really necessary here
merge m:m bbrefID yearID using "bb ref sh sf", update
drop _merge
drop if yearID == .

// merge retrosheet data for post 1999-pitchers
* NOTE(review): "20024" in the file name looks like a typo for 2024; the name
* is left untouched because it must match the file on disk.
merge m:m playerID yearID using rs_bip_2000_20024, update
drop _merge

// merge FG BIS data on BIPs
merge m:m bbrefID yearID using fg_bis_bip
drop _merge
* NOTE(review): `year' is either its own variable or an abbreviation of
* yearID; confirm which one is intended.
drop if year < 2002

// merge BBref data for 2024
merge m:m bbrefID yearID using bbef_2024_patch, update
drop _merge
drop if yearID == .
drop if IP < 100
drop if IP == .

*** determine performance values as rates based on Project Playsheet companion
gen gb_pct_rs2 = GB/(GB+FB+LD+POP)
gen k9  = 9*SO/IP
gen bb9 = 9*BB/IP
gen hr9 = 9*HR/IP
gen hr_pip  = HR/IP
gen k_pip   = SO/IP
* Walks per inning. The original divided SO by IP here, which duplicated k_pip
* and left the FIPr regression below without a walk term.
gen bb_pip  = BB/IP
gen hbp_pip = HBP/IP
gen rapg    = 9*R/IP
gen obabip  = (H-HR)/(abf-(SO+HR)+SF)

gen gb_pct    = gb/(gb+airb)
gen ifgb_pip  = ifgb/IP
gen ifpop_pip = ifpop/IP
gen airb_pip  = airb/IP
gen ofair_pip = ofair/IP
gen bobw      = ifpop_pip-ofair_pip

gen gb_pct2    = gb2/(gb2+airb2)
gen ifgb_pip2  = ifgb2/IP
gen ifpop_pip2 = ifpop2/IP
gen airb_pip2  = airb2/IP
gen ofair_pip2 = ofair2/IP
gen bobw2      = ifpop_pip2-ofair_pip2

gen ifair_pip  = ifair/IP
* NOTE(review): every other *_pip2 rate uses the "2" source variable; this one
* reuses ifair, so it equals ifair_pip. Probably meant ifair2 -- confirm that
* variable exists before changing.
gen ifair_pip2 = ifair/IP

gen fg_air = fg_fb+fg_ld   // creates pct of BIP either line drive or flyball

// generate FIPr for each pitcher
regress rapg k_pip bb_pip hr_pip hbp_pip [aweight=IP]
predict fipr

***********************************
* Population-Weighted Variance Code
***********************************
* For each measure v, within year and weighting by IP:
*   wvar_v   = weighted mean of v (the name is historical; it is a mean)
*   popvar_v = weighted population variance of v
*   wsd_v    = sqrt(popvar_v)
*   z_v      = (v - wvar_v) / wsd_v
* The weight total counts IP only for rows where v is non-missing, so a
* measure that is missing for part of a year is not averaged against innings
* it was never observed for.

* Total IP per year (kept because later code may expect the variable).
egen total_IP = total(IP), by(year)

local zvars fipr gb_pct gb_pct_rs2 fg_air fg_ld ifair_pip rapg fg_gbfb fg_gb ///
    fg_fb ifpop_pip fg_iffb hr_pip ofair_pip ifgb_pip ofair_pip2 ///
    ifair_pip2 bobw airb_pip2 ifgb_pip2 ifpop_pip2 obabip

foreach v of local zvars {
    tempvar w wsum wx wxsum dev2 dev2sum

    * weight: IP where the measure is observed, missing otherwise
    generate double `w' = cond(missing(`v'), ., IP)
    egen double `wsum' = total(`w'), by(year)

    * weighted mean
    generate double `wx' = `v' * IP
    egen double `wxsum' = total(`wx'), by(year)
    generate double wvar_`v' = `wxsum' / `wsum'

    * weighted sum of squared deviations -> population variance and sd
    generate double `dev2' = (`v' - wvar_`v')^2 * IP
    egen double `dev2sum' = total(`dev2'), by(year)
    generate double popvar_`v' = `dev2sum' / `wsum'
    generate double wsd_`v' = sqrt(popvar_`v')

    * z-score
    gen z_`v' = (`v' - wvar_`v') / wsd_`v'

    drop `w' `wsum' `wx' `wxsum' `dev2' `dev2sum'
}

// drop outliers
* iqr_trim varname [, restd]
*   Sets values outside [p25 - 1.5*IQR, p75 + 1.5*IQR] to missing, with the
*   quartiles taken over the whole dataset. With restd, the trimmed variable
*   is then re-standardized with zscore, which leaves z_<varname> behind.
capture program drop iqr_trim
program define iqr_trim
    syntax varname [, RESTD]
    tempname lo hi
    summarize `varlist', detail
    scalar `lo' = r(p25) - 1.5 * (r(p75) - r(p25))
    scalar `hi' = r(p75) + 1.5 * (r(p75) - r(p25))
    replace `varlist' = . if `varlist' < `lo' | `varlist' > `hi'
    if "`restd'" != "" {
        zscore `varlist'
        replace `varlist' = z_`varlist'
    }
end

* Trim only. z_bobw appears twice because the original trimmed it twice, the
* second pass using quartiles of the already-trimmed values.
* NOTE(review): the repeated passes look like copy-paste leftovers.
foreach v in fg_air ifair_pip ofair_pip bobw ifair_pip2 ofair_pip2 bobw airb_pip2 {
    iqr_trim z_`v'
}

* Trim, then re-standardize.
foreach v in ifpop_pip2 ifgb_pip2 {
    iqr_trim z_`v', restd
}

* The original named z_airb_pip here. No such variable is created above, so
* the name could only resolve by abbreviation to z_airb_pip2, giving it a
* second trim. Spelled out so it does not depend on abbreviation.
* NOTE(review): if a z-score of airb_pip was intended, add airb_pip to zvars.
iqr_trim z_airb_pip2

iqr_trim z_obabip, restd

* z_ifpop_pip: the trimming line was commented out in the original, so it is
* only summarized and re-standardized.
summarize z_ifpop_pip, detail
// replace z_ifpop_pip = . if z_ifpop_pip < lower_bound | z_ifpop_pip > upper_bound
zscore z_ifpop_pip
replace z_ifpop_pip = z_z_ifpop_pip

iqr_trim z_ifgb_pip, restd

* z_gb_pct_rs2 was only summarized in the original; nothing is trimmed.
summarize z_gb_pct_rs2, detail

* Second trimming pass on z_ifair_pip, as in the original.
iqr_trim z_ifair_pip

foreach v in fipr fg_fb fg_gb fg_iffb fg_gbfb fg_ld rapg {
    iqr_trim z_`v', restd
}

// merge team fielding--TZR for 2002, DER for 2002-2024
gen teamid = teamID
merge m:m teamid yearID using "fielding_composite data"
drop _merge

// consistency
* Step 1: Sort the data by playerID and yearID
sort playerID yearID

* Step 2: Generate season -1 & season +1 variables (filled in Step 3)
foreach v in fg_air ifair_pip fg_ld fg_gb fipr fg_gbfb fg_fb fg_iffb {
    gen `v'_minus1 = .
    gen `v'_plus1 = .
}
* Step 3: Fill the season -1 and season +1 variables.
* Within each pitcher (rows ordered by yearID, ties broken by teamID so the
* order is reproducible), copy the value from the previous / next row.
* Out-of-range subscripts return missing, so the first row keeps a missing
* _minus1 and the last row a missing _plus1.
* The original guarded the _plus1 fills with playerID == playerID[_n-1], which
* wrongly blanked the lead value on every pitcher's first row.
* NOTE(review): "previous/next row" is not checked for being the adjacent
* calendar year, and a pitcher with two teams in one year has two rows for
* that year. Add a yearID check if strict season t-1 / t+1 values are wanted.
foreach v in ifair_pip fipr fg_gbfb fg_ld fg_gb fg_fb fg_iffb fg_air {
    bysort playerID (yearID teamID): replace `v'_minus1 = `v'[_n-1]
    bysort playerID (yearID teamID): replace `v'_plus1 = `v'[_n+1]
}

// alphas for BIS data
alpha fg_air fg_air_minus1 fg_air_plus1
alpha fg_gbfb fg_gbfb_minus1 fg_gbfb_plus1

// regressions for non-inf grounders balls in play, weighted for IP
regress z_rapg z_fg_gbfb [aweight=IP]
regress z_rapg z_fg_air [aweight=IP]
regress z_rapg z_fipr z_field_composite [aweight=IP]
regress z_rapg z_fipr z_field_composite z_fg_air [aweight=IP]    // wrong sign for BIS line drive & flyball
regress z_rapg z_fipr z_field_composite z_fg_gbfb [aweight=IP]   // wrong sign for BIS ground ball rate!!!!
regress z_rapg z_ofair_pip [aweight=IP]
regress z_rapg z_fipr z_field_composite z_ofair_pip [aweight=IP] // still correct sign for retrosheet BIP in air to OF

// plotting the zero order & residuals

// Set seed for reproducibility
set seed 1238885

// Generate random uniform variable for sampling
generate random = runiform()

// First get residuals from fipr model using full sample
regress rapg fipr field_composite [pweight=IP]
predict resid_fipr_fc, residuals

// Absolute residuals from the full-sample model (not used by the plots below)
generate abs_resid = abs(resid_fipr_fc)

// Plots use a random 20% subsample to avoid overplotting.
// Graph names plot0/plot1/plot2 are reused, so a later graph replaces an
// earlier one held in memory under the same name.

// Plot 0
twoway (scatter rapg fg_air if random <= 0.2, mcolor(blue%20)) ///
       (lfit rapg fg_air if random <= 0.2, lcolor(black)), ///
       xlabel(.3(.1).8, nogrid) xscale(range(.3 .8)) ylabel(, nogrid) ///
       legend(off) ///
       title("Runs per game on LD+FB") ///
       ytitle("runs per game") xtitle("LD+FB") name(plot0, replace)

// Plot 1
twoway (scatter rapg fipr if random <= 0.2, mcolor(blue%20)) ///
       (lfit rapg fipr if random <= 0.2, lcolor(black)), ///
       xlabel(, nogrid) ylabel(, nogrid) ///
       legend(off) ///
       title("Runs per game on FIPr") ///
       ytitle("runs per game") xtitle("FIPr") name(plot1, replace)

// Plot 2
twoway (scatter resid_fipr_fc fg_air if random <= 0.2, mcolor(blue%20)) ///
       (lfit resid_fipr_fc fg_air if random <= 0.2, lcolor(black)), ///
       xlabel(.3(.1).8, nogrid) xscale(range(.3 .8)) ylabel(, nogrid) ///
       legend(off) ///
       title("Residuals on LD+FB") ///
       ytitle("Residuals") xtitle("LD+FB") name(plot2, replace)

// Plot 1 (redrawn with more opaque markers)
twoway (scatter rapg fipr if random <= 0.2, mcolor(blue%40)) ///
       (lfit rapg fipr if random <= 0.2, lcolor(black)), ///
       xlabel(, nogrid) ylabel(, nogrid) ///
       legend(off) ///
       title("Runs per game on FIPr") ///
       ytitle("runs per game") xtitle("FIPr") name(plot1, replace)

// Plot 2
twoway (scatter resid_fipr_fc fg_gbfb if random <= 0.2, mcolor(blue%40)) ///
       (lfit resid_fipr_fc fg_gbfb if random <= 0.2, lcolor(black)), ///
       xlabel(, nogrid) ylabel(, nogrid) ///
       title("Residuals vs GB/FB") ///
       ytitle("Residuals") xtitle("GB/FB") name(plot2, replace) legend(off)

// plot 0
twoway (scatter rapg ofair_pip if random <= 0.2, mcolor(blue%40)) ///
       (lfit rapg ofair_pip if random <= 0.2, lcolor(black)), ///
       xlabel(0(.5)2, nogrid) xscale(range(0 1)) ylabel(, nogrid) ///
       legend(off) ///
       title("Zero order relationship") ///
       ytitle("rapg") xtitle("Retrosheet outfield BIP ") name(plot0, replace)

// Plot 2
twoway (scatter resid_fipr_fc ofair_pip if random <= 0.2, mcolor(blue%40)) ///
       (lfit resid_fipr_fc ofair_pip if random <= 0.2, lcolor(black)), ///
       xlabel(, nogrid) ylabel(, nogrid) ///
       title("Residuals vs Retrosheet outfield BIP, same 20% sample") ///
       ytitle("Residuals") xtitle("Retrosheet outfield BIP ") name(plot2, replace) legend(off)