*****************************************************************************************************************************************************************; 
*	Last Update: June 2015																	*;
*																				*;
* 	This SAS code reproduces our main results							.							*;
*	Note, this is NOT the code used in the actual paper. My 'old' computer died unexpectedly and I had to re-write the code.				*;
*  	I made some minor changes, which I consider improvements. So the numbers generated in this code will not exactly match the ones used in the paper.	*;
*****************************************************************************************************************************************************************; 


libname crsp 	'C:\...\data\crsp';										*this folder contains three SAS datasets (dsf - daily stock returns file, dse - daily stock events file from CRSP, and dsix - daily market returns file from CRSP);
libname split	'C:\...\split';											*this folder contains no SAS datasets in the beginning;
	
**********************************************************
 Part 1 - Create file with splits
**********************************************************;
data dse; set crsp.dse; by permno date;						
	if DISTCD = 5523 then split=1; else split=0; 								*identify splits as in Baker, Greenwood, Wurgler (2009, "Catering through nominal share prices");
	keep permno date DISTCD split DCLRDT RCRDDT PAYDT; 
run;
*note to myself, there's sometimes multiple entries by date in dse, i.e. two rows or even more for the same date;
*scrolling through the data, it appears that generally the first row contains some distribution code; 
*the second row (and higher) just contains .;

data CFACPR; set crsp.dsf; by permno date;
	prc = abs(prc);
	keep permno date CFACPR prc;
data CFACPR; set CFACPR; by permno date;
	if first.permno then n=0;
	n+1;

	lag1_prc	=	lag(prc);
	if first.permno then lag1_prc = .;

	abc		=   	CFACPR;
	lag1_CFACPR	=	lag(CFACPR);
	if first.permno then lag1_CFACPR = .;
	if lag1_CFACPR > 0 then delta_CFACPR = (CFACPR-lag1_CFACPR)/lag1_CFACPR; 
		else delta_CFACPR = .;
	if CFACPR > 0 then delta_CFACPR = delta_CFACPR; 
		else delta_CFACPR = .;

	keep permno date prc lag1_prc abc lag1_CFACPR delta_CFACPR;
data dse; merge dse CFACPR; by permno date;
	rename abc = CFACPR;
run;

data split.split; set dse;
	label date 		= 'Date (time t)';
	label prc  		= 'Price at time t';
	label lag1_prc  	= 'Price at time t-1 (One Trading Day Ago)';
	label CFACPR		= 'Cumulative Factor to Adjust Prices at time t';
	label lag1_CFACPR	= 'Cumulative Factor to Adjust Prices at time t-1';
	label delta_CFACPR 	= 'Split Ratio as Inferred by Change in Cumulative Factor to Adjust Prices';
	if split = 1;
	drop split;
run;

data split.index_daily; set crsp.dsix; by date; 								*assign index to each date;
	n+1;
	keep date n;
run;

data split.split_firms; set split.split;
	if -0.501 <= delta_CFACPR <= -0.499;									*focus on 2-for-1 splits, account for rounding issues;

*	if PAYDT ne .;
	label date = 'Date Split becomes reflected in Price in CRSP (time t)';

	b=round(lag1_prc);
	b1=(b/2)-((b-b/2)/2); 				
	b2=(b/2)+((b-b/2)/2); 											*b1 to b2 makes the low price range, b2 to b3 makes the high price range;
	b3=(b)  +((b-b/2)/2);

	keep permno date DCLRDT PAYDT b1 b2 b3 prc lag1_prc;
proc sort data=split.split_firms; by date;
run;

data split_firms; merge split.split_firms (in=m1) split.index_daily; by date;
	if m1;
	rename n = event_index;
run;

data split.split_firms; set split_firms; by event_index;
	n+1;

	beg_index	=	event_index-260;
	end_index	=	event_index+260;

	identifier=n;
	drop n;
run;


*****************************************************************
 Part 2 - Create high price-range and low price-range portfolios
*****************************************************************;

proc sort data=crsp.dsf out=dret; by date;
data dret; merge dret (in=m1) split.index_daily (in=m2); by date;
	if m1 and m2;
proc sort data=dret; by permno date;
data dret; set dret; by permno date;
	prc = abs(prc);
    me = abs(prc)*shrout/1000;
    keep permno date ret prc me n;
data split.dret; set dret; by permno date;
    lag_me = lag(me);
    if first.permno then lag_me=.;
    if ret = .B then ret=.;
    if ret = .C then ret=.;
    keep permno date ret prc lag_me n;
proc sort data=split.dret; by date;
run;

*Splits 1-2000: 	Time index 1-15014		-->base1;
*Splits 2001-4000: 	Time index 14495-17133		-->base2;
*Splits 4001-6000: 	Time index 16615-19559		-->base3;
*Splits 6001-8020: 	Time index 19039-23534		-->base4 (There are 8071 splits. However, the time index for 20141231 is 23535, so split 8020 is the last one with enough data.);

data ret1; set split.dret;
	if 1 <= n <= 15014;								*running the code on various subsamples is a lot more efficient;
	apple=1;
data ret2; set split.dret;
	if 1 <= n <= 15014;								*running the code on various subsamples is a lot more efficient;
	rename permno = permno_split;
	keep permno date n ret;
proc sort data=ret2; by permno_split date;
run;


*delete base dataset if appropriate;
option nonotes;
%macro index_creation;
%do i = 1 %to 10; 																
data tmp; set split.split_firms;
	if identifier = &i;
	apple=1;
	rename permno 	= permno_split;
	rename date   	= EFFDT;
	rename lag1_prc = pre_split_prc; 
	keep permno date DCLRDT PAYDT lag1_prc b1 b2 b3 event_index beg_index end_index identifier apple;
data tmp1; merge tmp ret1; by apple;
	if n < beg_index then delete;
	if n > end_index then delete;
	if b1 <  prc <= b2 then low=1;  else low=0;
	if b2 <  prc <= b3 then high=1; else high=0;
	if lag_me ne .;
	if permno = permno_split then delete;						*kick out the split firm from the respective high price-range and low price-range portfolios to avoid mechanical change in comovement;
	keep permno_split DCLRDT EFFDT PAYDT pre_split_prc event_index identifier permno low high n date ret lag_me;
run;

*low price-range;
data tmp2; set tmp1;
	if low=1;
run;
proc means data=tmp2 noprint;
	var ret /weight=lag_me;
	by permno_split DCLRDT EFFDT PAYDT pre_split_prc event_index identifier n date;
	output out=tmp2 mean=low_vw;
data tmp2; set tmp2;
	drop _TYPE_ _FREQ_;
	label low_ew = 'Low Price (ew)';
	label low_vw = 'Low Price (vw)';
run;

*high price-range;
data tmp3; set tmp1;
	if high=1;
run;
proc means data=tmp3 noprint;
	var ret /weight=lag_me;
	by permno_split DCLRDT EFFDT PAYDT pre_split_prc event_index identifier n date;
	output out=tmp3 mean=high_vw;
data tmp3; set tmp3;
	drop _TYPE_ _FREQ_;
	label high_ew = 'High Price (ew)';
	label high_vw = 'High Price (vw)';
run;

*split;
data tmp4; merge tmp (in=m1) ret2; by permno_split;
	if m1;
	if n < beg_index then delete;
	if n > end_index then delete;
	label pre_split_prc = 'Pre-Split Price';
	label ret = 'Return of Firm doing Split';
	keep permno_split DCLRDT EFFDT PAYDT pre_split_prc event_index identifier n date ret;
run;

data tmp4; merge tmp2 tmp3 tmp4; by permno_split DCLRDT EFFDT PAYDT pre_split_prc event_index identifier n date;
	event_window = n - event_index;
run;

proc append data=tmp4 base=split_base_1;
run;
%end;
%mend;
%index_creation;

data split.split_base_1; set split_base_1;
run;


*****************************************************************
 Part 3 - Compute shifts in price-based return comovement
*****************************************************************;

libname crsp 	'C:\...\data\crsp';
libname split	'C:\...\split';

*append;
proc datasets; delete split_base;
proc append data=split.split_base_1 base=split_base;
proc append data=split.split_base_2 base=split_base;
proc append data=split.split_base_3 base=split_base;
proc append data=split.split_base_4 base=split_base;
run;

*merge in prices and the corresponding price ranges;
data prc; set crsp.dsf;
	prc = abs(prc);
	rename permno = permno_split;
	keep permno date prc;
proc sort data=prc; by permno_split date;
proc sort data=split_base; by permno_split date;
data split_base; merge split_base (in=m1) prc; by permno_split date;
	if m1;
run;

data b; set split.split_firms;
	rename permno = permno_split;
	keep permno identifier b1 b2 b3;
proc sort data=b; by identifier permno_split;
proc sort data=split_base; by identifier permno_split;
data split_base; merge split_base (in=m1) b; by identifier permno_split;
	if m1;
run;

proc sort data=split_base; by identifier permno_split date;
data split.split_base; set split_base;
run;

*create some descriptive stats on price path and when a stock hits the high price range;
proc sort data=split.split_base out=event; by event_window;
proc means data=event noprint;
	var prc b1 b2 b3;
	by event_window;
	output out=event mean=prc b1 b2 b3;
run;


libname split	'C:\...\split';

data zzz; set split.split_base;
	if -240 <  event_window <= -20 	then event = -1;
	if 20 	<= event_window <  240 	then event = 1;
	if event in (-1,1);
proc sort data=zzz; by identifier permno_split EFFDT PAYDT event_index event;
run;

*************************************************************************************************************************************************************************************************; 
* This is new but I think it makes sense:																			*;
* When looking at the price evolution prior to a split, we observe that companies on average only enter the high price range 6 months prior to the split.					*;
* Shifts in comovement are computed by comparing the beta w.r.t. the low- and high price range one year before and one year after the split.							*;
* If some firm spends most of its time in the year prior to the split in the low price range, then we would not necessarily expect to detect a shift in price-based return comovement		*;
* even if investors categorized stocks based on price (e.g., prior to the split, a firm spends 90% of its time in the low price range - after the split, the firm spends 100% of its time	*;
* in the low price range. Can we really expect to see a meaningful change in return comovement w.r.t. low- and high price range after the split?)						*;
* Here, I start computing how much a stock comoves with the high- and low price range prior to the split only after the stock enters the high price range for the very first time.		*;
* I require there to be a minimum of 60 observations before and after the split. 														*;
*************************************************************************************************************************************************************************************************; 
data a; set zzz;
	if prc > b2 then cross=1; else cross=0;				
	if cross=1;
run;
proc sort data=a nodupkey; by identifier permno_split EFFDT PAYDT event_index;
data a; set a;
	rename event_window = first_cross_b2;
	keep identifier permno_split DCLRDT EFFDT PAYDT pre_split_prc event_index event_window;
data zzz; merge zzz a; by identifier permno_split DCLRDT EFFDT PAYDT pre_split_prc event_index;
	if event_window > first_cross_b2;
run;


proc means data=zzz noprint;
	var ret;
	by identifier permno_split DCLRDT EFFDT PAYDT pre_split_prc event_index event;
	output out=count n=obs;
data count; set count;
	drop _TYPE_ _FREQ_;
run;

proc reg data=zzz outest=parms1 noprint;
	model ret = low_vw high_vw;
	by identifier permno_split DCLRDT EFFDT PAYDT pre_split_prc event_index event;
run;
quit;
data parms1; set parms1;
	keep identifier permno_split DCLRDT EFFDT PAYDT pre_split_prc event_index event low_vw high_vw;
run;

proc reg data=zzz outest=parms2 noprint rsquare;
	model ret = low_vw;
	by identifier permno_split DCLRDT EFFDT PAYDT pre_split_prc event_index event;
run;
quit;
data parms2; set parms2;
	rename low_vw = low_vw_univariate;
	rename _RSQ_ = rsquared;
	keep identifier permno_split DCLRDT EFFDT PAYDT pre_split_prc event_index event low_vw _RSQ_;
run;

proc reg data=zzz outest=parms3 noprint rsquare;
	model ret = high_vw;
	by identifier permno_split DCLRDT EFFDT PAYDT pre_split_prc event_index event;
run;
quit;
data parms3; set parms3;
	rename high_vw = high_vw_univariate;
	rename _RSQ_ = rsquared;
	keep identifier permno_split DCLRDT EFFDT PAYDT pre_split_prc event_index event high_vw _RSQ_;
run;

data parms; merge parms1 (in=m1) parms2 parms3 count; by identifier permno_split DCLRDT EFFDT PAYDT pre_split_prc event_index event;
	if m1;
run;


data before; set parms;
	if event=-1;
	rename rsquared = before_rsquared;
	rename low_vw_univariate  = before_low_uni;
	rename high_vw_univariate = before_high_uni;
	rename low_vw  	= before_low;
	rename high_vw 	= before_high;
	rename obs 	= before_obs;
	drop event;
data after; set parms;
	if event=1;
	rename rsquared = after_rsquared;
	rename low_vw_univariate  = after_low_uni;
	rename high_vw_univariate = after_high_uni;
	rename low_vw  	= after_low;
	rename high_vw 	= after_high;
	rename obs 	= after_obs;
	drop event;
run;

data main; merge before (in=m1) after (in=m2); by identifier permno_split DCLRDT EFFDT PAYDT pre_split_prc event_index;
	if m1 and m2;
	diff_rsquared	=	after_rsquared - before_rsquared;
	diff_low_uni	=	after_low_uni - before_low_uni;
	diff_high_uni	=	after_high_uni - before_high_uni;
	diff_diff_uni	=   	diff_low_uni - diff_high_uni;
	diff_low	=	after_low  - before_low;
	diff_high	=	after_high - before_high;

	yyyymm		=	year(EFFDT)*100+month(EFFDT);

	if before_obs > 60;					*require a minimum of 60 observations before the split;
	if after_obs >  60;					*require a minimum of 60 observations after the split;

	if pre_split_prc > 10;

*	if 1926 <= year(EFFDT) <= 2004;
run;

proc surveymeans data=main mean t;
	cluster yyyymm;
	var diff_rsquared diff_low_uni diff_high_uni diff_diff_uni diff_low diff_high before_obs after_obs;
run;


*****************************************************************;
* Last time I checked (June 2015), for data from 1926 to 2014, 	*;
* these are the results I got for the bivarate regressions:	*;
* diff_low	+0.246010       (11.22)				*;
* diff_high	-0.109669       (-6.11)				*;
*****************************************************************;