From c98c69a69d7798ee7eb2e22e87b6a800da5148f5 Mon Sep 17 00:00:00 2001 From: velonisa Date: Fri, 30 Oct 2020 21:17:54 +0800 Subject: [PATCH 01/15] Update accounting_60.py --- char60/accounting_60.py | 511 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 465 insertions(+), 46 deletions(-) diff --git a/char60/accounting_60.py b/char60/accounting_60.py index 8789fb7..e205833 100644 --- a/char60/accounting_60.py +++ b/char60/accounting_60.py @@ -1,6 +1,8 @@ import pandas as pd import numpy as np +import datetime as dt import wrds +from dateutil.relativedelta import * from pandas.tseries.offsets import * import pickle as pkl from functions import * @@ -55,23 +57,24 @@ def ttm12(series, df): /*firm variables*/ /*income statement*/ f.sale, f.revt, f.cogs, f.xsga, f.dp, f.xrd, f.xad, f.ib, f.ebitda, - f.ebit, f.nopi, f.spi, f.pi, f.txp, f.ni, f.txfed, f.txfo, f.txt, f.xint, + f.ebit, f.nopi, f.spi, f.pi, f.txp, f.ni, f.txfed, f.txfo, f.txt, f.xint, f.xpp, f.xacc, /*CF statement and others*/ - f.capx, f.oancf, f.dvt, f.ob, f.gdwlia, f.gdwlip, f.gwo, f.mib, f.oiadp, f.ivao, + f.capx, f.oancf, f.dvt, f.ob, f.gdwlia, f.gdwlip, f.gwo, f.mib, f.oiadp, f.ivao, f.ivst, /*assets*/ f.rect, f.act, f.che, f.ppegt, f.invt, f.at, f.aco, f.intan, f.ao, f.ppent, f.gdwl, f.fatb, f.fatl, /*liabilities*/ f.lct, f.dlc, f.dltt, f.lt, f.dm, f.dcvt, f.cshrc, - f.dcpstk, f.pstk, f.ap, f.lco, f.lo, f.drc, f.drlt, f.txdi, + f.dcpstk, f.pstk, f.ap, f.lco, f.lo, f.drc, f.drlt, f.txdi, f.dltis, f.dltr. 
f.dlcch, /*equity and other*/ - f.ceq, f.scstkc, f.emp, f.csho, f.seq, f.txditc, f.pstkrv, f.pstkl, f.np, f.txdc, f.dpc, f.ajex, + f.ceq, f.scstkc, f.emp, f.csho, f.seq, f.txditc, f.pstkrv, f.pstkl, f.np, f.txdc, + f.dpc, f.ajex, f.tstkp, f.oibdp, f.capxv, f.dvpa, f.epspx, /*market*/ - abs(f.prcc_f) as prcc_f + abs(f.prcc_f) as prcc_f, abs(f.prcc_c) as prcc_c, f.dvc, f.prstkc, f.sstk, f.fopt, f.wcap from comp.funda as f left join comp.company as c @@ -91,6 +94,9 @@ def ttm12(series, df): # sort and clean up comp = comp.sort_values(by=['gvkey', 'datadate']).drop_duplicates() +# # prep for clean-up and using time series of variables +comp['count'] = comp.groupby(['gvkey']).cumcount() # number of years in Compustat + # clean up csho comp['csho'] = np.where(comp['csho'] == 0, np.nan, comp['csho']) @@ -126,7 +132,7 @@ def ttm12(series, df): # Create a CRSP Subsample with Monthly Stock and Event Variables # Restrictions will be applied later # Select variables from the CRSP monthly stock and event datasets -crsp = conn.raw_sql(""" +crsp_m = conn.raw_sql(""" select a.prc, a.ret, a.retx, a.shrout, a.vol, a.cfacpr, a.cfacshr, a.date, a.permno, a.permco, b.ticker, b.ncusip, b.shrcd, b.exchcd from crsp.msf as a @@ -139,35 +145,29 @@ def ttm12(series, df): """) # change variable format to int -crsp[['permco', 'permno', 'shrcd', 'exchcd']] = crsp[['permco', 'permno', 'shrcd', 'exchcd']].astype(int) +crsp_m[['permco', 'permno', 'shrcd', 'exchcd']] = crsp_m[['permco', 'permno', 'shrcd', 'exchcd']].astype(int) # Line up date to be end of month -crsp['date'] = pd.to_datetime(crsp['date']) -crsp['monthend'] = crsp['date'] + MonthEnd(0) # set all the date to the standard end date of month +crsp_m['date'] = pd.to_datetime(crsp_m['date']) +crsp_m['monthend'] = crsp_m['date'] + MonthEnd(0) # set all the date to the standard end date of month -crsp = crsp.dropna(subset=['prc']) -crsp['me'] = crsp['prc'].abs() * crsp['shrout'] # calculate market equity +# calculate market equity 
+crsp_m['me'] = crsp_m['prc'].abs() * crsp_m['shrout'] # if Market Equity is Nan then let return equals to 0 -crsp['ret'] = np.where(crsp['me'].isnull(), 0, crsp['ret']) -crsp['retx'] = np.where(crsp['me'].isnull(), 0, crsp['retx']) +crsp_m['ret'] = np.where(crsp_m['me'].isnull(), 0, crsp_m['ret']) +crsp_m['retx'] = np.where(crsp_m['me'].isnull(), 0, crsp_m['retx']) # impute me -crsp = crsp.sort_values(by=['permno', 'date']).drop_duplicates() -crsp['me'] = np.where(crsp['permno'] == crsp['permno'].shift(1), crsp['me'].fillna(method='ffill'), crsp['me']) +crsp_m = crsp_m.sort_values(by=['permno', 'date']).drop_duplicates() +crsp_m['me'] = np.where(crsp_m['permno'] == crsp_m['permno'].shift(1), crsp_m['me'].fillna(method='ffill'), crsp_m['me']) -# Aggregate Market Cap -''' -There are cases when the same firm (permco) has two or more securities (permno) at same date. -For the purpose of ME for the firm, we aggregated all ME for a given permco, date. -This aggregated ME will be assigned to the permno with the largest ME. 
-''' # sum of me across different permno belonging to same permco a given date -crsp_summe = crsp.groupby(['monthend', 'permco'])['me'].sum().reset_index() +crsp_summe = crsp_m.groupby(['monthend', 'permco'])['me'].sum().reset_index() # largest mktcap within a permco/date -crsp_maxme = crsp.groupby(['monthend', 'permco'])['me'].max().reset_index() +crsp_maxme = crsp_m.groupby(['monthend', 'permco'])['me'].max().reset_index() # join by monthend/maxme to find the permno -crsp1 = pd.merge(crsp, crsp_maxme, how='inner', on=['monthend', 'permco', 'me']) +crsp1 = pd.merge(crsp_m, crsp_maxme, how='inner', on=['monthend', 'permco', 'me']) # drop me column and replace with the sum me crsp1 = crsp1.drop(['me'], axis=1) # join with sum of me to get the correct market cap info @@ -198,14 +198,15 @@ def ttm12(series, df): ccm1 = pd.merge(comp, ccm, how='left', on=['gvkey']) # we can only get the accounting data after the firm public their report -# for annual data, we use 5 or 6 months lagged data +# for annual data, we use 6 months lagged data ccm1['yearend'] = ccm1['datadate'] + YearEnd(0) -ccm1['jdate'] = ccm1['datadate'] + MonthEnd(4) +ccm1['jdate'] = ccm1['yearend'] + MonthEnd(6) # set link date bounds ccm2 = ccm1[(ccm1['jdate'] >= ccm1['linkdt']) & (ccm1['jdate'] <= ccm1['linkenddt'])] # link comp and crsp +# data_rawa only includes annul data because comp is annual. Use inner merge crsp2 = crsp2.rename(columns={'monthend': 'jdate'}) data_rawa = pd.merge(crsp2, ccm2, how='inner', on=['permno', 'jdate']) @@ -218,14 +219,14 @@ def ttm12(series, df): Note: me is CRSP market equity, mve_f is Compustat market equity. Please choose the me below. 
''' data_rawa['me'] = data_rawa['me']/1000 # CRSP ME -# data_rawa['me'] = data_rawa['mve_f'] # Compustat ME +data_rawa['me_comp'] = data_rawa['mve_f'] # Compustat ME # there are some ME equal to zero since this company do not have price or shares data, we drop these observations data_rawa['me'] = np.where(data_rawa['me'] == 0, np.nan, data_rawa['me']) data_rawa = data_rawa.dropna(subset=['me']) # count single stock years -# data_rawa['count'] = data_rawa.groupby(['gvkey']).cumcount() +data_rawa['count'] = data_rawa.groupby(['gvkey']).cumcount() # deal with the duplicates data_rawa.loc[data_rawa.groupby(['datadate', 'permno', 'linkprim'], as_index=False).nth([0]).index, 'temp'] = 1 @@ -238,15 +239,19 @@ def ttm12(series, df): ####################################################################################################################### # Annual Variables # ####################################################################################################################### +# stockholders' equity +data_rawa['se'] = np.where(data_rawa['seq'].isnull(), data_rawa['ceq']+data_rawa['pstk'], data_rawa['seq']) +data_rawa['se'] = np.where(data_rawa['se'].isnull(), data_rawa['at']-data_rawa['lt'], data_rawa['se']) + +data_rawa['txditc'] = data_rawa['txditc'].fillna(0) + # preferrerd stock data_rawa['ps'] = np.where(data_rawa['pstkrv'].isnull(), data_rawa['pstkl'], data_rawa['pstkrv']) data_rawa['ps'] = np.where(data_rawa['ps'].isnull(), data_rawa['pstk'], data_rawa['ps']) data_rawa['ps'] = np.where(data_rawa['ps'].isnull(), 0, data_rawa['ps']) -data_rawa['txditc'] = data_rawa['txditc'].fillna(0) - # book equity -data_rawa['be'] = data_rawa['seq'] + data_rawa['txditc'] - data_rawa['ps'] +data_rawa['be'] = data_rawa['se'] + data_rawa['txditc'] - data_rawa['ps'] data_rawa['be'] = np.where(data_rawa['be'] > 0, data_rawa['be'], np.nan) # acc @@ -276,8 +281,9 @@ def ttm12(series, df): # np.nan] # data_rawa['cfp'] = np.select(condlist, choicelist, 
default=(data_rawa['ib']+data_rawa['dp'])/data_rawa['me']) -# ep -# data_rawa['ep'] = data_rawa['ib']/data_rawa['me'] +# ep, checked from Hou and change 'ME' from compustat to crsp +#data_rawa['ep'] = data_rawa['ib']/data_rawa['me'] +#data_rawa['ep_n'] = data_rawa['ib'] # ni data_rawa['csho_l1'] = data_rawa.groupby(['permno'])['csho'].shift(1) @@ -287,7 +293,7 @@ def ttm12(series, df): np.log(data_rawa['csho']*data_rawa['ajex']).replace(-np.inf, 0)- np.log(data_rawa['csho_l1']*data_rawa['ajex_l1']).replace(-np.inf, 0)) -# op +# op: the formula seems different from Hou Page 74? data_rawa['cogs0'] = np.where(data_rawa['cogs'].isnull(), 0, data_rawa['cogs']) data_rawa['xint0'] = np.where(data_rawa['xint'].isnull(), 0, data_rawa['xint']) data_rawa['xsga0'] = np.where(data_rawa['xsga'].isnull(), 0, data_rawa['xsga']) @@ -297,6 +303,8 @@ def ttm12(series, df): data_rawa['op'] = np.select(condlist, choicelist, default=(data_rawa['revt'] - data_rawa['cogs0'] - data_rawa['xsga0'] - data_rawa['xint0'])/data_rawa['be']) + + # rsup data_rawa['sale_l1'] = data_rawa.groupby(['permno'])['sale'].shift(1) # data_rawa['rsup'] = (data_rawa['sale']-data_rawa['sale_l1'])/data_rawa['me'] @@ -307,8 +315,9 @@ def ttm12(series, df): # lev # data_rawa['lev'] = data_rawa['lt']/data_rawa['me'] -# sp +# sp, checked # data_rawa['sp'] = data_rawa['sale']/data_rawa['me'] +#data_rawa['sp_n'] = data_rawa['sale'] # rd_sale data_rawa['rd_sale'] = data_rawa['xrd']/data_rawa['sale'] @@ -316,7 +325,7 @@ def ttm12(series, df): # rdm # data_rawa['rdm'] = data_rawa['xrd']/data_rawa['me'] -# adm hxz adm +# adm hxz adm, checked # data_rawa['adm'] = data_rawa['xad']/data_rawa['me'] # gma @@ -358,7 +367,7 @@ def ttm12(series, df): data_rawa['txt_l1'] = data_rawa.groupby(['permno'])['txt'].shift(1) data_rawa['chtx'] = (data_rawa['txt']-data_rawa['txt_l1'])/data_rawa['at_l1'] -# noa +# noa,checked data_rawa['noa'] = ((data_rawa['at']-data_rawa['che']-data_rawa['ivao'].fillna(0))- 
(data_rawa['at']-data_rawa['dlc'].fillna(0)-data_rawa['dltt'].fillna(0)-data_rawa['mib'].fillna(0) -data_rawa['pstk'].fillna(0)-data_rawa['ceq'])/data_rawa['at_l1']) @@ -578,6 +587,289 @@ def ttm12(series, df): data_rawa = data_rawa.drop(['herf'], axis=1) data_rawa = pd.merge(data_rawa, df_temp, how='left', on=['datadate', 'ffi49']) +################################## Added on 2020.10.29 ################################## +# bm +#data_rawa['bm'] = data_rawa['be'] / data_rawa['me'] +#data_rawa['bm_n'] = data_rawa['be'] + +# Bmj +data_rawa['be_per'] = data_rawa['be'] / data_rawa['csho'] +data_rawa['bmj'] = data_rawa['be_per'] / data_rawa['prc'] +############### *Q*: used prc as share price from crsp ########## + +# Cp +data_rawa['cf'] = data_rawa['ib'] + data_rawa['dp'] +#data_rawa['cp'] = data_rawa['cf'] / data_rawa['me'] + +# Dp +###### *Q* difference return with without divident + +# Dur +# me = data_rawa['me_comp'] + + +# Ebp +data_rawa['dvpa'] = np.where(data_rawa['dvpa'].isnull(), 0, data_rawa['dvpa']) +data_rawa['tstkp'] = np.where(data_rawa['tstkp'].isnull(), 0, data_rawa['tstkp']) +data_rawa['f_liab'] = data_rawa['dltt'] + data_rawa['dlc'] + data_rawa['pstk'] + data_rawa['dvpa'] - data_rawa['tstkp'] +data_rawa['f_asse'] = data_rawa['che'] +# net debt : = financial liabilities - financial assets. 
+data_rawa['n_debt'] = data_rawa['f_liab'] - data_rawa['f_asse'] +data_rawa['be'] = data_rawa['ceq'] + data_rawa['tstkp'] - data_rawa['dvpa'] +#data_rawa['ebp'] = (data_rawa['n_debt']+data_rawa['be']) / (data_rawa['n_debt']+data_rawa['me']) + + +# Em +#data_rawa['enteprs_v'] = data_rawa['me'] + data_rawa['dlc'] + data_rawa['dltt'] + data_rawa['pstkrv'] - data_rawa['che'] +#data_rawa['em'] = data_rawa['enteprs_v'] / data_rawa['oibdp'] + +############### Investment ############### +# Aci +data_rawa['ce'] = data_rawa['capx'] / data_rawa['sale'] +data_rawa['ce1'] = data_rawa['ce'].shift(1) +data_rawa['ce2'] = data_rawa['ce'].shift(2) +data_rawa['ce3'] = data_rawa['ce'].shift(3) +data_rawa['aci'] = data_rawa['ce']/ (data_rawa['ce1']+data_rawa['ce2']+data_rawa['ce3'])-1 + +# Cei +#data_rawa['lg_me'] = np.log(data_rawa['me']/data_rawa['me'].shift(6)) +#data_rawa['lg_ret'] = np.log(data_rawa['ret']*data_rawa['ret'].shift(1)*data_rawa['ret'].shift(2)*data_rawa['ret'].shift(3)*data_rawa['ret'].shift(5)*data_rawa['ret'].shift(6)) +#data_rawa['cei'] = data_rawa['lg_me'] - data_rawa['lg_ret'] + + +# Dac + + + +# dCoa +data_rawa['coa'] = data_rawa['act'] - data_rawa['che'] +data_rawa['dcoa'] = (data_rawa['coa']-data_rawa['coa'].shift(1)) / data_rawa['at'].shift(1) + + +# dBe +data_rawa['dBe'] = (data_rawa['ceq'] - data_rawa['ceq'].shift(1)) / data_rawa['at'].shift(1) + + +# dFnl & dFin +data_rawa['fna'] = data_rawa['ivst'] + data_rawa['ivao'] +data_rawa['fnl'] = data_rawa['dltt'] + data_rawa['dlc'] + data_rawa['pstk'] + +data_rawa['d_dlc'] = data_rawa['dlc'] - data_rawa['dlc'].shift(1) +data_rawa['d_dlc'] = np.where(data_rawa['d_dlc'].isnull(), 0, data_rawa['d_dlc']) +data_rawa['d_pstk'] = data_rawa['pstk'] - data_rawa['pstk'].shift(1) +data_rawa['d_pstk'] = np.where(data_rawa['d_pstk'].isnull(), 0, data_rawa['d_pstk']) + +data_rawa['dfnl'] = (data_rawa['dltt']-data_rawa['dltt'].shift(1)) + data_rawa['d_dlc'] + data_rawa['d_pstk'] + +data_rawa['d_ivst'] = data_rawa['ivst'] - 
data_rawa['ivst'].shift(1) +data_rawa['d_ivst'] = np.where(data_rawa['d_ivst'].isnull(), 0, data_rawa['d_ivst']) +data_rawa['d_ivao'] = data_rawa['ivao'] - data_rawa['ivao'].shift(1) +data_rawa['d_ivao'] = np.where(data_rawa['d_ivao'].isnull(), 0, data_rawa['d_ivao']) + +data_rawa['dfna'] = data_rawa['d_ivst'] + data_rawa['d_ivao'] +data_rawa['dfin'] = data_rawa['dfna'] - data_rawa['dfnl'] + +data_rawa['dfin'] = data_rawa['dfin'] / data_rawa['at'].shift(1) +data_rawa['dfnl'] = data_rawa['dfnl'] / data_rawa['at'].shift(1) + + + + +# dIi +data_rawa['e_invt'] = (data_rawa['capxv'] + data_rawa['capxv'].shift(1))/2 +data_rawa['dinvt'] = (data_rawa['capxv'] - data_rawa['e_invt']) / data_rawa['e_invt'] + +data_rawa['ind'] = data_rawa['capxv'] +s = data_rawa.groupby(['jdate', 'sic2'])['ind'].sum() +data_rawa = pd.merge(data_rawa, s, on=['jdate', 'sic2']) +# new industry investment will be named as ind_y, cause it's been grouped by ind +data_rawa['e_ind'] = (data_rawa['ind_y'] + data_rawa['ind_y'].shift(1))/2 +data_rawa['dind'] = (data_rawa['ind_y']-data_rawa['e_ind']) / data_rawa['e_ind'] +data_rawa['dIi'] = data_rawa['dinvt'] - data_rawa['dind'] + +# dLno +data_rawa['dlno'] = (data_rawa['ppent']-data_rawa['ppent'].shift(1)) + (data_rawa['intan']-data_rawa['intan'].shift(1)) + (data_rawa['ao']-data_rawa['ao'].shift(1)) - (data_rawa['lo']-data_rawa['lo'].shift(1)) + data_rawa['dp'] +avg_at = [] +for i in range(data_rawa.shape[0]): + avg_at.append(data_rawa.loc[0:i, 'at'].mean()) +data_rawa['avg_at'] = pd.DataFrame(avg_at) +data_rawa['dlno'] = data_rawa['dlno'] / data_rawa['avg_at'] + + +# dNco +data_rawa['nca'] = data_rawa['at'] - data_rawa['act'] - data_rawa['ivao'] +data_rawa['ncl'] = data_rawa['lt'] - data_rawa['lct'] - data_rawa['dltt'] +data_rawa['nco'] = data_rawa['nca'] - data_rawa['ncl'] +data_rawa['dnoc'] = data_rawa['nco'] - data_rawa['nco'].shift(1) + + +# dNca +data_rawa['ivao_0'] = np.where(data_rawa['ivao'].isnull(), 0, data_rawa['ivao']) +data_rawa['dltt_0'] 
= np.where(data_rawa['dltt'].isnull(), 0, data_rawa['dltt']) + +data_rawa['nca'] = data_rawa['at'] - data_rawa['act'] - data_rawa['ivao_0'] +data_rawa['ncl'] = data_rawa['lt'] - data_rawa['lct'] - data_rawa['dltt_0'] +data_rawa['nco'] = data_rawa['nca'] - data_rawa['ncl'] +data_rawa['dnca'] = data_rawa['nco'] - data_rawa['nco'].shift(1) + + + +# dNoa +data_rawa['dlc_0'] = np.where(data_rawa['dlc'].isnull(), 0, data_rawa['dlc']) +data_rawa['mib_0'] = np.where(data_rawa['mib'].isnull(), 0, data_rawa['mib']) +data_rawa['pstk_0'] = np.where(data_rawa['pstk'].isnull(), 0, data_rawa['pstk']) + +data_rawa['op_at'] = data_rawa['at'] - data_rawa['che'] +data_rawa['op_lia'] = data_rawa['at'] - data_rawa['dlc_0'] - data_rawa['dltt_0'] - data_rawa['mib_0'] - data_rawa['pstk_0'] - data_rawa['ceq'] +data_rawa['net_op'] = data_rawa['op_at'] - data_rawa['op_lia'] +data_rawa['dnoa'] = (data_rawa['net_op']-data_rawa['net_op'].shift(1))/ data_rawa['at'].shift(1) + + +# dPia +data_rawa['c_propty'] = data_rawa['ppegt'] - data_rawa['ppegt'].shift(1) +data_rawa['c_invt'] = data_rawa['invt'] - data_rawa['invt'].shift(1) +data_rawa['dpia'] = (data_rawa['c_propty'] + data_rawa['c_invt']) / data_rawa['at'].shift(1) + + + + + +######### Profitability ########## +# Ato +data_rawa['op_at'] = data_rawa['at'] - data_rawa['che'] - data_rawa['ivao_0'] +data_rawa['op_lia'] = data_rawa['dlc_0'] - data_rawa['dltt_0'] - data_rawa['mib_0'] - data_rawa['pstk_0'] - data_rawa['ceq'] +data_rawa['noa'] = data_rawa['op_at'] - data_rawa['op_lia'] +data_rawa['ato'] = data_rawa['sale'] / data_rawa['noa'].shift(1) + + +# Cla +data_rawa['d_rect'] = data_rawa['rect'] - data_rawa['rect'].shift(1) +data_rawa['d_invt'] = data_rawa['invt'] - data_rawa['invt'].shift(1) +data_rawa['d_xpp'] = data_rawa['xpp'] - data_rawa['xpp'].shift(1) +data_rawa['d_dr'] = (data_rawa['drc']-data_rawa['drc'].shift(1)) + (data_rawa['drlt']-data_rawa['drlt'].shift(1)) +data_rawa['d_ap'] = data_rawa['ap'] - data_rawa['ap'].shift(1) 
+data_rawa['d_xacc'] = data_rawa['xacc'] - data_rawa['xacc'].shift(1) + +data_rawa['xrd_0'] = np.where(data_rawa['xrd'].isnull(), 0, data_rawa['xrd']) +data_rawa['d_rect_0'] = np.where(data_rawa['d_rect'].isnull(), 0, data_rawa['d_rect']) +data_rawa['d_invt_0'] = np.where(data_rawa['d_invt'].isnull(), 0, data_rawa['d_invt']) +data_rawa['d_xpp_0'] = np.where(data_rawa['d_xpp'].isnull(), 0, data_rawa['d_xpp']) +data_rawa['d_dr_0'] = np.where(data_rawa['d_dr'].isnull(), 0, data_rawa['d_dr']) +data_rawa['d_ap_0'] = np.where(data_rawa['d_ap'].isnull(), 0, data_rawa['d_ap']) +data_rawa['d_xacc_0'] = np.where(data_rawa['d_xacc'].isnull(), 0, data_rawa['d_xacc']) + +data_rawa['cla'] = data_rawa['revt'] - data_rawa['cogs'] - data_rawa['xsga'] + data_rawa['xrd_0']\ + - data_rawa['d_rect_0'] - data_rawa['d_invt_0'] - data_rawa['d_xpp_0']\ + + data_rawa['d_dr_0'] + data_rawa['d_ap_0'] + data_rawa['d_xacc_0'] +data_rawa['cla'] = data_rawa['cla'] / data_rawa['at'].shift(1) + + +# Cop +data_rawa['cop'] = data_rawa['revt'] - data_rawa['cogs'] - data_rawa['xsga'] + data_rawa['xrd_0']\ + - data_rawa['d_rect_0'] - data_rawa['d_invt_0'] - data_rawa['d_xpp_0']\ + + data_rawa['d_dr_0'] + data_rawa['d_ap_0'] + data_rawa['d_xacc_0'] +data_rawa['cop'] = data_rawa['cop'] / data_rawa['at'] + + +# Cto +data_rawa['cto'] = data_rawa['sale'] / data_rawa['at'].shift(1) + +#ir +''' +First calculate r(t-5,t). 
Then rb(t-5,t) and use Bm to perform linear regression and get residue +''' +#r(t-5,t):sum ret from t-5 to t (which is calendar year t-6 to t-1) +lag = pd.DataFrame() +for i in range(1,6): + lag['ret%s' % i] = data_rawa.groupby(['permno'])['ret'].shift(i) + +data_rawa['ret5'] = lag['ret1']+lag['ret2']+lag['ret3']+lag['ret4']+lag['ret5'] + +#bm_t-5 (bm of year t-5) +data_rawa['bm5'] = data_rawa.groupby(['permno'])['bm'].shift(5) + +#rB (five year log book return) +#Reference: jf_06 page8 by KENT DANIEL +data_rawa['rB'] = data_rawa['bm'] - data_rawa['bm5'] + data_rawa['ret5'] + +#Regression and get ir +#First get unique datelist +datelist = data_rawa['jdate'].unique() +for date in datelist: + temp = data_rawa['jdate' == date] + n_row = temp.shape[0] + index = temp.index + X = pd.DataFrame() + X['bm5'] = temp['bm5'] + X['rB'] = temp['rB'] + X['intercept'] = 1 + X = X[['intercept','rB','bm5']] + X = np.mat(X) + Y = np.mat(temp[['ret5']]) + #These are residuals on one date + res = (np.identity(n_row) - X.dot(X.T.dot(X).I).dot(X.T)).dot(Y) + #put residuals back into data_rawa + data_rawa.loc[index,'ir'] = res + +#nop +#data_rawa['net_p'] = data_rawa['dvc'] + data_rawa['prstkc'] + 2*data_rawa['pstkrv'] - data_rawa['sstk'] +#data_rawa['nop'] = data_rawa['net_p'] / data_rawa['me'] +#data_rawa['nop'] = np.where(data_rawa['nop']<=0, np.nan, data_rawa['nop'] ) + +#ocp +#data_rawa['ocy'] = np.where(data_rawa['jdate'] < '1988-06-30', data_rawa['fopt'] - data_rawa['wcap'], data_rawa['fopt'] - data_rawa['oancf']) +#data_rawa['ocp'] = data_rawa['ocy'] / data_rawa['me'] +#data_rawa['ocp'] = np.where(data_rawa['ocp']<=0, np.nan, data_rawa['ocp'] ) + +#dwc +data_rawa['dwc'] = (data_rawa['act'] - data_rawa['che']) - (data_rawa['lct'] - data_rawa['dlc']) +#data_rawa['dwc'] = data_rawa['dwc']/data_rawa['at_l1'] + +#I/A +data_rawa['ia'] = (data_rawa['at']/data_rawa['at_l1'])-1 + +#Ig +data_rawa['capx_l1'] = data_rawa.groupby('permno')['capx'].shift(1) +data_rawa['ig'] = 
data_rawa['capx']/data_rawa['capx_l1'] + +#2Ig +data_rawa['capx_l2'] = data_rawa.groupby('permno')['capx'].shift(2) +data_rawa['2ig'] = data_rawa['capx']/data_rawa['capx_l2'] + +#Ivc +data_rawa['atAvg'] = (data_rawa['at']+data_rawa['at_l1'])/2 +data_rawa['ivc'] = data_rawa['invt'] / data_rawa['atAvg'] + +#Ndf +data_rawa['ndf'] = data_rawa['dltis'] - data_rawa['dltr'] + data_rawa['dlcch'] + +#nsi +data_rawa['sps'] = data_rawa['csho'] * data_rawa['ajex'] +data_rawa['sps_l1'] = data_rawa.groupby('permno')['sps'].shift(1) +data_rawa['nsi'] = np.log(data_rawa['sps']/data_rawa['sps_l1']) + +#oa +data_rawa['txp'] = np.where(data_rawa['txp'].isnull(), 0, data_rawa['txp']) +data_rawa['oa'] = (data_rawa['act'] - data_rawa['che']) - (data_rawa['lct'] - data_rawa['dlc'] - data_rawa['txp']) - data_rawa['dp'] + +#Poa +data_rawa['poa'] = data_rawa['oa']/data_rawa['ni'] + +#Ta +data_rawa['ta'] = data_rawa['dwc'] + data_rawa['dnco'] + data_rawa['dfin'] + +#Ol +data_rawa['ol'] = (data_rawa['cogs'] + data_rawa['xsga'])/data_rawa['at'] + +#etr +data_rawa['txtpi'] = data_rawa['txt'] / data_rawa['pi'] +data_rawa['txtpi_l1'] = data_rawa.groupby('permno')['txtpi'].shift(1) +data_rawa['txtpi_l2'] = data_rawa.groupby('permno')['txtpi'].shift(2) +data_rawa['txtpi_l3'] = data_rawa.groupby('permno')['txtpi'].shift(3) +data_rawa['deps'] = data_rawa['epspx']/(data_rawa['ajex'] * data_rawa['prcc_f']) +data_rawa['etr'] = (data_rawa['txtpi'] - (data_rawa['txtpi_l1'] + data_rawa['txtpi_l2'] + data_rawa['txtpi_l3'])/3) * data_rawa['deps'] + + ####################################################################################################################### # Compustat Quarterly Raw Info # ####################################################################################################################### @@ -711,8 +1003,11 @@ def ttm12(series, df): # data_rawq['ibq4']/data_rawq['me'], # (data_rawq['ibq4']+data_rawq['dpq4'])/data_rawq['me']) -# ep -# data_rawq['ep'] = 
data_rawq['ibq4']/data_rawq['me'] +# ep, also checked and change 'ME' from compustat to crsp +#data_rawq['ep'] = ttm4('ibq', data_rawq)/data_rawq['me'] +#data_rawq['ep_n'] = data_rawq['ep']*data_rawq['me'] +#data_rawa['ep'] = data_rawa['ib']/data_rawa['me'] +#data_rawa['ep_n'] = data_rawa['ib'] # agr data_rawq['agr'] = (data_rawq['atq']-data_rawq['atq_l4'])/data_rawq['atq_l4'] @@ -988,6 +1283,23 @@ def chars_std(start, end, df, chars): data_rawq = data_rawq.drop(['p_temp1', 'p_temp2', 'p_temp3', 'p_temp4', 'p_temp5', 'p_temp6', 'p_temp7', 'p_temp8', 'p_temp9'], axis=1) +################################## Added on 2020.10.29 ################################## +#Iaq +data_rawq['atqlag'] = ttm4('atq',data_rawq) +data_rawq['iaq'] = (data_rawq['atq']/data_rawq['atqlag'])-1 + +#Almq +data_rawq['intanq'] = np.where(data_rawq['intanq'].isnull(), 0, data_rawq['intanq']) +data_rawq['qal'] = data_rawq['cheq'] + 0.75*(data_rawq['actq']-data_rawq['cheq']) + 0.5*(data_rawq['atq'] - data_rawq['actq'] - data_rawq['intanq']) +data_rawq['mveqa'] = data_rawq['atq'] + data_rawq['mveq_f'] - data_rawq['ceqq'] +data_rawq['mveqa_1'] = data_rawq.groupby(['permno'])['mveqa'].shift(1) +data_rawq['almq'] = data_rawq['qal']/data_rawq['mveqa_1'] + +#Olq, needs atq +data_rawa['olq'] = (data_rawa['cogsq'] + data_rawa['xsgaq'])/data_rawa['atq'] + +# rds +data_rawq['rds'] = data_rawq['xrdq4']/data_rawq['saleq'] ####################################################################################################################### # Momentum # ####################################################################################################################### @@ -998,8 +1310,11 @@ def chars_std(start, end, df, chars): """) crsp_mom['permno'] = crsp_mom['permno'].astype(int) +crsp_mom['date'] = pd.to_datetime(crsp_mom['date']) crsp_mom['jdate'] = pd.to_datetime(crsp_mom['date']) + MonthEnd(0) crsp_mom = crsp_mom.dropna() +# populate the chars to monthly +crsp_mom['jdate'] = crsp_mom['date'] + 
MonthEnd(0) # add delisting return dlret = conn.raw_sql(""" @@ -1017,11 +1332,65 @@ def chars_std(start, end, df, chars): crsp_mom['ret'] = crsp_mom['ret'].fillna(0) crsp_mom['retadj'] = (1 + crsp_mom['ret']) * (1 + crsp_mom['dlret']) - 1 crsp_mom['me'] = crsp_mom['prc'].abs() * crsp_mom['shrout'] # calculate market equity +crsp_mom['retx'] = np.where(crsp_mom['me'].isnull(), 0, crsp_mom['retx']) +crsp_mom = crsp_mom.drop(['dlret', 'dlstdt', 'prc', 'shrout'], axis=1) + +#Seasonality + +#Rla +crsp_mom['rla'] = crsp_mom.groupby['permno']['ret'].shift(12) + +#Rln +lag = pd.DataFrame() +result = 0 +for i in range(1, 12): + lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) + result = result + lag['mom%s' % i] +crsp_mom['rln'] = result/11 + +#R[2,5]a +#R[2,5]n +lag = pd.DataFrame() +result = 0 +for i in range(13,61): + lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) + if i not in [24,36,48,60]: + result = result + lag['mom%s' % i] + +crsp_mom['r25a'] = (lag['mom24']+lag['mom36']+lag['mom48']+lag['mom60'])/4 +crsp_mom['r25n'] = result/44 + +#R[6,10]a +#R[6,10]n +lag = pd.DataFrame() +result = 0 +for i in range(61,121): + lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) + if i not in [72,84,96,108,120]: + result = result + lag['mom%s' % i] + +crsp_mom['r610a'] = (lag['mom72']+lag['mom84']+lag['mom96']+lag['mom108']+lag['mom120'])/5 +crsp_mom['r610n'] = result/55 + +#R[11,15]a +lag = pd.DataFrame() +result = 0 +for i in [132,144,156,168,180]: + lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) + result = result + lag['mom%s' % i] +crsp_mom['r1115a'] = result/5 + +#R[16,20]a +lag = pd.DataFrame() +result = 0 +for i in [192,204,216,228,240]: + lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) + result = result + lag['mom%s' % i] +crsp_mom['r1620a'] = result/5 def mom(start, end, df): """ - :param start: Order of starting lag :param end: Order of ending lag :param df: Dataframe @@ -1104,6 +1473,7 @@ def 
mom(start, end, df): # bm data_rawa['bm'] = data_rawa['be'] / data_rawa['me'] +data_rawa['bm_n'] = data_rawa['be'] # bm_ia df_temp = data_rawa.groupby(['datadate', 'ffi49'], as_index=False)['bm'].mean() @@ -1124,8 +1494,9 @@ def mom(start, end, df): np.nan] data_rawa['cfp'] = np.select(condlist, choicelist, default=(data_rawa['ib']+data_rawa['dp'])/data_rawa['me']) -# ep +# ep, checked from Hou and change 'ME' from compustat to crsp,checked data_rawa['ep'] = data_rawa['ib']/data_rawa['me'] +data_rawa['ep_n'] = data_rawa['ib'] # rsup # data_rawa['sale_l1'] = data_rawa.groupby(['permno'])['sale'].shift(1) @@ -1134,18 +1505,52 @@ def mom(start, end, df): # lev data_rawa['lev'] = data_rawa['lt']/data_rawa['me'] -# sp +# sp, checked data_rawa['sp'] = data_rawa['sale']/data_rawa['me'] +data_rawa['sp_n'] = data_rawa['sale'] # rdm data_rawa['rdm'] = data_rawa['xrd']/data_rawa['me'] -# adm hxz adm +# adm hxz adm,checked data_rawa['adm'] = data_rawa['xad']/data_rawa['me'] # dy data_rawa['dy'] = data_rawa['dvt']/data_rawa['me'] +# Cp +#data_rawa['cf'] = data_rawa['ib'] + data_rawa['dp'] +data_rawa['cp'] = data_rawa['cf'] / data_rawa['me'] + +# Ebp +#data_rawa['dvpa'] = np.where(data_rawa['dvpa'].isnull(), 0, data_rawa['dvpa']) +#data_rawa['tstkp'] = np.where(data_rawa['tstkp'].isnull(), 0, data_rawa['tstkp']) +#data_rawa['f_liab'] = data_rawa['dltt'] + data_rawa['dlc'] + data_rawa['pstk'] + data_rawa['dvpa'] - data_rawa['tstkp'] +#data_rawa['f_asse'] = data_rawa['che'] +# net debt : = financial liabilities - financial assets. 
+#data_rawa['n_debt'] = data_rawa['f_liab'] - data_rawa['f_asse'] +#data_rawa['be'] = data_rawa['ceq'] + data_rawa['tstkp'] - data_rawa['dvpa'] +data_rawa['ebp'] = (data_rawa['n_debt']+data_rawa['be']) / (data_rawa['n_debt']+data_rawa['me']) + +# Em +data_rawa['enteprs_v'] = data_rawa['me'] + data_rawa['dlc'] + data_rawa['dltt'] + data_rawa['pstkrv'] - data_rawa['che'] +data_rawa['em'] = data_rawa['enteprs_v'] / data_rawa['oibdp'] + +# Cei +data_rawa['lg_me'] = np.log(data_rawa['me']/data_rawa['me'].shift(6)) +data_rawa['lg_ret'] = np.log(data_rawa['ret']*data_rawa['ret'].shift(1)*data_rawa['ret'].shift(2)*data_rawa['ret'].shift(3)*data_rawa['ret'].shift(5)*data_rawa['ret'].shift(6)) +data_rawa['cei'] = data_rawa['lg_me'] - data_rawa['lg_ret'] + +#nop +data_rawa['net_p'] = data_rawa['dvc'] + data_rawa['prstkc'] + 2*data_rawa['pstkrv'] - data_rawa['sstk'] +data_rawa['nop'] = data_rawa['net_p'] / data_rawa['me'] +data_rawa['nop'] = np.where(data_rawa['nop']<=0, np.nan, data_rawa['nop'] ) + +#ocp +data_rawa['ocy'] = np.where(data_rawa['jdate'] < '1988-06-30', data_rawa['fopt'] - data_rawa['wcap'], data_rawa['fopt'] - data_rawa['oancf']) +data_rawa['ocp'] = data_rawa['ocy'] / data_rawa['me'] +data_rawa['ocp'] = np.where(data_rawa['ocp']<=0, np.nan, data_rawa['ocp'] ) + # Annual Accounting Variables chars_a = data_rawa[['cusip', 'ncusip', 'gvkey', 'permno', 'exchcd', 'shrcd', 'datadate', 'jdate', 'sic', 'acc', 'agr', 'bm', 'cfp', 'ep', 'ni', 'op', 'rsup', 'cash', 'chcsho', @@ -1156,7 +1561,12 @@ def mom(start, end, df): 'salecash', 'salerec', 'saleinv', 'pchsaleinv', 'realestate', 'obklg', 'chobklg', 'grltnoa', 'conv', 'chdrc', 'rdbias', 'operprof', 'capxint', 'xadint', 'chpm', 'ala', 'alm', 'mom1m', 'mom6m', 'mom12m', 'mom60m', 'mom36m', 'seas1a', 'me', 'hire', 'herf', 'bm_ia', - 'me_ia']] + 'me_ia','be', 'bmj','cp', 'ebp', 'em', 'ib', 'dp', 'dvpa', 'tstkp', 'dltt', 'dlc', + 'pstk', 'che', 'ceq', 'pstkrv', 'oibdp', 'aci', 'capx', 'sale', 'at', 'dpia', 'ppegt', + 
'invt', 'cei', 'dBe', 'dfnl', 'dfin', 'ivst', 'ivao', 'dcoa', 'act', + 'dlno', 'ppent', 'intan', 'ao', 'lo', 'dnoc', 'lt', 'lct', 'dnoa', 'mib', + 'cla', 'revt', 'cogs', 'xsga', 'xrd', 'rect', 'xpp', 'drc', 'drlt', 'ap', 'xacc', + 'cop', 'cto', 'dIi', 'sic2', 'capxv']] chars_a.reset_index(drop=True, inplace=True) ######################################## @@ -1170,8 +1580,10 @@ def mom(start, end, df): data_rawq['ibq4']/data_rawq['me'], (data_rawq['ibq4']+data_rawq['dpq4'])/data_rawq['me']) -# ep -data_rawq['ep'] = data_rawq['ibq4']/data_rawq['me'] +# ep, also checked and change 'ME' from compustat to crsp +#data_rawq['ep'] = data_rawq['ibq4']/data_rawq['me'] +data_rawq['ep'] = ttm4('ibq', data_rawq)/data_rawq['me'] +data_rawq['ep_n'] = data_rawq['ep']*data_rawq['me'] # lev data_rawq['lev'] = data_rawq['ltq']/data_rawq['me'] @@ -1202,6 +1614,13 @@ def mom(start, end, df): 'turn', 'dolvol']] chars_q.reset_index(drop=True, inplace=True) +# chars_a +#chars_a = pd.merge(crsp_mom, chars_a, how='left', on=['permno', 'jdate']) +#chars_a['datadate'] = chars_a.groupby(['permno'])['datadate'].fillna(method='ffill') +#chars_a = chars_a.groupby(['permno', 'datadate'], as_index=False).fillna(method='ffill') +#chars_a = chars_a[((chars_a['exchcd'] == 1) | (chars_a['exchcd'] == 2) | (chars_a['exchcd'] == 3)) & +# ((chars_a['shrcd'] == 10) | (chars_a['shrcd'] == 11))] + with open('chars_a_60.pkl', 'wb') as f: pkl.dump(chars_a, f) From c9433645d2ceea795337aed7d79a55f586d972f0 Mon Sep 17 00:00:00 2001 From: velonisa Date: Sat, 31 Oct 2020 22:25:04 +0800 Subject: [PATCH 02/15] Update accounting_60.py --- char60/accounting_60.py | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/char60/accounting_60.py b/char60/accounting_60.py index e205833..27b11b6 100644 --- a/char60/accounting_60.py +++ b/char60/accounting_60.py @@ -132,7 +132,7 @@ def ttm12(series, df): # Create a CRSP Subsample with Monthly Stock and Event Variables # 
Restrictions will be applied later # Select variables from the CRSP monthly stock and event datasets -crsp_m = conn.raw_sql(""" +crsp = conn.raw_sql(""" select a.prc, a.ret, a.retx, a.shrout, a.vol, a.cfacpr, a.cfacshr, a.date, a.permno, a.permco, b.ticker, b.ncusip, b.shrcd, b.exchcd from crsp.msf as a @@ -145,29 +145,29 @@ def ttm12(series, df): """) # change variable format to int -crsp_m[['permco', 'permno', 'shrcd', 'exchcd']] = crsp_m[['permco', 'permno', 'shrcd', 'exchcd']].astype(int) +crsp[['permco', 'permno', 'shrcd', 'exchcd']] = crsp[['permco', 'permno', 'shrcd', 'exchcd']].astype(int) # Line up date to be end of month -crsp_m['date'] = pd.to_datetime(crsp_m['date']) -crsp_m['monthend'] = crsp_m['date'] + MonthEnd(0) # set all the date to the standard end date of month +crsp['date'] = pd.to_datetime(crsp['date']) +crsp['monthend'] = crsp['date'] + MonthEnd(0) # set all the date to the standard end date of month # calculate market equity -crsp_m['me'] = crsp_m['prc'].abs() * crsp_m['shrout'] +crsp['me'] = crsp['prc'].abs() * crsp['shrout'] # if Market Equity is Nan then let return equals to 0 -crsp_m['ret'] = np.where(crsp_m['me'].isnull(), 0, crsp_m['ret']) -crsp_m['retx'] = np.where(crsp_m['me'].isnull(), 0, crsp_m['retx']) +crsp['ret'] = np.where(crsp['me'].isnull(), 0, crsp['ret']) +crsp['retx'] = np.where(crsp['me'].isnull(), 0, crsp['retx']) # impute me -crsp_m = crsp_m.sort_values(by=['permno', 'date']).drop_duplicates() -crsp_m['me'] = np.where(crsp_m['permno'] == crsp_m['permno'].shift(1), crsp_m['me'].fillna(method='ffill'), crsp_m['me']) +crsp = crsp.sort_values(by=['permno', 'date']).drop_duplicates() +crsp['me'] = np.where(crsp['permno'] == crsp['permno'].shift(1), crsp['me'].fillna(method='ffill'), crsp['me']) # sum of me across different permno belonging to same permco a given date -crsp_summe = crsp_m.groupby(['monthend', 'permco'])['me'].sum().reset_index() +crsp_summe = crsp.groupby(['monthend', 'permco'])['me'].sum().reset_index() # 
largest mktcap within a permco/date -crsp_maxme = crsp_m.groupby(['monthend', 'permco'])['me'].max().reset_index() +crsp_maxme = crsp.groupby(['monthend', 'permco'])['me'].max().reset_index() # join by monthend/maxme to find the permno -crsp1 = pd.merge(crsp_m, crsp_maxme, how='inner', on=['monthend', 'permco', 'me']) +crsp1 = pd.merge(crsp, crsp_maxme, how='inner', on=['monthend', 'permco', 'me']) # drop me column and replace with the sum me crsp1 = crsp1.drop(['me'], axis=1) # join with sum of me to get the correct market cap info @@ -588,10 +588,6 @@ def ttm12(series, df): data_rawa = pd.merge(data_rawa, df_temp, how='left', on=['datadate', 'ffi49']) ################################## Added on 2020.10.29 ################################## -# bm -#data_rawa['bm'] = data_rawa['be'] / data_rawa['me'] -#data_rawa['bm_n'] = data_rawa['be'] - # Bmj data_rawa['be_per'] = data_rawa['be'] / data_rawa['csho'] data_rawa['bmj'] = data_rawa['be_per'] / data_rawa['prc'] @@ -1473,7 +1469,7 @@ def mom(start, end, df): # bm data_rawa['bm'] = data_rawa['be'] / data_rawa['me'] -data_rawa['bm_n'] = data_rawa['be'] +#data_rawa['bm_n'] = data_rawa['be'] # bm_ia df_temp = data_rawa.groupby(['datadate', 'ffi49'], as_index=False)['bm'].mean() @@ -1496,7 +1492,7 @@ def mom(start, end, df): # ep, checked from Hou and change 'ME' from compustat to crsp,checked data_rawa['ep'] = data_rawa['ib']/data_rawa['me'] -data_rawa['ep_n'] = data_rawa['ib'] +#data_rawa['ep_n'] = data_rawa['ib'] # rsup # data_rawa['sale_l1'] = data_rawa.groupby(['permno'])['sale'].shift(1) @@ -1507,7 +1503,7 @@ def mom(start, end, df): # sp, checked data_rawa['sp'] = data_rawa['sale']/data_rawa['me'] -data_rawa['sp_n'] = data_rawa['sale'] +#data_rawa['sp_n'] = data_rawa['sale'] # rdm data_rawa['rdm'] = data_rawa['xrd']/data_rawa['me'] @@ -1583,7 +1579,7 @@ def mom(start, end, df): # ep, also checked and change 'ME' from compustat to crsp #data_rawq['ep'] = data_rawq['ibq4']/data_rawq['me'] data_rawq['ep'] = ttm4('ibq', 
data_rawq)/data_rawq['me'] -data_rawq['ep_n'] = data_rawq['ep']*data_rawq['me'] +#data_rawq['ep_n'] = data_rawq['ep']*data_rawq['me'] # lev data_rawq['lev'] = data_rawq['ltq']/data_rawq['me'] From 4e16b2f79db00e735b276b1c320d14e40eb191c9 Mon Sep 17 00:00:00 2001 From: velonisa Date: Sun, 1 Nov 2020 13:02:32 +0800 Subject: [PATCH 03/15] Update accounting_60.py --- char60/accounting_60.py | 71 +++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 42 deletions(-) diff --git a/char60/accounting_60.py b/char60/accounting_60.py index 27b11b6..dc4d70b 100644 --- a/char60/accounting_60.py +++ b/char60/accounting_60.py @@ -94,9 +94,6 @@ def ttm12(series, df): # sort and clean up comp = comp.sort_values(by=['gvkey', 'datadate']).drop_duplicates() -# # prep for clean-up and using time series of variables -comp['count'] = comp.groupby(['gvkey']).cumcount() # number of years in Compustat - # clean up csho comp['csho'] = np.where(comp['csho'] == 0, np.nan, comp['csho']) @@ -151,8 +148,8 @@ def ttm12(series, df): crsp['date'] = pd.to_datetime(crsp['date']) crsp['monthend'] = crsp['date'] + MonthEnd(0) # set all the date to the standard end date of month -# calculate market equity -crsp['me'] = crsp['prc'].abs() * crsp['shrout'] +crsp = crsp.dropna(subset=['prc']) +crsp['me'] = crsp['prc'].abs() * crsp['shrout'] # calculate market equity # if Market Equity is Nan then let return equals to 0 crsp['ret'] = np.where(crsp['me'].isnull(), 0, crsp['ret']) @@ -162,6 +159,12 @@ def ttm12(series, df): crsp = crsp.sort_values(by=['permno', 'date']).drop_duplicates() crsp['me'] = np.where(crsp['permno'] == crsp['permno'].shift(1), crsp['me'].fillna(method='ffill'), crsp['me']) +# Aggregate Market Cap +''' +There are cases when the same firm (permco) has two or more securities (permno) at same date. +For the purpose of ME for the firm, we aggregated all ME for a given permco, date. +This aggregated ME will be assigned to the permno with the largest ME. 
+''' # sum of me across different permno belonging to same permco a given date crsp_summe = crsp.groupby(['monthend', 'permco'])['me'].sum().reset_index() # largest mktcap within a permco/date @@ -198,15 +201,14 @@ def ttm12(series, df): ccm1 = pd.merge(comp, ccm, how='left', on=['gvkey']) # we can only get the accounting data after the firm public their report -# for annual data, we use 6 months lagged data +# for annual data, we use 5 or 6 months lagged data ccm1['yearend'] = ccm1['datadate'] + YearEnd(0) -ccm1['jdate'] = ccm1['yearend'] + MonthEnd(6) +ccm1['jdate'] = ccm1['datadate'] + MonthEnd(4) # set link date bounds ccm2 = ccm1[(ccm1['jdate'] >= ccm1['linkdt']) & (ccm1['jdate'] <= ccm1['linkenddt'])] # link comp and crsp -# data_rawa only includes annul data because comp is annual. Use inner merge crsp2 = crsp2.rename(columns={'monthend': 'jdate'}) data_rawa = pd.merge(crsp2, ccm2, how='inner', on=['permno', 'jdate']) @@ -219,14 +221,14 @@ def ttm12(series, df): Note: me is CRSP market equity, mve_f is Compustat market equity. Please choose the me below. 
''' data_rawa['me'] = data_rawa['me']/1000 # CRSP ME -data_rawa['me_comp'] = data_rawa['mve_f'] # Compustat ME +# data_rawa['me'] = data_rawa['mve_f'] # Compustat ME # there are some ME equal to zero since this company do not have price or shares data, we drop these observations data_rawa['me'] = np.where(data_rawa['me'] == 0, np.nan, data_rawa['me']) data_rawa = data_rawa.dropna(subset=['me']) # count single stock years -data_rawa['count'] = data_rawa.groupby(['gvkey']).cumcount() +# data_rawa['count'] = data_rawa.groupby(['gvkey']).cumcount() # deal with the duplicates data_rawa.loc[data_rawa.groupby(['datadate', 'permno', 'linkprim'], as_index=False).nth([0]).index, 'temp'] = 1 @@ -251,7 +253,7 @@ def ttm12(series, df): data_rawa['ps'] = np.where(data_rawa['ps'].isnull(), 0, data_rawa['ps']) # book equity -data_rawa['be'] = data_rawa['se'] + data_rawa['txditc'] - data_rawa['ps'] +data_rawa['be'] = data_rawa['seq'] + data_rawa['txditc'] - data_rawa['ps'] data_rawa['be'] = np.where(data_rawa['be'] > 0, data_rawa['be'], np.nan) # acc @@ -696,7 +698,7 @@ def ttm12(series, df): data_rawa['nca'] = data_rawa['at'] - data_rawa['act'] - data_rawa['ivao'] data_rawa['ncl'] = data_rawa['lt'] - data_rawa['lct'] - data_rawa['dltt'] data_rawa['nco'] = data_rawa['nca'] - data_rawa['ncl'] -data_rawa['dnoc'] = data_rawa['nco'] - data_rawa['nco'].shift(1) +data_rawa['dnco'] = data_rawa['nco'] - data_rawa['nco'].shift(1) # dNca @@ -731,11 +733,11 @@ def ttm12(series, df): ######### Profitability ########## -# Ato -data_rawa['op_at'] = data_rawa['at'] - data_rawa['che'] - data_rawa['ivao_0'] -data_rawa['op_lia'] = data_rawa['dlc_0'] - data_rawa['dltt_0'] - data_rawa['mib_0'] - data_rawa['pstk_0'] - data_rawa['ceq'] -data_rawa['noa'] = data_rawa['op_at'] - data_rawa['op_lia'] -data_rawa['ato'] = data_rawa['sale'] / data_rawa['noa'].shift(1) +# Ato,repeated +#data_rawa['op_at'] = data_rawa['at'] - data_rawa['che'] - data_rawa['ivao_0'] +#data_rawa['op_lia'] = data_rawa['dlc_0'] - 
data_rawa['dltt_0'] - data_rawa['mib_0'] - data_rawa['pstk_0'] - data_rawa['ceq'] +#data_rawa['noa'] = data_rawa['op_at'] - data_rawa['op_lia'] +#data_rawa['ato'] = data_rawa['sale'] / data_rawa['noa'].shift(1) # Cla @@ -999,11 +1001,8 @@ def ttm12(series, df): # data_rawq['ibq4']/data_rawq['me'], # (data_rawq['ibq4']+data_rawq['dpq4'])/data_rawq['me']) -# ep, also checked and change 'ME' from compustat to crsp -#data_rawq['ep'] = ttm4('ibq', data_rawq)/data_rawq['me'] -#data_rawq['ep_n'] = data_rawq['ep']*data_rawq['me'] -#data_rawa['ep'] = data_rawa['ib']/data_rawa['me'] -#data_rawa['ep_n'] = data_rawa['ib'] +# ep +# data_rawq['ep'] = data_rawq['ibq4']/data_rawq['me'] # agr data_rawq['agr'] = (data_rawq['atq']-data_rawq['atq_l4'])/data_rawq['atq_l4'] @@ -1549,7 +1548,7 @@ def mom(start, end, df): # Annual Accounting Variables chars_a = data_rawa[['cusip', 'ncusip', 'gvkey', 'permno', 'exchcd', 'shrcd', 'datadate', 'jdate', - 'sic', 'acc', 'agr', 'bm', 'cfp', 'ep', 'ni', 'op', 'rsup', 'cash', 'chcsho', + 'sic', 'retadj', 'acc', 'agr', 'bm', 'cfp', 'ep', 'ni', 'op', 'rsup', 'cash', 'chcsho', 'rd', 'cashdebt', 'pctacc', 'gma', 'lev', 'rdm', 'adm', 'sgr', 'sp', 'invest', 'roe', 'rd_sale', 'lgr', 'roa', 'depr', 'egr', 'chato', 'chtx', 'noa', 'rna', 'pm', 'ato', 'dy', 'roic', 'chinv', 'pchsale_pchinvt', 'pchsale_pchrect', 'pchgm_pchsale', 'pchsale_pchxsga', @@ -1557,12 +1556,9 @@ def mom(start, end, df): 'salecash', 'salerec', 'saleinv', 'pchsaleinv', 'realestate', 'obklg', 'chobklg', 'grltnoa', 'conv', 'chdrc', 'rdbias', 'operprof', 'capxint', 'xadint', 'chpm', 'ala', 'alm', 'mom1m', 'mom6m', 'mom12m', 'mom60m', 'mom36m', 'seas1a', 'me', 'hire', 'herf', 'bm_ia', - 'me_ia','be', 'bmj','cp', 'ebp', 'em', 'ib', 'dp', 'dvpa', 'tstkp', 'dltt', 'dlc', - 'pstk', 'che', 'ceq', 'pstkrv', 'oibdp', 'aci', 'capx', 'sale', 'at', 'dpia', 'ppegt', - 'invt', 'cei', 'dBe', 'dfnl', 'dfin', 'ivst', 'ivao', 'dcoa', 'act', - 'dlno', 'ppent', 'intan', 'ao', 'lo', 'dnoc', 'lt', 'lct', 
'dnoa', 'mib', - 'cla', 'revt', 'cogs', 'xsga', 'xrd', 'rect', 'xpp', 'drc', 'drlt', 'ap', 'xacc', - 'cop', 'cto', 'dIi', 'sic2', 'capxv']] + 'me_ia', 'bmj','cp', 'ebp', 'em', 'dp', 'aci', 'dpia', 'dBe', 'dfnl', 'dfin', 'dcoa', + 'dlno', 'dnoa', 'cla', 'cop', 'cto', 'dIi', 'dnco', 'dnca', 'ir', 'nop', 'ocp', + 'ia', 'ig','2ig','ivc','ndf','nsi','oa','poa','ta','ol','etr']] chars_a.reset_index(drop=True, inplace=True) ######################################## @@ -1576,10 +1572,8 @@ def mom(start, end, df): data_rawq['ibq4']/data_rawq['me'], (data_rawq['ibq4']+data_rawq['dpq4'])/data_rawq['me']) -# ep, also checked and change 'ME' from compustat to crsp -#data_rawq['ep'] = data_rawq['ibq4']/data_rawq['me'] -data_rawq['ep'] = ttm4('ibq', data_rawq)/data_rawq['me'] -#data_rawq['ep_n'] = data_rawq['ep']*data_rawq['me'] +# ep +data_rawq['ep'] = data_rawq['ibq4']/data_rawq['me'] # lev data_rawq['lev'] = data_rawq['ltq']/data_rawq['me'] @@ -1601,22 +1595,15 @@ def mom(start, end, df): data_rawq['sgrvol'] = chars_std(0, 15, data_rawq, 'rsup') # Quarterly Accounting Variables -chars_q = data_rawq[['gvkey', 'permno', 'datadate', 'jdate', 'sic', 'exchcd', 'shrcd', 'acc', 'bm', 'cfp', +chars_q = data_rawq[['gvkey', 'permno', 'datadate', 'jdate', 'sic', 'exchcd', 'shrcd','retadj' ,'acc', 'bm', 'cfp', 'ep', 'agr', 'ni', 'op', 'cash', 'chcsho', 'rd', 'cashdebt', 'pctacc', 'gma', 'lev', 'rdm', 'sgr', 'sp', 'invest', 'rd_sale', 'lgr', 'roa', 'depr', 'egr', 'roe', 'chato', 'chpm', 'chtx', 'noa', 'rna', 'pm', 'ato', 'stdcf', 'grltnoa', 'ala', 'alm', 'rsup', 'stdacc', 'sgrvol', 'roavol', 'scf', 'cinvest', 'mom1m', 'mom6m', 'mom12m', 'mom60m', 'mom36m', 'seas1a', 'me', 'pscore', 'nincr', - 'turn', 'dolvol']] + 'turn', 'dolvol', 'iaq', 'almq', 'olq', 'rds']] chars_q.reset_index(drop=True, inplace=True) -# chars_a -#chars_a = pd.merge(crsp_mom, chars_a, how='left', on=['permno', 'jdate']) -#chars_a['datadate'] = chars_a.groupby(['permno'])['datadate'].fillna(method='ffill') -#chars_a = 
chars_a.groupby(['permno', 'datadate'], as_index=False).fillna(method='ffill') -#chars_a = chars_a[((chars_a['exchcd'] == 1) | (chars_a['exchcd'] == 2) | (chars_a['exchcd'] == 3)) & -# ((chars_a['shrcd'] == 10) | (chars_a['shrcd'] == 11))] - with open('chars_a_60.pkl', 'wb') as f: pkl.dump(chars_a, f) From 68df1871b2275e8d7622d49096cb2992d3c77d43 Mon Sep 17 00:00:00 2001 From: velonisa Date: Sun, 1 Nov 2020 18:12:05 +0800 Subject: [PATCH 04/15] Update accounting_60.py --- char60/accounting_60.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/char60/accounting_60.py b/char60/accounting_60.py index dc4d70b..99ff3b4 100644 --- a/char60/accounting_60.py +++ b/char60/accounting_60.py @@ -1308,8 +1308,6 @@ def chars_std(start, end, df, chars): crsp_mom['date'] = pd.to_datetime(crsp_mom['date']) crsp_mom['jdate'] = pd.to_datetime(crsp_mom['date']) + MonthEnd(0) crsp_mom = crsp_mom.dropna() -# populate the chars to monthly -crsp_mom['jdate'] = crsp_mom['date'] + MonthEnd(0) # add delisting return dlret = conn.raw_sql(""" From a7de346492d4d6a94de2f38f734512ac693cae88 Mon Sep 17 00:00:00 2001 From: velonisa Date: Tue, 3 Nov 2020 21:53:46 +0800 Subject: [PATCH 05/15] Update Chars_Description_1011.csv --- Chars_Description_1011.csv | 251 ++++++++++++++++++------------------- 1 file changed, 125 insertions(+), 126 deletions(-) diff --git a/Chars_Description_1011.csv b/Chars_Description_1011.csv index f602421..870f9da 100755 --- a/Chars_Description_1011.csv +++ b/Chars_Description_1011.csv @@ -1,126 +1,125 @@ -Num,Acronym,Description,Author,Pub Year,Category -A.2.1,Bm,Book-to-market equity,"Rosenberg, Reid, and Lanstein ",1985,Value-versus-growth -A.2.2,Bmj,Book-to-June-end market equity, Asness and Frazzini ,2013,Value-versus-growth -A.2.3,Bmq12,Quarterly Book-to-market Equity (12-month holding period),"Rosenberg, Reid, and Lanstein ",1985,Value-versus-growth -A.2.9,Ep,Earnings-to-price,Basu,1983,Value-versus-growth -A.2.10,"Ep q 1, Ep q 6, and Ep q 12","Q Quarterly 
Earnings-to-price(1-month holding period), uarterly Earnings-to-price(6-month holding period), Quarterly Earnings-to-price(12-month holding period), ",Basu,1983,Value-versus-growth -A.2.12,Cp,Cash flow-to-price,"Lakonishok, Shleifer, and Vishny ",1994,Value-versus-growth -A.2.13,"Cpq1, Cpq6, Cpq12","Quarterly Cash Flow-to-price (1-month holding period), Quarterly Cash Flow-to-price (6-month holding period), Quarterly Cash Flow-to-price (12-month holding period)","Lakonishok, Shleifer, and Vishny",1994,Value-versus-growth -A.2.14,Dp(dy),Dividend yield,Litzenberger and Ramaswamy,1979,Value-versus-growth -A.2.16,Op and Nop, (Net) Payout Yield,"Richardson, and Roberts",2007,Value-versus-growth -A.2.20,Em,Enterprise multiple,Loughran and Wellman,2011,Value-versus-growth -A.2.21,"Emq1, Emq6","Quarterly Enterprise multiple (1-month holding period), Quarterly Enterprise multiple (6-month holding period)",Loughran and Wellman,2011,Value-versus-growth -A.2.22,Sp,Sales-to-price,"Barbee, Mukherji, and Raines",1996,Value-versus-growth -A.2.23,"Sp q 1, Sp q 6, and Sp q 12",Quarterly Sales-to-price,"Barbee, Mukherji, and Raines",1996,Value-versus-growth -A.2.24,Ocp,Operating Cash Flow-to-price,"Desai, Rajgopal, and Venkatachalam",2004,Value-versus-growth -A.2.26,Ir,Intangible Return,Daniel and Titman,2006,Value-versus-growth -A.2.28,Ebp,Enterprise Book-to-price,"Penman, Richardson, and Tuna",2007,Value-versus-growth -A.3.1,Aci,Abnormal Corporate Investment,"Titman, Wei, and Xie",2004,Investment -A.3.2,I/A, Investment-to-assets,"Cooper, Gulen, and Schill",2008,Investment -A.3.3,"Ia q 6, and Ia q 12",Quarterly Investment-to-assets,"Cooper, Gulen, and Schill",2008,Investment -A.3.4,dPia,Changes in PPE and Inventory-to-assets,"Lyandres, Sun, and Zhang",2008,Investment -A.3.5,Noa and dNoa,(Changes in) Net Operating Assets,"Hirshleifer, Hou, Teoh, and Zhang",2004,Investment -A.3.6,dLno,Changes in Long-term Net Operating Assets,"Fairfield, Whisenant, and Yohn",2003,Investment 
-A.3.7,Ig,Investment Growth,Xing,2008,Investment -A.3.8,2Ig,2-year Investment Growth,Anderson and Garcia-Feijoo,2006,Investment -A.3.10,Nsi,Net Stock Issues,Pontiff and Woodgate,2008,Investment -A.3.11,dIi,% Change in Investment - % Change in Industry Investment,Abarbanell and Bushee,1998,Investment -A.3.14,Ivg,Inventory Growth,Belo and Lin,2011,Investment -A.3.15,Ivc,Inventory Changes,Thomas and Zhang,2002,Investment -A.3.16,Oa(acc),Operating Accruals,Sloan,1996,Investment -A.3.17,Ta,Total Accruals,"Richardson, Sloan, Soliman, and Tuna",2005,Investment -A.3.18,dCoa,changes in Current Operating Assets,"Richardson, Sloan, Soliman, and Tuna",2005,Investment -A.3.19,dNca,changes in Non-current Operating Assets,"Richardson, Sloan, Soliman, and Tuna",2005,Investment -A.3.19,dNco,Changes in Net Non-current Operating Assets,"Richardson, Sloan, Soliman, and Tuna",2005,Investment -A.3.20,dFin,Changes in Net Financial Assets,"Richardson, Sloan, Soliman, and Tuna",2005,Investment -A.3.20,dFnl,changes in Financial Liabilities,"Richardson, Sloan, Soliman, and Tuna",2005,Investment -A.3.20,dBe,changes in Book Equity,"Richardson, Sloan, Soliman, and Tuna",2005,Investment -A.3.22,Poa,Percent operating accruals,"Hafzalla, Lundholm, and Van Winkle",2011,Investment -A.3.23,Pta,Percent total accruals,"Hafzalla, Lundholm, and Van Winkle",2011,Investment -A.3.24,Pda,Percent discretionary accruals,,,Investment -A.3.25,Ndf,Net debt finance,"Bradshaw, Richardson, and Sloan",2006,Investment -A.4.1,"Roe1, Roe6",Return on Equity,"Hou, Xue, and Zhang",2015,Profitability -A.4.2,"dRoe1, dRoe6, and dRoe12",Changes in Return on Equity,"Hou, Xue, and Zhang",2015,Profitability -A.4.3,Roa1,Return on Assets,"Balakrishnan, Bartov, and Faurel",2010,Profitability -A.4.4,"dRoa1, dRoa6",Changes in Return on Assets,"Balakrishnan, Bartov, and Faurel",2010,Profitability -A.4.5,Ato,Asset Turnover,Soliman,2008,Profitability -A.4.6,Cto,Capital Turnover,Haugen and Baker,1996,Profitability -A.4.7,"Rna q 1, Rna q 
6, Atoq1","Quarterly Return on Net Operating Assets, Quarterly Asset Turnover",Soliman,2008,Profitability -A.4.8,"Cto q1, Cto q6,",Quarterly Capital Turnover,Haugen and Baker,1996,Profitability -A.4.9,Gpa,Gross Profits-to-assets,Novy-Marx,2013,Profitability -A.4.11,"Gla q 1, Gla q 6, and Gla q 12",Quarterly Gross Profits-to-lagged Assets,,,Profitability -A.4.12,Ope,Operating Profits to Equity,Fama and French,2015,Profitability -A.4.14,"Ole q 1, Ole q 6 ",Quarterly Operating Profits-to-lagged Equity,,,Profitability -A.4.15,Opa,Operating Profits-to-assets,"Linnainmaa, and Nikolaev",2015,Profitability -A.4.17,"Ola q 1, Ola q 6, and Ola q 12",Quarterly Operating Profits-to-lagged Assets,,,Profitability -A.4.18,Cop,Cash-based Operating Profitability,"Gerakos, Linnainmaa, and Nikolaev",2016,Profitability -A.4.19,Cla,Cash-based Operating Profits-to-lagged Assets,,,Profitability -A.4.20,Claq,Quarterly Cash-based Operating Profits-to-lagged Assets,,,Profitability -A.4.29,Tbi q 12,Quarterly Taxable Income-to-book Income,"Green, Hand, and Zhang",2013,Profitability -A5.1,Oca and Ioca,(Industry-adjusted) Organizational Capital-to-assets,Eisfeldt and Papanikolaou,2013,Intangibles -A.5.2,Adm,Advertising Expense-to-market,"Chan, Lakonishok, and Sougiannis",2001,Intangibles -A.5.4,Rdm,R&D Expense-to-market,"Chan, Lakonishok, and Sougiannis",2001,Intangibles -A.5.5,"Rdm q 1, Rdm q 6, and Rdm q 12",Quarterly R&D Expense-to-market,"Chan, Lakonishok, and Sougiannis",2001,Intangibles -A.5.6,Rds q 6 and Rds q 12,Quarterly R&D Expense-to-sales,"Chan, Lakonishok, and Sougiannis",2001,Intangibles -A.5.8,Ol,Operating Leverage,Novy-Marx,2011,Intangibles -A.5.9,"Ol q 1, Ol q 6, and Ol q 12",Quarterly Operating Leverage,Novy-Marx,2011,Intangibles -A.5.11,Rca,R&D Capital-to-assets,Li,2011,Intangibles -A.5.24,Etr,Effective Tax Rate,Abarbanell and Bushee,1998,Intangibles -A.5.46,"Alm q 1, Alm q 6, and Alm q 12",Quarterly Asset Liquidity,Ortiz-Molina and Phillips,2014,Intangibles -A.5.50,"R a 1 , R 
n 1 , R a [2,5] , R n[2,5] , R a[6,10] , R n[6,10] , R a[11,15] , and R a[16,20]",Seasonality,Heston and Sadka,2008,Intangibles -A.6.8,Beta1,Market Beta,Fama and MacBeth,1973,Frictions -A.6.13,Dtv12,"dollar trading volume, 12-month holding period","Brennan, Chordia, and Subrahmanyam",1998,Frictions -A.6.21,Isff1,"idiosyncratic skewness estimated from the Fama-French 3-factor model, 1-month holding period",Harvey and Siddique,2000,Frictions -A.6.22,Isq1,"idiosyncratic skewness estimated from the q-factor model, 1-month holding period",Harvey and Siddique,2000,Frictions -A.6.3,Ivff1,"idiosyncratic volatility estimated from the Fama-French 3-factor model, 1-month holding period","Ang, Hodrick, Xing, and Zhang",2006,Frictions -A.6.5,Ivq1,"idiosyncratic volatility estimated from the q-factor model, 1-month holding period","Ang, Hodrick, Xing, and Zhang",2006,Frictions -A.6.1,Me,the market equity,Banz,1981,Frictions -A.6.24,Srev,short-term reversal,Jegadeesh,1990,Frictions -A.6.7,Sv1,"systematic volatility, 1-month holding period","Ang, Hodrick, Xing, and Zhang",2006,Frictions -A.6.6,Tv1,"total volatility, 1-month holding period","Ang, Hodrick, Xing, and Zhang",2006,Frictions -A.1.2,Abr1,"cumulative abnormal returns around earnings announcement dates, 1-month holding period","Chan, Jegadeesh, and Lakonishok",1996,Momentum -A.1.2,Abr6,"cumulative abnormal returns around earnings announcement dates, 6-month holding period","Chan, Jegadeesh, and Lakonishok",1996,Momentum -A.1.2,Abr12,"cumulative abnormal returns around earnings announcement dates, 12-month holding period","Chan, Jegadeesh, and Lakonishok",1996,Momentum -,abr,Cumulative Abnormal Returns Around Earnings Announcement Dates,"We calculate cumulative abnormal stock return (Abr) around the latest quarterly earnings announcement date (Compustat quarterly item RDQ) (Chan, Jegadeesh, and Lakonishok 1996)): Abr_i = \sum_{d=-2}^{+1}r_{id}-r{md}, in which r id is stock i’s return on day d (with the earnings announced on 
day 0) and r md is the market index return. We cumulate returns until one (trading) day after the announcement date to account for the one-day-delayed reaction to earnings news. r md is the value-weighted market return for the Abr deciles with NYSE breakpoints and value-weighted returns, but is the equal-weighted market return with all-but-micro breakpoints and equal-weighted returns.",, -,agr,Asset growth,Annual percent change in total assets (at),, -,baspread,Bid-ask spread rolling 3m,Monthly average of daily bid-ask spread divided by average of daily spread,, -,beta,Beta rolling 3m,Estimated market beta from daily returns,, -,bm_ia,Industry-adjusted book to market,Industry adjusted book-to-market ratio,, -,cash,Cash holdings,Cash and cash equivalents divided by average total assets,, -,cashdebt,Cash flow to debt,Earnings before depreciation and extraordinary items (ib+dp) divided by avg. total liabilities (lt),, -,chcsho,Change in shares outstanding,Annual percent change in shares outstanding (csho),, -,chpm,Industry-adjusted change in profit margin,2-digit SIC - fiscal-year mean adjusted change in income before extraordinary items (ib) divided by sales (sale),, -,chtx,Change in tax expense,Percent change in total taxes (txtq) from quartert-4 to t,, -,cinvest,Corporate investment,"Change over one quarter in net PP&E (ppentq) divided by sales (saleq) - average of this variable for prior 3 quarters; if saleq = 0, then scale by 0.01",, -,depr,Depreciation / PP&E,Depreciation divided by PP&E,, -,dolvol,Dollar trading volume,Natural log of trading volume times price per share from month t-2,, -,gma,Gross profitability,Revenues (revt) minus cost of goods sold (cogs) divided by lagged total assets (at),, -,grltnoa,Growth in long-term net operating assets,Growth in long-term net operating assets,, -,herf,Industry sales concentration,2-digit SIC - fiscal-year sales concentration (sum of squared percent of sales in industry for each company).,, -,hire,Employee growth 
rate,Percent change in number of employees (emp),, -,ill,Illiquidity rolling 3m,Average of daily (absolute return / dollar volume).,, -,lev,Leverage,Total liabilities (lt) divided by fiscal year-end market capitalization,, -,lgr,Growth in long-term debt,Annual percent change in total liabilities (lt),, -,maxret,Maximum daily returns rolling 3m,Maximum daily return from returns during calendar montht-1,, -,me_ia,Industry-adjusted size,2-digit SIC industry-adjusted fiscal year-end market capitalization,, -,mom12m,Momentum rolling 12m,11-month cumulative returns ending one month before month end,, -,mom1m,Momentum ,1-month cumulative return,, -,mom36m,Momentum rolling 36m,Cumulative returns from monthst-36 to t-13,, -,mom60m,Momentum rolling 60m,Cumulative returns from monthst-60 to t-13,, -,mom6m,Momentum rolling 6m,5-month cumulative returns ending one month before month end,, -,nincr,Number of earnings increases,Number of consecutive quarters (up to eight quarters) with an increase in earnings (ibq) over same quarter in the prior year,, -,op,Operating profitability,"Following Fama and French (2015), we measure operating profitability to equity, Ope, as total revenue (Compustat annual item REVT) minus cost of goods sold (item COGS, zero if missing), minus selling, general, and administrative expenses (item XSGA, zero if missing), and minus interest expense (item XINT, zero if missing), scaled by book equity (the denominator is current, not lagged, book equity). We require at least one of the three expense items (COGS, XSGA, and XINT) to be non-missing. Book equity is stockholders’ book equity, plus balance sheet deferred taxes and investment tax credit (item TXDITC) if available, minus the book value of preferred stock. Stockholders’ equity is the value reported by Compustat (item SEQ), if it is available. 
If not, we measure stockholders’ equity as the book value of common equity (item CEQ) plus the par value of preferred stock (item PSTK), or the book value of assets (item AT) minus total liabilities (item LT). Depending on availability, we use redemption (item PSTKRV), liquidating (item PSTKL), or par value (item PSTK) for the book value of preferred stock.",, -,pm,profit margin,"Soliman (2008) use DuPont analysis to decompose Roe as Rna + FLEV × SPREAD, in which Roe is return on equity, Rna is return on net operating assets, FLEV is financial leverage, and SPREAD is the difference between return on net operating assets and borrowing costs. We can further decompose Rna as Pm × Ato, in which Pm is profit margin and Ato is asset turnover. Pm is operating income after depreciation divided by sales (item SALE) for the fiscal year ending in calendar year t − 1.",, -,pscore,Performance Score,Sum of 9 indicator variables to form fundamental health score,, -,rd_sale,R&D to sales,R&D expense divided by sales (xrd/sale),, -,re,Revisions in analysts’ earnings forecasts,"Following Chan, Jegadeesh, and Lakonishok (1996), we measure earnings surprise as the revisions in analysts’ forecasts of earnings obtained from the Institutional Brokers’ Estimate System (IBES). 
Because analysts’ forecasts are not necessarily revised each month, we construct a six-month moving average of past changes in analysts’ forecasts: RE_{it}=\sum_{\iota=1}^6\frac{",, -,rsup,Revenue surprise,Sales from quarter t minus sales from quarter t-4 (saleq) divided by fiscal-quarter-end market capitalization (cshoq * prccq),, -,rvar_capm,Residual variance - CAPM rolling 3m,Daily Stock residual variance of CAPM,, -,rvar_ff3,Residual variance - ff3 rolling 3m,Daily Stock residual variance of Fama French 3 factors,, -,rvar_mean,return variance rolling 3m,Daily Stock return variance,, -,sgr,Sales growth,Annual percent change in sales (sale),, -,std_dolvol,Std of dollar trading volume rolling 3m,Monthly standard deviation of daily dollar trading volume,, -,std_turn,Std. of Share turnover rolling 3m,Monthly standard deviation of daily share turnover,, -,sue,Unexpected quarterly earnings,"Unexpected quarterly earnings divided by fiscal-quarter-end market cap. Unexpected earnings is I/B/E/S actual earnings minus median forecasted earnings if available, else it is the seasonally differenced quarterly earnings before extraordinary items from Compustat quarterly file",, -,turn,Shares turnover,Average monthly trading volume for most recent 3 months scaled by number of shares outstanding in current month,, -,zerotrade,Number of zero-trading days rolling 3m,Turnover weighted number of zero trading days for most recent 1 month,, \ No newline at end of file +Num,Acronym,Description,Author,Pub Year,Category,,, +A.2.1,Bm,Book-to-market equity,"Rosenberg, Reid, and Lanstein ",1985,Value-versus-growth,,, +A.2.2,Bmj,Book-to-June-end market equity, Asness and Frazzini ,2013,Value-versus-growth,,, +A.2.3,Bmq12,Quarterly Book-to-market Equity (12-month holding period),"Rosenberg, Reid, and Lanstein ",1985,Value-versus-growth,,, +A.2.9,Ep,Earnings-to-price,Basu,1983,Value-versus-growth,,,/Users/zhulingqiao/Downloads/chars_list_20201102.csv +A.2.10,"Ep q 1, Ep q 6, and Ep q 12","Q 
Quarterly Earnings-to-price(1-month holding period), Quarterly Earnings-to-price(6-month holding period), Quarterly Earnings-to-price(12-month holding period), ",Basu,1983,Value-versus-growth,,,
+A.2.12,Cp,Cash flow-to-price,"Lakonishok, Shleifer, and Vishny ",1994,Value-versus-growth,,,
+A.2.13,"Cpq1, Cpq6, Cpq12","Quarterly Cash Flow-to-price (1-month holding period), Quarterly Cash Flow-to-price (6-month holding period), Quarterly Cash Flow-to-price (12-month holding period)","Lakonishok, Shleifer, and Vishny",1994,Value-versus-growth,,,
+A.2.14,Dp(dy),Dividend yield,Litzenberger and Ramaswamy,1979,Value-versus-growth,,,
+A.2.16,Op and Nop, (Net) Payout Yield,"Richardson, and Roberts",2007,Value-versus-growth,,,
+A.2.20,Em,Enterprise multiple,Loughran and Wellman,2011,Value-versus-growth,,,
+A.2.21,"Emq1, Emq6","Quarterly Enterprise multiple (1-month holding period), Quarterly Enterprise multiple (6-month holding period)",Loughran and Wellman,2011,Value-versus-growth,,,
+A.2.22,Sp,Sales-to-price,"Barbee, Mukherji, and Raines",1996,Value-versus-growth,,,
+A.2.23,"Sp q 1, Sp q 6, and Sp q 12",Quarterly Sales-to-price,"Barbee, Mukherji, and Raines",1996,Value-versus-growth,,,
+A.2.24,Ocp,Operating Cash Flow-to-price,"Desai, Rajgopal, and Venkatachalam",2004,Value-versus-growth,,,
+A.2.26,Ir,Intangible Return,Daniel and Titman,2006,Value-versus-growth,,,
+A.2.28,Ebp,Enterprise Book-to-price,"Penman, Richardson, and Tuna",2007,Value-versus-growth,,,
+A.3.1,Aci,Abnormal Corporate Investment,"Titman, Wei, and Xie",2004,Investment,,,
+A.3.2,I/A, Investment-to-assets,"Cooper, Gulen, and Schill",2008,Investment,,,
+A.3.3,"Ia q 6, and Ia q 12",Quarterly Investment-to-assets,"Cooper, Gulen, and Schill",2008,Investment,,,
+A.3.4,dPia,Changes in PPE and Inventory-to-assets,"Lyandres, Sun, and Zhang",2008,Investment,,,
+A.3.5,Noa and dNoa,(Changes in) Net Operating Assets,"Hirshleifer, Hou, Teoh, and Zhang",2004,Investment,,,
+A.3.6,dLno,Changes in Long-term Net Operating 
Assets,"Fairfield, Whisenant, and Yohn",2003,Investment,,, +A.3.7,Ig,Investment Growth,Xing,2008,Investment,,, +A.3.8,2Ig,2-year Investment Growth,Anderson and Garcia-Feijoo,2006,Investment,,, +A.3.10,Nsi,Net Stock Issues,Pontiff and Woodgate,2008,Investment,,, +A.3.11,dIi,% Change in Investment - % Change in Industry Investment,Abarbanell and Bushee,1998,Investment,,, +A.3.14,Ivg,Inventory Growth,Belo and Lin,2011,Investment,,, +A.3.15,Ivc,Inventory Changes,Thomas and Zhang,2002,Investment,,, +A.3.16,Oa(acc),Operating Accruals,Sloan,1996,Investment,,, +A.3.17,Ta,Total Accruals,"Richardson, Sloan, Soliman, and Tuna",2005,Investment,,, +A.3.18,dCoa,changes in Current Operating Assets,"Richardson, Sloan, Soliman, and Tuna",2005,Investment,,, +A.3.19,dNca,changes in Non-current Operating Assets,"Richardson, Sloan, Soliman, and Tuna",2005,Investment,,, +A.3.19,dNco,Changes in Net Non-current Operating Assets,"Richardson, Sloan, Soliman, and Tuna",2005,Investment,,, +A.3.20,dFin,Changes in Net Financial Assets,"Richardson, Sloan, Soliman, and Tuna",2005,Investment,,, +A.3.20,dFnl,changes in Financial Liabilities,"Richardson, Sloan, Soliman, and Tuna",2005,Investment,,, +A.3.20,dBe,changes in Book Equity,"Richardson, Sloan, Soliman, and Tuna",2005,Investment,,, +A.3.22,Poa,Percent operating accruals,"Hafzalla, Lundholm, and Van Winkle",2011,Investment,,, +A.3.23,Pta,Percent total accruals,"Hafzalla, Lundholm, and Van Winkle",2011,Investment,,, +A.3.24,Pda,Percent discretionary accruals,,,Investment,,, +A.3.25,Ndf,Net debt finance,"Bradshaw, Richardson, and Sloan",2006,Investment,,, +A.4.1,"Roe1, Roe6",Return on Equity,"Hou, Xue, and Zhang",2015,Profitability,,, +A.4.2,"dRoe1, dRoe6, and dRoe12",Changes in Return on Equity,"Hou, Xue, and Zhang",2015,Profitability,,, +A.4.3,Roa1,Return on Assets,"Balakrishnan, Bartov, and Faurel",2010,Profitability,,, +A.4.4,"dRoa1, dRoa6",Changes in Return on Assets,"Balakrishnan, Bartov, and Faurel",2010,Profitability,,, +A.4.5,Ato,Asset 
Turnover,Soliman,2008,Profitability,,, +A.4.6,Cto,Capital Turnover,Haugen and Baker,1996,Profitability,,, +A.4.7,"Rna q 1, Rna q 6, Atoq1","Quarterly Return on Net Operating Assets, Quarterly Asset Turnover",Soliman,2008,Profitability,,, +A.4.8,"Cto q1, Cto q6,",Quarterly Capital Turnover,Haugen and Baker,1996,Profitability,,, +A.4.9,Gpa,Gross Profits-to-assets,Novy-Marx,2013,Profitability,,, +A.4.11,"Gla q 1, Gla q 6, and Gla q 12",Quarterly Gross Profits-to-lagged Assets,,,Profitability,,, +A.4.12,Ope,Operating Profits to Equity,Fama and French,2015,Profitability,,, +A.4.14,"Ole q 1, Ole q 6 ",Quarterly Operating Profits-to-lagged Equity,,,Profitability,,, +A.4.15,Opa,Operating Profits-to-assets,"Linnainmaa, and Nikolaev",2015,Profitability,,, +A.4.17,"Ola q 1, Ola q 6, and Ola q 12",Quarterly Operating Profits-to-lagged Assets,,,Profitability,,, +A.4.18,Cop,Cash-based Operating Profitability,"Gerakos, Linnainmaa, and Nikolaev",2016,Profitability,,, +A.4.19,Cla,Cash-based Operating Profits-to-lagged Assets,,,Profitability,,, +A.4.20,Claq,Quarterly Cash-based Operating Profits-to-lagged Assets,,,Profitability,,, +A.4.29,Tbi q 12,Quarterly Taxable Income-to-book Income,"Green, Hand, and Zhang",2013,Profitability,,, +A5.1,Oca and Ioca,(Industry-adjusted) Organizational Capital-to-assets,Eisfeldt and Papanikolaou,2013,Intangibles,,, +A.5.2,Adm,Advertising Expense-to-market,"Chan, Lakonishok, and Sougiannis",2001,Intangibles,,, +A.5.4,Rdm,R&D Expense-to-market,"Chan, Lakonishok, and Sougiannis",2001,Intangibles,,, +A.5.5,"Rdm q 1, Rdm q 6, and Rdm q 12",Quarterly R&D Expense-to-market,"Chan, Lakonishok, and Sougiannis",2001,Intangibles,,, +A.5.6,Rds q 6 and Rds q 12,Quarterly R&D Expense-to-sales,"Chan, Lakonishok, and Sougiannis",2001,Intangibles,,, +A.5.8,Ol,Operating Leverage,Novy-Marx,2011,Intangibles,,, +A.5.9,"Ol q 1, Ol q 6, and Ol q 12",Quarterly Operating Leverage,Novy-Marx,2011,Intangibles,,, +A.5.11,Rca,R&D Capital-to-assets,Li,2011,Intangibles,,, 
+A.5.24,Etr,Effective Tax Rate,Abarbanell and Bushee,1998,Intangibles,,, +A.5.46,"Alm q 1, Alm q 6, and Alm q 12",Quarterly Asset Liquidity,Ortiz-Molina and Phillips,2014,Intangibles,,, +A.5.50,"R a 1 , R n 1 , R a [2,5] , R n[2,5] , R a[6,10] , R n[6,10] , R a[11,15] , and R a[16,20]",Seasonality,Heston and Sadka,2008,Intangibles,,, +A.6.8,Beta1,Market Beta,Fama and MacBeth,1973,Frictions,,, +A.6.13,Dtv12,"dollar trading volume, 12-month holding period","Brennan, Chordia, and Subrahmanyam",1998,Frictions,,, +A.6.21,Isff1,"idiosyncratic skewness estimated from the Fama-French 3-factor model, 1-month holding period",Harvey and Siddique,2000,Frictions,,, +A.6.22,Isq1,"idiosyncratic skewness estimated from the q-factor model, 1-month holding period",Harvey and Siddique,2000,Frictions,,, +A.6.3,Ivff1,"idiosyncratic volatility estimated from the Fama-French 3-factor model, 1-month holding period","Ang, Hodrick, Xing, and Zhang",2006,Frictions,,, +A.6.5,Ivq1,"idiosyncratic volatility estimated from the q-factor model, 1-month holding period","Ang, Hodrick, Xing, and Zhang",2006,Frictions,,, +A.6.1,Me,the market equity,Banz,1981,Frictions,,, +A.6.24,Srev,short-term reversal,Jegadeesh,1990,Frictions,,, +A.6.7,Sv1,"systematic volatility, 1-month holding period","Ang, Hodrick, Xing, and Zhang",2006,Frictions,,, +A.6.6,Tv1,"total volatility, 1-month holding period","Ang, Hodrick, Xing, and Zhang",2006,Frictions,,, +A.1.2,Abr1,"cumulative abnormal returns around earnings announcement dates, 1-month holding period","Chan, Jegadeesh, and Lakonishok",1996,Momentum,,, +A.1.2,Abr6,"cumulative abnormal returns around earnings announcement dates, 6-month holding period","Chan, Jegadeesh, and Lakonishok",1996,Momentum,,, +A.1.2,Abr12,"cumulative abnormal returns around earnings announcement dates, 12-month holding period","Chan, Jegadeesh, and Lakonishok",1996,Momentum,,, +,agr,Asset growth,"Cooper, Gulen & Schill",2008,,,, +,baspread,Bid-ask spread rolling 3m,Amihud & 
Mendelson,1989,,,, +,beta,Beta rolling 3m,Fama & MacBeth,1973,,,, +,bm_ia,Industry-adjusted book to market,"Asness, Porter & Stevens",2000,,,, +,cash,Cash holdings,Palazzo,2012,,,, +,cashdebt,Cash flow to debt,Ou & Penman,1989,,,, +,chcsho,Change in shares outstanding,Pontiff & Woodgate,2008,,,, +,chpm(chpmia),Industry-adjusted change in profit margin,Soliman,2008,,,, +,chtx,Change in tax expense,Thomas & Zhang,2011,,,, +,cinvest,Corporate investment,"Titman, Wei & Xie",2004,,,, +,depr,Depreciation / PP&E,Holthausen & Larcker,1992,,,, +,dolvol,Dollar trading volume,"Chordia, Subrahmanyam & Anshuman",2001,,,, +,gma,Gross profitability,Novy-Marx,2013,,,, +,grltnoa,Growth in long-term net operating assets,"Fairfield, Whisenant & Yohn",2003,,,, +,herf,Industry sales concentration,Hou & Robinson,2006,,,, +,hire,Employee growth rate,"Bazdresch, Belo & Lin",2014,,,, +,ill,Illiquidity rolling 3m,Amihud,2002,,,, +,lev,Leverage,Bhandari,1988,,,, +,lgr,Growth in long-term debt,"Richardson, Sloan, Soliman & Tuna",2005,,,, +,maxret,Maximum daily returns rolling 3m,"Bali, Cakici & Whitelaw",2011,,,, +,me_ia(mve_ia),Industry-adjusted size,"Asness, Porter & Stevens",2000,,,, +,mom12m,Momentum rolling 12m,Jegadeesh,1990,,,, +,mom1m,Momentum ,Jegadeesh & Titman,1993,,,, +,mom36m,Momentum rolling 36m,Jegadeesh & Titman,1993,,,, +,mom60m,Momentum rolling 60m,Jegadeesh & Titman,1993,,,, +,mom6m,Momentum rolling 6m,Jegadeesh & Titman,1993,,,, +,nincr,Number of earnings increases,"Barth, Elliott & Finn",1999,,,, +,op(operprof),Operating profitability,Fama and French,2015,,,, +A.4.5,pm,profit margin,Soliman,2008,Profitability,,, +,pscore(ps),Performance Score,Piotroski,2000,,,, +,rd_sale,R&D to sales,"Guo, Lev & Shi",2006,,,, +,re,Revisions in analysts’ earnings forecasts,"Chan, Jegadeesh, and Lakonishok",1996,,,, +,rsup,Revenue surprise,Kama,2009,,,, +,rvar_capm,Residual variance - CAPM rolling 3m,Daily Stock residual variance of CAPM,,,,, +,rvar_ff3,Residual variance - ff3 rolling 
3m,Daily Stock residual variance of Fama French 3 factors,,,,, +,rvar_mean,return variance rolling 3m,Daily Stock return variance,,,,, +,sgr,Sales growth,"Lakonishok, Shleifer & Vishny",1994,,,, +,std_dolvol,Std of dollar trading volume rolling 3m,"Chordia, Subrahmanyam & Anshuman",2001,,,, +,std_turn,Std. of Share turnover rolling 3m,"Chordia, Subrahmanyam, &Anshuman",2001,,,, +,sue,Unexpected quarterly earnings,"Rendelman, Jones & Latane",1982,,,, +,turn,Shares turnover,"Datar, Naik & Radcliffe",1998,,,, +,zerotrade,Number of zero-trading days rolling 3m,Liu,2006,,,, \ No newline at end of file From 49dac6051ddf58e175e8b50fe470ceaae6b9f9fa Mon Sep 17 00:00:00 2001 From: velonisa Date: Wed, 11 Nov 2020 16:39:49 +0800 Subject: [PATCH 06/15] Update Chars_Description_1011.csv --- Chars_Description_1011.csv | 250 ++++++++++++++++++------------------- 1 file changed, 125 insertions(+), 125 deletions(-) diff --git a/Chars_Description_1011.csv b/Chars_Description_1011.csv index 870f9da..18af30e 100755 --- a/Chars_Description_1011.csv +++ b/Chars_Description_1011.csv @@ -1,125 +1,125 @@ -Num,Acronym,Description,Author,Pub Year,Category,,, -A.2.1,Bm,Book-to-market equity,"Rosenberg, Reid, and Lanstein ",1985,Value-versus-growth,,, -A.2.2,Bmj,Book-to-June-end market equity, Asness and Frazzini ,2013,Value-versus-growth,,, -A.2.3,Bmq12,Quarterly Book-to-market Equity (12-month holding period),"Rosenberg, Reid, and Lanstein ",1985,Value-versus-growth,,, -A.2.9,Ep,Earnings-to-price,Basu,1983,Value-versus-growth,,,/Users/zhulingqiao/Downloads/chars_list_20201102.csv -A.2.10,"Ep q 1, Ep q 6, and Ep q 12","Q Quarterly Earnings-to-price(1-month holding period), uarterly Earnings-to-price(6-month holding period), Quarterly Earnings-to-price(12-month holding period), ",Basu,1983,Value-versus-growth,,, -A.2.12,Cp,Cash flow-to-price,"Lakonishok, Shleifer, and Vishny ",1994,Value-versus-growth,,, -A.2.13,"Cpq1, Cpq6, Cpq12","Quarterly Cash Flow-to-price (1-month holding period), 
Quarterly Cash Flow-to-price (6-month holding period), Quarterly Cash Flow-to-price (12-month holding period)","Lakonishok, Shleifer, and Vishny",1994,Value-versus-growth,,, -A.2.14,Dp(dy),Dividend yield,Litzenberger and Ramaswamy,1979,Value-versus-growth,,, -A.2.16,Op and Nop, (Net) Payout Yield,"Richardson, and Roberts",2007,Value-versus-growth,,, -A.2.20,Em,Enterprise multiple,Loughran and Wellman,2011,Value-versus-growth,,, -A.2.21,"Emq1, Emq6","Quarterly Enterprise multiple (1-month holding period), Quarterly Enterprise multiple (6-month holding period)",Loughran and Wellman,2011,Value-versus-growth,,, -A.2.22,Sp,Sales-to-price,"Barbee, Mukherji, and Raines",1996,Value-versus-growth,,, -A.2.23,"Sp q 1, Sp q 6, and Sp q 12",Quarterly Sales-to-price,"Barbee, Mukherji, and Raines",1996,Value-versus-growth,,, -A.2.24,Ocp,Operating Cash Flow-to-price,"Desai, Rajgopal, and Venkatachalam",2004,Value-versus-growth,,, -A.2.26,Ir,Intangible Return,Daniel and Titman,2006,Value-versus-growth,,, -A.2.28,Ebp,Enterprise Book-to-price,"Penman, Richardson, and Tuna",2007,Value-versus-growth,,, -A.3.1,Aci,Abnormal Corporate Investment,"Titman, Wei, and Xie",2004,Investment,,, -A.3.2,I/A, Investment-to-assets,"Cooper, Gulen, and Schill",2008,Investment,,, -A.3.3,"Ia q 6, and Ia q 12",Quarterly Investment-to-assets,"Cooper, Gulen, and Schill",2008,Investment,,, -A.3.4,dPia,Changes in PPE and Inventory-to-assets,"Lyandres, Sun, and Zhang",2008,Investment,,, -A.3.5,Noa and dNoa,(Changes in) Net Operating Assets,"Hirshleifer, Hou, Teoh, and Zhang",2004,Investment,,, -A.3.6,dLno,Changes in Long-term Net Operating Assets,"Fairfield, Whisenant, and Yohn",2003,Investment,,, -A.3.7,Ig,Investment Growth,Xing,2008,Investment,,, -A.3.8,2Ig,2-year Investment Growth,Anderson and Garcia-Feijoo,2006,Investment,,, -A.3.10,Nsi,Net Stock Issues,Pontiff and Woodgate,2008,Investment,,, -A.3.11,dIi,% Change in Investment - % Change in Industry Investment,Abarbanell and Bushee,1998,Investment,,, 
-A.3.14,Ivg,Inventory Growth,Belo and Lin,2011,Investment,,, -A.3.15,Ivc,Inventory Changes,Thomas and Zhang,2002,Investment,,, -A.3.16,Oa(acc),Operating Accruals,Sloan,1996,Investment,,, -A.3.17,Ta,Total Accruals,"Richardson, Sloan, Soliman, and Tuna",2005,Investment,,, -A.3.18,dCoa,changes in Current Operating Assets,"Richardson, Sloan, Soliman, and Tuna",2005,Investment,,, -A.3.19,dNca,changes in Non-current Operating Assets,"Richardson, Sloan, Soliman, and Tuna",2005,Investment,,, -A.3.19,dNco,Changes in Net Non-current Operating Assets,"Richardson, Sloan, Soliman, and Tuna",2005,Investment,,, -A.3.20,dFin,Changes in Net Financial Assets,"Richardson, Sloan, Soliman, and Tuna",2005,Investment,,, -A.3.20,dFnl,changes in Financial Liabilities,"Richardson, Sloan, Soliman, and Tuna",2005,Investment,,, -A.3.20,dBe,changes in Book Equity,"Richardson, Sloan, Soliman, and Tuna",2005,Investment,,, -A.3.22,Poa,Percent operating accruals,"Hafzalla, Lundholm, and Van Winkle",2011,Investment,,, -A.3.23,Pta,Percent total accruals,"Hafzalla, Lundholm, and Van Winkle",2011,Investment,,, -A.3.24,Pda,Percent discretionary accruals,,,Investment,,, -A.3.25,Ndf,Net debt finance,"Bradshaw, Richardson, and Sloan",2006,Investment,,, -A.4.1,"Roe1, Roe6",Return on Equity,"Hou, Xue, and Zhang",2015,Profitability,,, -A.4.2,"dRoe1, dRoe6, and dRoe12",Changes in Return on Equity,"Hou, Xue, and Zhang",2015,Profitability,,, -A.4.3,Roa1,Return on Assets,"Balakrishnan, Bartov, and Faurel",2010,Profitability,,, -A.4.4,"dRoa1, dRoa6",Changes in Return on Assets,"Balakrishnan, Bartov, and Faurel",2010,Profitability,,, -A.4.5,Ato,Asset Turnover,Soliman,2008,Profitability,,, -A.4.6,Cto,Capital Turnover,Haugen and Baker,1996,Profitability,,, -A.4.7,"Rna q 1, Rna q 6, Atoq1","Quarterly Return on Net Operating Assets, Quarterly Asset Turnover",Soliman,2008,Profitability,,, -A.4.8,"Cto q1, Cto q6,",Quarterly Capital Turnover,Haugen and Baker,1996,Profitability,,, -A.4.9,Gpa,Gross 
Profits-to-assets,Novy-Marx,2013,Profitability,,, -A.4.11,"Gla q 1, Gla q 6, and Gla q 12",Quarterly Gross Profits-to-lagged Assets,,,Profitability,,, -A.4.12,Ope,Operating Profits to Equity,Fama and French,2015,Profitability,,, -A.4.14,"Ole q 1, Ole q 6 ",Quarterly Operating Profits-to-lagged Equity,,,Profitability,,, -A.4.15,Opa,Operating Profits-to-assets,"Linnainmaa, and Nikolaev",2015,Profitability,,, -A.4.17,"Ola q 1, Ola q 6, and Ola q 12",Quarterly Operating Profits-to-lagged Assets,,,Profitability,,, -A.4.18,Cop,Cash-based Operating Profitability,"Gerakos, Linnainmaa, and Nikolaev",2016,Profitability,,, -A.4.19,Cla,Cash-based Operating Profits-to-lagged Assets,,,Profitability,,, -A.4.20,Claq,Quarterly Cash-based Operating Profits-to-lagged Assets,,,Profitability,,, -A.4.29,Tbi q 12,Quarterly Taxable Income-to-book Income,"Green, Hand, and Zhang",2013,Profitability,,, -A5.1,Oca and Ioca,(Industry-adjusted) Organizational Capital-to-assets,Eisfeldt and Papanikolaou,2013,Intangibles,,, -A.5.2,Adm,Advertising Expense-to-market,"Chan, Lakonishok, and Sougiannis",2001,Intangibles,,, -A.5.4,Rdm,R&D Expense-to-market,"Chan, Lakonishok, and Sougiannis",2001,Intangibles,,, -A.5.5,"Rdm q 1, Rdm q 6, and Rdm q 12",Quarterly R&D Expense-to-market,"Chan, Lakonishok, and Sougiannis",2001,Intangibles,,, -A.5.6,Rds q 6 and Rds q 12,Quarterly R&D Expense-to-sales,"Chan, Lakonishok, and Sougiannis",2001,Intangibles,,, -A.5.8,Ol,Operating Leverage,Novy-Marx,2011,Intangibles,,, -A.5.9,"Ol q 1, Ol q 6, and Ol q 12",Quarterly Operating Leverage,Novy-Marx,2011,Intangibles,,, -A.5.11,Rca,R&D Capital-to-assets,Li,2011,Intangibles,,, -A.5.24,Etr,Effective Tax Rate,Abarbanell and Bushee,1998,Intangibles,,, -A.5.46,"Alm q 1, Alm q 6, and Alm q 12",Quarterly Asset Liquidity,Ortiz-Molina and Phillips,2014,Intangibles,,, -A.5.50,"R a 1 , R n 1 , R a [2,5] , R n[2,5] , R a[6,10] , R n[6,10] , R a[11,15] , and R a[16,20]",Seasonality,Heston and Sadka,2008,Intangibles,,, -A.6.8,Beta1,Market 
Beta,Fama and MacBeth,1973,Frictions,,, -A.6.13,Dtv12,"dollar trading volume, 12-month holding period","Brennan, Chordia, and Subrahmanyam",1998,Frictions,,, -A.6.21,Isff1,"idiosyncratic skewness estimated from the Fama-French 3-factor model, 1-month holding period",Harvey and Siddique,2000,Frictions,,, -A.6.22,Isq1,"idiosyncratic skewness estimated from the q-factor model, 1-month holding period",Harvey and Siddique,2000,Frictions,,, -A.6.3,Ivff1,"idiosyncratic volatility estimated from the Fama-French 3-factor model, 1-month holding period","Ang, Hodrick, Xing, and Zhang",2006,Frictions,,, -A.6.5,Ivq1,"idiosyncratic volatility estimated from the q-factor model, 1-month holding period","Ang, Hodrick, Xing, and Zhang",2006,Frictions,,, -A.6.1,Me,the market equity,Banz,1981,Frictions,,, -A.6.24,Srev,short-term reversal,Jegadeesh,1990,Frictions,,, -A.6.7,Sv1,"systematic volatility, 1-month holding period","Ang, Hodrick, Xing, and Zhang",2006,Frictions,,, -A.6.6,Tv1,"total volatility, 1-month holding period","Ang, Hodrick, Xing, and Zhang",2006,Frictions,,, -A.1.2,Abr1,"cumulative abnormal returns around earnings announcement dates, 1-month holding period","Chan, Jegadeesh, and Lakonishok",1996,Momentum,,, -A.1.2,Abr6,"cumulative abnormal returns around earnings announcement dates, 6-month holding period","Chan, Jegadeesh, and Lakonishok",1996,Momentum,,, -A.1.2,Abr12,"cumulative abnormal returns around earnings announcement dates, 12-month holding period","Chan, Jegadeesh, and Lakonishok",1996,Momentum,,, -,agr,Asset growth,"Cooper, Gulen & Schill",2008,,,, -,baspread,Bid-ask spread rolling 3m,Amihud & Mendelson,1989,,,, -,beta,Beta rolling 3m,Fama & MacBeth,1973,,,, -,bm_ia,Industry-adjusted book to market,"Asness, Porter & Stevens",2000,,,, -,cash,Cash holdings,Palazzo,2012,,,, -,cashdebt,Cash flow to debt,Ou & Penman,1989,,,, -,chcsho,Change in shares outstanding,Pontiff & Woodgate,2008,,,, -,chpm(chpmia),Industry-adjusted change in profit margin,Soliman,2008,,,, 
-,chtx,Change in tax expense,Thomas & Zhang,2011,,,, -,cinvest,Corporate investment,"Titman, Wei & Xie",2004,,,, -,depr,Depreciation / PP&E,Holthausen & Larcker,1992,,,, -,dolvol,Dollar trading volume,"Chordia, Subrahmanyam & Anshuman",2001,,,, -,gma,Gross profitability,Novy-Marx,2013,,,, -,grltnoa,Growth in long-term net operating assets,"Fairfield, Whisenant & Yohn",2003,,,, -,herf,Industry sales concentration,Hou & Robinson,2006,,,, -,hire,Employee growth rate,"Bazdresch, Belo & Lin",2014,,,, -,ill,Illiquidity rolling 3m,Amihud,2002,,,, -,lev,Leverage,Bhandari,1988,,,, -,lgr,Growth in long-term debt,"Richardson, Sloan, Soliman & Tuna",2005,,,, -,maxret,Maximum daily returns rolling 3m,"Bali, Cakici & Whitelaw",2011,,,, -,me_ia(mve_ia),Industry-adjusted size,"Asness, Porter & Stevens",2000,,,, -,mom12m,Momentum rolling 12m,Jegadeesh,1990,,,, -,mom1m,Momentum ,Jegadeesh & Titman,1993,,,, -,mom36m,Momentum rolling 36m,Jegadeesh & Titman,1993,,,, -,mom60m,Momentum rolling 60m,Jegadeesh & Titman,1993,,,, -,mom6m,Momentum rolling 6m,Jegadeesh & Titman,1993,,,, -,nincr,Number of earnings increases,"Barth, Elliott & Finn",1999,,,, -,op(operprof),Operating profitability,Fama and French,2015,,,, -A.4.5,pm,profit margin,Soliman,2008,Profitability,,, -,pscore(ps),Performance Score,Piotroski,2000,,,, -,rd_sale,R&D to sales,"Guo, Lev & Shi",2006,,,, -,re,Revisions in analysts’ earnings forecasts,"Chan, Jegadeesh, and Lakonishok",1996,,,, -,rsup,Revenue surprise,Kama,2009,,,, -,rvar_capm,Residual variance - CAPM rolling 3m,Daily Stock residual variance of CAPM,,,,, -,rvar_ff3,Residual variance - ff3 rolling 3m,Daily Stock residual variance of Fama French 3 factors,,,,, -,rvar_mean,return variance rolling 3m,Daily Stock return variance,,,,, -,sgr,Sales growth,"Lakonishok, Shleifer & Vishny",1994,,,, -,std_dolvol,Std of dollar trading volume rolling 3m,"Chordia, Subrahmanyam & Anshuman",2001,,,, -,std_turn,Std. 
of Share turnover rolling 3m,"Chordia, Subrahmanyam, &Anshuman",2001,,,, -,sue,Unexpected quarterly earnings,"Rendelman, Jones & Latane",1982,,,, -,turn,Shares turnover,"Datar, Naik & Radcliffe",1998,,,, -,zerotrade,Number of zero-trading days rolling 3m,Liu,2006,,,, \ No newline at end of file +Num,Acronym,Description,Author,Pub Year,Category +A.2.1,Bm,Book-to-market equity,"Rosenberg, Reid, and Lanstein ",1985,Value-versus-growth +A.2.2,Bmj,Book-to-June-end market equity, Asness and Frazzini ,2013,Value-versus-growth +A.2.3,Bmq12,Quarterly Book-to-market Equity (12-month holding period),"Rosenberg, Reid, and Lanstein ",1985,Value-versus-growth +A.2.9,Ep,Earnings-to-price,Basu,1983,Value-versus-growth +A.2.10,"Ep q 1, Ep q 6, and Ep q 12","Q Quarterly Earnings-to-price(1-month holding period), uarterly Earnings-to-price(6-month holding period), Quarterly Earnings-to-price(12-month holding period), ",Basu,1983,Value-versus-growth +A.2.12,Cp,Cash flow-to-price,"Lakonishok, Shleifer, and Vishny ",1994,Value-versus-growth +A.2.13,"Cpq1, Cpq6, Cpq12","Quarterly Cash Flow-to-price (1-month holding period), Quarterly Cash Flow-to-price (6-month holding period), Quarterly Cash Flow-to-price (12-month holding period)","Lakonishok, Shleifer, and Vishny",1994,Value-versus-growth +A.2.14,Dp(dy),Dividend yield,Litzenberger and Ramaswamy,1979,Value-versus-growth +A.2.16,Op and Nop, (Net) Payout Yield,"Richardson, and Roberts",2007,Value-versus-growth +A.2.20,Em,Enterprise multiple,Loughran and Wellman,2011,Value-versus-growth +A.2.21,"Emq1, Emq6","Quarterly Enterprise multiple (1-month holding period), Quarterly Enterprise multiple (6-month holding period)",Loughran and Wellman,2011,Value-versus-growth +A.2.22,Sp,Sales-to-price,"Barbee, Mukherji, and Raines",1996,Value-versus-growth +A.2.23,"Sp q 1, Sp q 6, and Sp q 12",Quarterly Sales-to-price,"Barbee, Mukherji, and Raines",1996,Value-versus-growth +A.2.24,Ocp,Operating Cash Flow-to-price,"Desai, Rajgopal, and 
Venkatachalam",2004,Value-versus-growth +A.2.26,Ir,Intangible Return,Daniel and Titman,2006,Value-versus-growth +A.2.28,Ebp,Enterprise Book-to-price,"Penman, Richardson, and Tuna",2007,Value-versus-growth +A.3.1,Aci,Abnormal Corporate Investment,"Titman, Wei, and Xie",2004,Investment +A.3.2,I/A, Investment-to-assets,"Cooper, Gulen, and Schill",2008,Investment +A.3.3,"Ia q 6, and Ia q 12",Quarterly Investment-to-assets,"Cooper, Gulen, and Schill",2008,Investment +A.3.4,dPia,Changes in PPE and Inventory-to-assets,"Lyandres, Sun, and Zhang",2008,Investment +A.3.5,Noa and dNoa,(Changes in) Net Operating Assets,"Hirshleifer, Hou, Teoh, and Zhang",2004,Investment +A.3.6,dLno,Changes in Long-term Net Operating Assets,"Fairfield, Whisenant, and Yohn",2003,Investment +A.3.7,Ig,Investment Growth,Xing,2008,Investment +A.3.8,2Ig,2-year Investment Growth,Anderson and Garcia-Feijoo,2006,Investment +A.3.10,Nsi,Net Stock Issues,Pontiff and Woodgate,2008,Investment +A.3.11,dIi,% Change in Investment - % Change in Industry Investment,Abarbanell and Bushee,1998,Investment +A.3.14,Ivg,Inventory Growth,Belo and Lin,2011,Investment +A.3.15,Ivc,Inventory Changes,Thomas and Zhang,2002,Investment +A.3.16,Oa(acc),Operating Accruals,Sloan,1996,Investment +A.3.17,Ta,Total Accruals,"Richardson, Sloan, Soliman, and Tuna",2005,Investment +A.3.18,dCoa,changes in Current Operating Assets,"Richardson, Sloan, Soliman, and Tuna",2005,Investment +A.3.19,dNca,changes in Non-current Operating Assets,"Richardson, Sloan, Soliman, and Tuna",2005,Investment +A.3.19,dNco,Changes in Net Non-current Operating Assets,"Richardson, Sloan, Soliman, and Tuna",2005,Investment +A.3.20,dFin,Changes in Net Financial Assets,"Richardson, Sloan, Soliman, and Tuna",2005,Investment +A.3.20,dFnl,changes in Financial Liabilities,"Richardson, Sloan, Soliman, and Tuna",2005,Investment +A.3.20,dBe,changes in Book Equity,"Richardson, Sloan, Soliman, and Tuna",2005,Investment +A.3.22,Poa,Percent operating accruals,"Hafzalla, 
Lundholm, and Van Winkle",2011,Investment +A.3.23,Pta,Percent total accruals,"Hafzalla, Lundholm, and Van Winkle",2011,Investment +A.3.24,Pda,Percent discretionary accruals,,,Investment +A.3.25,Ndf,Net debt finance,"Bradshaw, Richardson, and Sloan",2006,Investment +A.4.1,"Roe1, Roe6",Return on Equity,"Hou, Xue, and Zhang",2015,Profitability +A.4.2,"dRoe1, dRoe6, and dRoe12",Changes in Return on Equity,"Hou, Xue, and Zhang",2015,Profitability +A.4.3,Roa1,Return on Assets,"Balakrishnan, Bartov, and Faurel",2010,Profitability +A.4.4,"dRoa1, dRoa6",Changes in Return on Assets,"Balakrishnan, Bartov, and Faurel",2010,Profitability +A.4.5,Ato,Asset Turnover,Soliman,2008,Profitability +A.4.6,Cto,Capital Turnover,Haugen and Baker,1996,Profitability +A.4.7,"Rna q 1, Rna q 6, Atoq1","Quarterly Return on Net Operating Assets, Quarterly Asset Turnover",Soliman,2008,Profitability +A.4.8,"Cto q1, Cto q6,",Quarterly Capital Turnover,Haugen and Baker,1996,Profitability +A.4.9,Gpa,Gross Profits-to-assets,Novy-Marx,2013,Profitability +A.4.11,"Gla q 1, Gla q 6, and Gla q 12",Quarterly Gross Profits-to-lagged Assets,,,Profitability +A.4.12,Ope,Operating Profits to Equity,Fama and French,2015,Profitability +A.4.14,"Ole q 1, Ole q 6 ",Quarterly Operating Profits-to-lagged Equity,,,Profitability +A.4.15,Opa,Operating Profits-to-assets,"Linnainmaa, and Nikolaev",2015,Profitability +A.4.17,"Ola q 1, Ola q 6, and Ola q 12",Quarterly Operating Profits-to-lagged Assets,,,Profitability +A.4.18,Cop,Cash-based Operating Profitability,"Gerakos, Linnainmaa, and Nikolaev",2016,Profitability +A.4.19,Cla,Cash-based Operating Profits-to-lagged Assets,,,Profitability +A.4.20,Claq,Quarterly Cash-based Operating Profits-to-lagged Assets,,,Profitability +A.4.29,Tbi q 12,Quarterly Taxable Income-to-book Income,"Green, Hand, and Zhang",2013,Profitability +A.5.1,Oca and Ioca,(Industry-adjusted) Organizational Capital-to-assets,Eisfeldt and Papanikolaou,2013,Intangibles +A.5.2,Adm,Advertising 
Expense-to-market,"Chan, Lakonishok, and Sougiannis",2001,Intangibles +A.5.4,Rdm,R&D Expense-to-market,"Chan, Lakonishok, and Sougiannis",2001,Intangibles +A.5.5,"Rdm q 1, Rdm q 6, and Rdm q 12",Quarterly R&D Expense-to-market,"Chan, Lakonishok, and Sougiannis",2001,Intangibles +A.5.6,Rds q 6 and Rds q 12,Quarterly R&D Expense-to-sales,"Chan, Lakonishok, and Sougiannis",2001,Intangibles +A.5.8,Ol,Operating Leverage,Novy-Marx,2011,Intangibles +A.5.9,"Ol q 1, Ol q 6, and Ol q 12",Quarterly Operating Leverage,Novy-Marx,2011,Intangibles +A.5.11,Rca,R&D Capital-to-assets,Li,2011,Intangibles +A.5.24,Etr,Effective Tax Rate,Abarbanell and Bushee,1998,Intangibles +A.5.46,"Alm q 1, Alm q 6, and Alm q 12",Quarterly Asset Liquidity,Ortiz-Molina and Phillips,2014,Intangibles +A.5.50,"R a 1 , R n 1 , R a [2,5] , R n[2,5] , R a[6,10] , R n[6,10] , R a[11,15] , and R a[16,20]",Seasonality,Heston and Sadka,2008,Intangibles +A.6.8,Beta1,Market Beta,Fama and MacBeth,1973,Frictions +A.6.13,Dtv12,"dollar trading volume, 12-month holding period","Brennan, Chordia, and Subrahmanyam",1998,Frictions +A.6.21,Isff1,"idiosyncratic skewness estimated from the Fama-French 3-factor model, 1-month holding period",Harvey and Siddique,2000,Frictions +A.6.22,Isq1,"idiosyncratic skewness estimated from the q-factor model, 1-month holding period",Harvey and Siddique,2000,Frictions +A.6.3,Ivff1,"idiosyncratic volatility estimated from the Fama-French 3-factor model, 1-month holding period","Ang, Hodrick, Xing, and Zhang",2006,Frictions +A.6.5,Ivq1,"idiosyncratic volatility estimated from the q-factor model, 1-month holding period","Ang, Hodrick, Xing, and Zhang",2006,Frictions +A.6.1,Me,the market equity,Banz,1981,Frictions +A.6.24,Srev,short-term reversal,Jegadeesh,1990,Frictions +A.6.7,Sv1,"systematic volatility, 1-month holding period","Ang, Hodrick, Xing, and Zhang",2006,Frictions +A.6.6,Tv1,"total volatility, 1-month holding period","Ang, Hodrick, Xing, and Zhang",2006,Frictions 
+A.1.2,Abr1,"cumulative abnormal returns around earnings announcement dates, 1-month holding period","Chan, Jegadeesh, and Lakonishok",1996,Momentum +A.1.2,Abr6,"cumulative abnormal returns around earnings announcement dates, 6-month holding period","Chan, Jegadeesh, and Lakonishok",1996,Momentum +A.1.2,Abr12,"cumulative abnormal returns around earnings announcement dates, 12-month holding period","Chan, Jegadeesh, and Lakonishok",1996,Momentum +,agr,Asset growth,"Cooper, Gulen & Schill",2008, +,baspread,Bid-ask spread rolling 3m,Amihud & Mendelson,1989, +,beta,Beta rolling 3m,Fama & MacBeth,1973, +,bm_ia,Industry-adjusted book to market,"Asness, Porter & Stevens",2000, +,cash,Cash holdings,Palazzo,2012, +,cashdebt,Cash flow to debt,Ou & Penman,1989, +,chcsho,Change in shares outstanding,Pontiff & Woodgate,2008, +,chpm(chpmia),Industry-adjusted change in profit margin,Soliman,2008, +,chtx,Change in tax expense,Thomas & Zhang,2011, +,cinvest,Corporate investment,"Titman, Wei & Xie",2004, +,depr,Depreciation / PP&E,Holthausen & Larcker,1992, +,dolvol,Dollar trading volume,"Chordia, Subrahmanyam & Anshuman",2001, +,gma,Gross profitability,Novy-Marx,2013, +,grltnoa,Growth in long-term net operating assets,"Fairfield, Whisenant & Yohn",2003, +,herf,Industry sales concentration,Hou & Robinson,2006, +,hire,Employee growth rate,"Bazdresch, Belo & Lin",2014, +,ill,Illiquidity rolling 3m,Amihud,2002, +,lev,Leverage,Bhandari,1988, +,lgr,Growth in long-term debt,"Richardson, Sloan, Soliman & Tuna",2005, +,maxret,Maximum daily returns rolling 3m,"Bali, Cakici & Whitelaw",2011, +,me_ia(mve_ia),Industry-adjusted size,"Asness, Porter & Stevens",2000, +,mom12m,Momentum rolling 12m,Jegadeesh,1990, +,mom1m,Momentum ,Jegadeesh & Titman,1993, +,mom36m,Momentum rolling 36m,Jegadeesh & Titman,1993, +,mom60m,Momentum rolling 60m,Jegadeesh & Titman,1993, +,mom6m,Momentum rolling 6m,Jegadeesh & Titman,1993, +,nincr,Number of earnings increases,"Barth, Elliott & Finn",1999, 
+,op(operprof),Operating profitability,Fama and French,2015, +A.4.5,pm,profit margin,Soliman,2008,Profitability +,pscore(ps),Performance Score,Piotroski,2000, +,rd_sale,R&D to sales,"Guo, Lev & Shi",2006, +,re,Revisions in analysts’ earnings forecasts,"Chan, Jegadeesh, and Lakonishok",1996, +,rsup,Revenue surprise,Kama,2009, +,rvar_capm,Residual variance - CAPM rolling 3m,Daily Stock residual variance of CAPM,, +,rvar_ff3,Residual variance - ff3 rolling 3m,Daily Stock residual variance of Fama French 3 factors,, +,rvar_mean,return variance rolling 3m,Daily Stock return variance,, +,sgr,Sales growth,"Lakonishok, Shleifer & Vishny",1994, +,std_dolvol,Std of dollar trading volume rolling 3m,"Chordia, Subrahmanyam & Anshuman",2001, +,std_turn,Std. of Share turnover rolling 3m,"Chordia, Subrahmanyam, &Anshuman",2001, +,sue,Unexpected quarterly earnings,"Rendelman, Jones & Latane",1982, +,turn,Shares turnover,"Datar, Naik & Radcliffe",1998, +,zerotrade,Number of zero-trading days rolling 3m,Liu,2006, \ No newline at end of file From dc439e78ad9e83eb79fca5dfa8e8930bf438c6db Mon Sep 17 00:00:00 2001 From: velonisa Date: Wed, 11 Nov 2020 16:40:05 +0800 Subject: [PATCH 07/15] Update accounting_60.py --- char60/accounting_60.py | 104 ++++++++++++++++++++++++++-------------- 1 file changed, 69 insertions(+), 35 deletions(-) diff --git a/char60/accounting_60.py b/char60/accounting_60.py index 99ff3b4..23c9df5 100644 --- a/char60/accounting_60.py +++ b/char60/accounting_60.py @@ -40,13 +40,13 @@ def ttm12(series, df): """ lag = pd.DataFrame() for i in range(1, 12): - lag['%(series)s%(lag)s' % {'series': series, 'lag': i}] = df.groupby('gvkey')['%s' % series].shift(i) + lag['%(series)s%(lag)s' % {'series': series, 'lag': i}] = df.groupby('permno')['%s' % series].shift(i) result = df['%s' % series] + lag['%s1' % series] + lag['%s2' % series] + lag['%s3' % series] +\ lag['%s4' % series] + lag['%s5' % series] + lag['%s6' % series] + lag['%s7' % series] +\ lag['%s8' % series] + 
lag['%s9' % series] + lag['%s10' % series] + lag['%s11' % series] return result - +print('TTM') ####################################################################################################################### # Compustat Block # ####################################################################################################################### @@ -67,7 +67,7 @@ def ttm12(series, df): /*liabilities*/ f.lct, f.dlc, f.dltt, f.lt, f.dm, f.dcvt, f.cshrc, - f.dcpstk, f.pstk, f.ap, f.lco, f.lo, f.drc, f.drlt, f.txdi, f.dltis, f.dltr. f.dlcch, + f.dcpstk, f.pstk, f.ap, f.lco, f.lo, f.drc, f.drlt, f.txdi, f.dltis, f.dltr, f.dlcch, /*equity and other*/ f.ceq, f.scstkc, f.emp, f.csho, f.seq, f.txditc, f.pstkrv, f.pstkl, f.np, f.txdc, @@ -122,7 +122,7 @@ def ttm12(series, df): comp['ceq'] = np.where(comp['ceq'] == 0, np.nan, comp['ceq']) comp['at'] = np.where(comp['at'] == 0, np.nan, comp['at']) comp = comp.dropna(subset=['at']) - +print('compustat') ####################################################################################################################### # CRSP Block # ####################################################################################################################### @@ -177,7 +177,7 @@ def ttm12(series, df): crsp2 = pd.merge(crsp1, crsp_summe, how='inner', on=['monthend', 'permco']) # sort by permno and date and also drop duplicates crsp2 = crsp2.sort_values(by=['permno', 'monthend']).drop_duplicates() - +print('crsp') ####################################################################################################################### # CCM Block # ####################################################################################################################### @@ -237,7 +237,7 @@ def ttm12(series, df): data_rawa = data_rawa[data_rawa['temp'].notna()] data_rawa = data_rawa.sort_values(by=['permno', 'jdate']) - +print('ccm') 
####################################################################################################################### # Annual Variables # ####################################################################################################################### @@ -774,7 +774,7 @@ def ttm12(series, df): #ir ''' -First calculate r(t-5,t). Then rb(t-5,t) and use Bm to perform linear regression and get residue +#First calculate r(t-5,t). Then rb(t-5,t) and use Bm to perform linear regression and get residue ''' #r(t-5,t):sum ret from t-5 to t (which is calendar year t-6 to t-1) lag = pd.DataFrame() @@ -784,30 +784,30 @@ def ttm12(series, df): data_rawa['ret5'] = lag['ret1']+lag['ret2']+lag['ret3']+lag['ret4']+lag['ret5'] #bm_t-5 (bm of year t-5) -data_rawa['bm5'] = data_rawa.groupby(['permno'])['bm'].shift(5) +#data_rawa['bm5'] = data_rawa.groupby(['permno'])['bm'].shift(5) #rB (five year log book return) #Reference: jf_06 page8 by KENT DANIEL -data_rawa['rB'] = data_rawa['bm'] - data_rawa['bm5'] + data_rawa['ret5'] +#data_rawa['rB'] = data_rawa['bm'] - data_rawa['bm5'] + data_rawa['ret5'] #Regression and get ir #First get unique datelist -datelist = data_rawa['jdate'].unique() -for date in datelist: - temp = data_rawa['jdate' == date] - n_row = temp.shape[0] - index = temp.index - X = pd.DataFrame() - X['bm5'] = temp['bm5'] - X['rB'] = temp['rB'] - X['intercept'] = 1 - X = X[['intercept','rB','bm5']] - X = np.mat(X) - Y = np.mat(temp[['ret5']]) +#datelist = data_rawa['jdate'].unique() +#for date in datelist: +# temp = data_rawa['jdate' == date] +# n_row = temp.shape[0] +# index = temp.index +# X = pd.DataFrame() +# X['bm5'] = temp['bm5'] +# X['rB'] = temp['rB'] +# X['intercept'] = 1 +# X = X[['intercept','rB','bm5']] +# X = np.mat(X) +# Y = np.mat(temp[['ret5']]) #These are residuals on one date - res = (np.identity(n_row) - X.dot(X.T.dot(X).I).dot(X.T)).dot(Y) - #put residuals back into data_rawa - data_rawa.loc[index,'ir'] = res +# res = (np.identity(n_row) - 
X.dot(X.T.dot(X).I).dot(X.T)).dot(Y) +# #put residuals back into data_rawa +# data_rawa.loc[index,'ir'] = res #nop #data_rawa['net_p'] = data_rawa['dvc'] + data_rawa['prstkc'] + 2*data_rawa['pstkrv'] - data_rawa['sstk'] @@ -867,7 +867,7 @@ def ttm12(series, df): data_rawa['deps'] = data_rawa['epspx']/(data_rawa['ajex'] * data_rawa['prcc_f']) data_rawa['etr'] = (data_rawa['txtpi'] - (data_rawa['txtpi_l1'] + data_rawa['txtpi_l2'] + data_rawa['txtpi_l3'])/3) * data_rawa['deps'] - +print('annual') ####################################################################################################################### # Compustat Quarterly Raw Info # ####################################################################################################################### @@ -952,7 +952,7 @@ def ttm12(series, df): data_rawq = data_rawq[data_rawq['temp'].notna()] data_rawq = data_rawq.sort_values(by=['permno', 'jdate']) - +print('quarterly raw') ####################################################################################################################### # Quarterly Variables # ####################################################################################################################### @@ -1291,10 +1291,12 @@ def chars_std(start, end, df, chars): data_rawq['almq'] = data_rawq['qal']/data_rawq['mveqa_1'] #Olq, needs atq -data_rawa['olq'] = (data_rawa['cogsq'] + data_rawa['xsgaq'])/data_rawa['atq'] +data_rawq['olq'] = (data_rawq['cogsq'] + data_rawq['xsgaq'])/data_rawq['atq'] # rds data_rawq['rds'] = data_rawq['xrdq4']/data_rawq['saleq'] + +print('quarterly variables') ####################################################################################################################### # Momentum # ####################################################################################################################### @@ -1326,12 +1328,12 @@ def chars_std(start, end, df, chars): crsp_mom['retadj'] = (1 + crsp_mom['ret']) * (1 + crsp_mom['dlret']) - 1 
crsp_mom['me'] = crsp_mom['prc'].abs() * crsp_mom['shrout'] # calculate market equity crsp_mom['retx'] = np.where(crsp_mom['me'].isnull(), 0, crsp_mom['retx']) -crsp_mom = crsp_mom.drop(['dlret', 'dlstdt', 'prc', 'shrout'], axis=1) +crsp_mom = crsp_mom.drop(['dlret', 'dlstdt'], axis=1)#delete prc,shrout #Seasonality #Rla -crsp_mom['rla'] = crsp_mom.groupby['permno']['ret'].shift(12) +crsp_mom['rla'] = crsp_mom.groupby(['permno'])['ret'].shift(12) #Rln lag = pd.DataFrame() @@ -1439,7 +1441,7 @@ def mom(start, end, df): # crsp_mom['moms12m'] = moms(1, 12, crsp_mom) # populate the chars to monthly - +print('momentum') # data_rawa data_rawa = data_rawa.drop(['date', 'ret', 'retx', 'me'], axis=1) data_rawa = pd.merge(crsp_mom, data_rawa, how='left', on=['permno', 'jdate']) @@ -1447,7 +1449,7 @@ def mom(start, end, df): data_rawa = data_rawa.groupby(['permno', 'datadate'], as_index=False).fillna(method='ffill') data_rawa = data_rawa[((data_rawa['exchcd'] == 1) | (data_rawa['exchcd'] == 2) | (data_rawa['exchcd'] == 3)) & ((data_rawa['shrcd'] == 10) | (data_rawa['shrcd'] == 11))] - +print('data_rawa') # data_rawq data_rawq = data_rawq.drop(['date', 'ret', 'retx', 'me'], axis=1) data_rawq = pd.merge(crsp_mom, data_rawq, how='left', on=['permno', 'jdate']) @@ -1455,7 +1457,7 @@ def mom(start, end, df): data_rawq = data_rawq.groupby(['permno', 'datadate'], as_index=False).fillna(method='ffill') data_rawq = data_rawq[((data_rawq['exchcd'] == 1) | (data_rawq['exchcd'] == 2) | (data_rawq['exchcd'] == 3)) & ((data_rawq['shrcd'] == 10) | (data_rawq['shrcd'] == 11))] - +print('data_rawq') ####################################################################################################################### # Monthly ME # ####################################################################################################################### @@ -1544,6 +1546,32 @@ def mom(start, end, df): data_rawa['ocp'] = data_rawa['ocy'] / data_rawa['me'] data_rawa['ocp'] = 
np.where(data_rawa['ocp']<=0, np.nan, data_rawa['ocp'] ) +#bm_t-5 (bm of year t-5) +data_rawa['bm5'] = data_rawa.groupby(['permno'])['bm'].shift(5) + +#rB (five year log book return) +#Reference: jf_06 page8 by KENT DANIEL +data_rawa['rB'] = data_rawa['bm'] - data_rawa['bm5'] + data_rawa['ret5'] + +#Regression and get ir +#First get unique datelist +datelist = data_rawa['jdate'].unique() +for date in datelist: + temp = data_rawa[data_rawa['jdate'] == date] + n_row = temp.shape[0] + index = temp.index + X = pd.DataFrame() + X['bm5'] = temp['bm5'] + X['rB'] = temp['rB'] + X['intercept'] = 1 + X = X[['intercept','rB','bm5']] + X = np.mat(X) + Y = np.mat(temp[['ret5']]) + #These are residuals on one date + res = (np.identity(n_row) - X.dot(X.T.dot(X).I).dot(X.T)).dot(Y) + #put residuals back into data_rawa + data_rawa.loc[index,'ir'] = res + # Annual Accounting Variables chars_a = data_rawa[['cusip', 'ncusip', 'gvkey', 'permno', 'exchcd', 'shrcd', 'datadate', 'jdate', 'sic', 'retadj', 'acc', 'agr', 'bm', 'cfp', 'ep', 'ni', 'op', 'rsup', 'cash', 'chcsho', @@ -1557,8 +1585,10 @@ def mom(start, end, df): 'me_ia', 'bmj','cp', 'ebp', 'em', 'dp', 'aci', 'dpia', 'dBe', 'dfnl', 'dfin', 'dcoa', 'dlno', 'dnoa', 'cla', 'cop', 'cto', 'dIi', 'dnco', 'dnca', 'ir', 'nop', 'ocp', 'ia', 'ig','2ig','ivc','ndf','nsi','oa','poa','ta','ol','etr']] -chars_a.reset_index(drop=True, inplace=True) +chars_a.reset_index(drop=True, inplace=True) +print(chars_a) +print('ME annual') ######################################## # Quarterly # ######################################## @@ -1600,10 +1630,14 @@ def mom(start, end, df): 'grltnoa', 'ala', 'alm', 'rsup', 'stdacc', 'sgrvol', 'roavol', 'scf', 'cinvest', 'mom1m', 'mom6m', 'mom12m', 'mom60m', 'mom36m', 'seas1a', 'me', 'pscore', 'nincr', 'turn', 'dolvol', 'iaq', 'almq', 'olq', 'rds']] -chars_q.reset_index(drop=True, inplace=True) +chars_q.reset_index(drop=True, inplace=True) +print(chars_q) +print('ME quarterly') with open('chars_a_60.pkl', 'wb') as 
f: pkl.dump(chars_a, f) - +print('pkl a') with open('chars_q_60.pkl', 'wb') as f: pkl.dump(chars_q, f) +print('pkl q') +print('Finished') \ No newline at end of file From daffa4dca9f4e6826c973e548afdef4128096784 Mon Sep 17 00:00:00 2001 From: velonisa Date: Mon, 7 Dec 2020 13:42:56 +0800 Subject: [PATCH 08/15] Update accounting_60.py --- char60/accounting_60.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/char60/accounting_60.py b/char60/accounting_60.py index 23c9df5..86dcd07 100644 --- a/char60/accounting_60.py +++ b/char60/accounting_60.py @@ -1309,7 +1309,7 @@ def chars_std(start, end, df, chars): crsp_mom['permno'] = crsp_mom['permno'].astype(int) crsp_mom['date'] = pd.to_datetime(crsp_mom['date']) crsp_mom['jdate'] = pd.to_datetime(crsp_mom['date']) + MonthEnd(0) -crsp_mom = crsp_mom.dropna() +crsp_mom = crsp_mom.dropna(subset=['ret', 'retx', 'prc']) # add delisting return dlret = conn.raw_sql(""" From da3bbde18266f139059ebef945ea57247bcc0fff Mon Sep 17 00:00:00 2001 From: velonisa Date: Mon, 1 Mar 2021 19:43:02 +0800 Subject: [PATCH 09/15] Update description --- .DS_Store | Bin 10244 -> 12292 bytes Chars_Description_0301.csv | 507 +++++++++++++++++++++++++++++++++++++ Chars_Description_1011.csv | 125 --------- char60/.DS_Store | Bin 0 -> 6148 bytes 4 files changed, 507 insertions(+), 125 deletions(-) create mode 100644 Chars_Description_0301.csv delete mode 100755 Chars_Description_1011.csv create mode 100644 char60/.DS_Store diff --git a/.DS_Store b/.DS_Store index 38d98e108eb2b425250d79235c22b216a8a978af..7329c78c8bf4823b0c71684345a5a253510ed259 100644 GIT binary patch literal 12292 zcmeHNYit}>6+XvtoVj^46XV1ir;k+~6Q^tvuN}XVrmXF?6Vs$E@heSBo!y@9cVZ zb`~m9rKZeCckawR=f2MQX3lr#4gla_QHuk#1Asz_=jvuUJq8fnM7ihSa-OP=byR|s zdMMGg3a}3}m<0zGM$Ibi>Vi5d;`Qq(ZGb#1dh1)6v}vVDLe0Xnl-gIs0kE<{ci|CQ zAhbYefzSe>1wsq_cNXBWmkz}j?`3It4lNK`;Q!hJ{QVH6#PfukFUwrKI_ShHJeP9{ zFAloqgq@c~-8|vu%Q7bk9Vn|(E~}!u#6Ve{%A>Ac!p)auE~^8Io~a$TXLMI6D0-*z 
zs89z?$UHoU76>g+ZGpX1Bc~TcWkb_R<-sWdj1Yx|MM3x)c`jJ zXnCFa*oEDXz9yX+sJZIWf3F$f*nswY?k&9^@}NTsG@onV@VVBfy~f=7mcLeAqXf9r zX9%x5leaef`i&n}yjITS^;kK^%m&D49t-O3t(5eur3wo!m{=nb74(?!x&?t){f>#}O}UDD?_g~FERi17cDvet zfJG+ER9ZJxjmdtx4zkG9yquDHw_DiP$RekRuw-c}wp(`Rkx1kr-tF;}?cAH!b2OwM zyL}JPknZQQ0ZrorQd$#ymxqqlM2Eak`KoIt#NaL%fzuSTa(8MCn;{q<@$M7zm6}+an+S*+%b7yI^(zL zNO&(S_P=QI1=EI=n$wW zp+dm*Dxq{|%Pw|H3B-PIPY}-Z^9qNxxf zl2@o{fSWaS3+0=ssdpxxC?K=cN5C?;vE>nZtTH6LK?@}Fpl6cd_SJVahxEu zoguVkP{SFtFpmYa33U(Shwu@66hDlY@NpM?Py6t8w1hO7;}@&(=8ayT{MohRzm;Uf zhe~hss;yNsdX>@bi(d6`4s0kwx-xFTVH)SBO8cX9UDjJ;5$i1@9pwr748-6h48s5< z;RNOTA?s&pVk_Besx4$towlrtMBVq%>#smt*f!Oq?dVD)i&dAmro_nq_?hqdZ-Qxh z_gnP#pP@C&EKPoq3};qOUtr9;=&VsI@1&0xZKpu(4#OC=WRr1qYVQC{QXeK6I!pI9 z-6x5IIg}2-gm41!F}ltZH`9dFqESih9O-e{Q5v5E$M_RKcf<5F6h+A*47rRhMRj<8 z1uEw~^OYE{S#y>&-h15f)|e61ZJ<^pW9)&BO1mIm(RpD}p}WSOzwXYSSTx?>d)%!cvdD`o$s#YQB#XSnM;3We6=AtS!Ab0X_*pC4Ad>H@TkDhDddr=tw LX-#?K@~ diff --git a/Chars_Description_0301.csv b/Chars_Description_0301.csv new file mode 100644 index 0000000..835b03f --- /dev/null +++ b/Chars_Description_0301.csv @@ -0,0 +1,507 @@ +Num,Acronym,Description,Author,Pub Year,Category,Main Formula,Other Formula,CRSP,Compustat(annual),Compustat(quarterly),IBES,description +A.1.2,Abr1,"cumulative abnormal returns around earnings announcement dates, 1-month holding period","Chan, Jegadeesh, and Lakonishok",1996,Momentum,,,,,,,p63 +A.1.2,Abr12,"cumulative abnormal returns around earnings announcement dates, 6-month holding period","Chan, Jegadeesh, and Lakonishok",1996,Momentum,,,,,,, +A.1.2,Abr6,"cumulative abnormal returns around earnings announcement dates, 12-month holding period","Chan, Jegadeesh, and Lakonishok",1996,Momentum,,,,,,, +A.2.1,Bm,Book-to-market equity,"Rosenberg, Reid, and Lanstein ",1985,Value-versus-growth,data_rawq['bm'] = data_rawq['beq']/data_rawq['me'],"data_rawq['beq'] = np.where(data_rawq['seqq']>0, data_rawq['seqq']+data_rawq['txditcq']-data_rawq['pstkq'], np.nan) +data_rawq['beq'] = 
np.where(data_rawq['beq']<=0, np.nan, data_rawq['beq']) +* 'me' from rawq",1,0,1,0,"At the end of June of each year t, we split stocks into deciles based on Bm, which is the book equity for the fiscal year ending in calendar year t - 1 divided by the market equity (from CRSP) at the end of December of t - 1. For firms with more than one share class, we merge the market equity for all share classes before computing Bm. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1. Following Davis, Fama, and French (2000), we measure book equity as stockholders' book equity, plus balance sheet deferred taxes and investment tax credit (Compustat annual item TXDITC) if available, minus the book value of preferred stock. Stockholders' equity is the value reported by Compustat (item SEQ), if it is available. If not, we measure stockholders' equity as the book value of common equity (item CEQ) plus the par value of preferred stock (item PSTK), or the book value of assets (item AT) minus total liabilities (item LT). Depending on availability, we use redemption (item PSTKRV), liquidating (item PSTKL), or par value (item PSTK) for the book value of preferred stock." +A.2.10,"Ep q 1, Ep q 6, and Ep q 12","Quarterly Earnings-to-price(1-month holding period), Quarterly Earnings-to-price(6-month holding period), Quarterly Earnings-to-price(12-month holding period), ",Basu,1983,Value-versus-growth,,,,,,,"At the beginning of each month t, we split stocks into deciles based on quarterly earnings-to-price, Epq, which is income before extraordinary items (Compustat quarterly item IBQ) divided by the market equity (from CRSP) at the end of month t - 1. Before 1972, we use quarterly earnings from fiscal quarters ending at least four months prior to the portfolio formation. Starting from 1972, we use quarterly earnings from the most recent quarterly earnings announcement dates (item RDQ).
For a firm to enter the portfolio formation, we require the end of the fiscal quarter that corresponds to its most recent quarterly earnings to be within six months prior to the portfolio formation. This restriction is imposed to exclude stale earnings information. To avoid potentially erroneous records, we also require the earnings announcement date to be after the corresponding fiscal quarter end. Firms with non-positive earnings are excluded. For firms with more than one share class, we merge the market equity for all share classes before computing Epq. We calculate decile returns for the current month t (Epq1), from month t to t + 5 (Epq6), and from month t to t + 11 (Epq12), and the deciles are rebalanced at the beginning of month t + 1. The holding period longer than one month as in, for instance, Epq6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Epq6 decile." +A.2.12,Cp,Cash flow-to-price,"Lakonishok, Shleifer, and Vishny ",1994,Value-versus-growth,data_rawa['cp'] = data_rawa['cf'] / data_rawa['me'],"data_rawa['cf'] = data_rawa['ib'] + data_rawa['dp'] +* 'me' from rawa +",1,1,0,0,"At the end of June of each year t, we split stocks into deciles based on cash flow-to-price, Cf, which is cash flows for the fiscal year ending in calendar year t - 1 divided by the market equity (from CRSP) at the end of December of t - 1. Cash flows are income before extraordinary items (Com- pustat annual item IB) plus depreciation (item DP)). For firms with more than one share class, we merge the market equity for all share classes before computing Cp. Firms with non-positive cash flows are excluded. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." 
+A.2.13,"Cpq1, Cpq6, Cpq12","Quarterly Cash Flow-to-price (1-month holding period), Quarterly Cash Flow-to-price (6-month holding period), Quarterly Cash Flow-to-price (12-month holding period)","Lakonishok, Shleifer, and Vishny",1994,Value-versus-growth,,,,,,,"At the beginning of each month t, we split stocks into deciles based on quarterly cash flow-to-price, +Cpq, which is cash flows for the latest fiscal quarter ending at least four months ago divided by the market equity (from CRSP) at the end of month t - 1. Quarterly cash flows are income before extraordinary items (Compustat quarterly item IBQ) plus depreciation (item DPQ). For firms with more than one share class, we merge the market equity for all share classes before computing Cpq. Firms with non-positive cash flows are excluded. We calculate decile returns for the current month t (Epq1), from month t to t + 5 (Epq6), and from month t to t + 11 (Epq12), and the deciles are rebalanced at the beginning of month t + 1. The holding period longer than one month as in, for instance, Epq6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Epq6 decile." 
+A.2.14,Dp(dy),Dividend yield,Litzenberger and Ramaswamy,1979,Value-versus-growth,"crsp_mom['dy'] = ttm12(series='mdivpay', df=crsp_mom)/crsp_mom['me']","crsp_mom['permno'] = crsp_mom['permno'].astype(int) +crsp_mom['ret'] = crsp_mom['ret'].fillna(0) +crsp_mom['me'] = crsp_mom['prc'].abs() * crsp_mom['shrout'] # calculate market equity +crsp_mom['retx'] = np.where(crsp_mom['me'].isnull(), 0, crsp_mom['retx']) +# dy +crsp_mom['me_l1'] = crsp_mom.groupby(['permno'])['me'].shift(1) +crsp_mom['retdy'] = crsp_mom['ret'] - crsp_mom['retx'] +crsp_mom['mdivpay'] = crsp_mom['retdy']*crsp_mom['me_l1'] +",1,0,0,0,"At the end of June of each year t, we sort stocks into deciles based on dividend yield, Dp, which is the total dividends paid out from July of year t - 1 to June of t divided by the market equity (from CRSP) at the end of June of t. We calculate monthly dividends as the begin-of-month market equity times the difference between returns with and without dividends. Monthly dividends are then accumulated from July of t - 1 to June of t. We exclude firms that do not pay dividends. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." 
+A.2.16,Op and Nop, (Net) Payout Yield,"Richardson, and Roberts",2007,Value-versus-growth,"data_rawq['op'] = (ttm4('revtq', data_rawq)-ttm4('cogsq', data_rawq)-ttm4('xsgaq0', data_rawq)-ttm4('xintq0', data_rawq))/data_rawq['beq_l4'] +data_rawa['nop'] = np.where(data_rawa['nop']<=0, np.nan, data_rawa['nop'] )","# op +data_rawq['xintq0'] = np.where(data_rawq['xintq'].isnull(), 0, data_rawq['xintq']) +data_rawq['xsgaq0'] = np.where(data_rawq['xsgaq'].isnull(), 0, data_rawq['xsgaq']) +data_rawq['beq'] = np.where(data_rawq['seqq']>0, data_rawq['seqq']+data_rawq['txditcq']-data_rawq['pstkq'], np.nan) +data_rawq['beq'] = np.where(data_rawq['beq']<=0, np.nan, data_rawq['beq']) +data_rawq['beq_l4'] = data_rawq.groupby(['permno'])['beq'].shift(4) + +#nop +data_rawa['net_p'] = data_rawa['dvc'] + data_rawa['prstkc'] + 2*data_rawa['pstkrv'] - data_rawa['sstk'] +* 'me' from rawa +data_rawa['nop'] = data_rawa['net_p'] / data_rawa['me'] +",1,1,1,0,"Per Boudoukh, Michaely, Richardson, and Roberts (2007), total payouts are dividends on common stock (Compustat annual item DVC) plus repurchases. Repurchases are the total expenditure on the purchase of common and preferred stocks (item PRSTKC) plus any reduction (negative change over the prior year) in the value of the net number of preferred stocks outstanding (item PSTKRV). Net payouts equal total payouts minus equity issuances, which are the sale of common and preferred stock (item SSTK) minus any increase (positive change over the prior year) in the value of the net number of preferred stocks outstanding (item PSTKRV). At the end of June of each year t, we sort stocks into deciles based on total payouts (net payouts) for the fiscal year ending in calendar year t - 1 divided by the market equity (from CRSP) at the end of December of t - 1 (Op and Nop, respectively). For firms with more than one share class, we merge the market equity for all share classes before computing Op and Nop. 
Firms with non-positive total payouts (zero net payouts) are excluded. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1. Because the data on total expenditure and the sale of common and preferred stocks start in 1971, the Op and Nop portfolios start in July 1972." +A.2.2,Bmj,Book-to-June-end market equity, Asness and Frazzini ,2013,Value-versus-growth,data_rawa['bmj'] = data_rawa['be_per'] / data_rawa['prc'] ,"# clean up csho +comp['csho'] = np.where(comp['csho'] == 0, np.nan, comp['csho']) +data_rawa['txditc'] = data_rawa['txditc'].fillna(0) +* 'ps' +data_rawa['be'] = data_rawa['seq'] + data_rawa['txditc'] - data_rawa['ps'] +data_rawa['be'] = np.where(data_rawa['be'] > 0, data_rawa['be'], np.nan) +data_rawa['be_per'] = data_rawa['be'] / data_rawa['csho'] +",0,1,0,0,"Following Asness and Frazzini (2013), at the end of June of each year t, we sort stocks into deciles based on Bmj, which is book equity per share for the fiscal year ending in calendar year t - 1 divided by share price (from CRSP) at the end of June of t. We adjust for any stock splits between the fiscal year end and the end of June. Book equity per share is book equity divided by the num- ber of shares outstanding (Compustat annual item CSHO). Following Davis, Fama, and French (2000), we measure book equity as stockholders' book equity, plus balance sheet deferred taxes and investment tax credit (item TXDITC) if available, minus the book value of preferred stock. Stockholders' equity is the value reported by Compustat (item SEQ), if it is available. If not, we measure stockholders' equity as the book value of common equity (item CEQ) plus the par value of preferred stock (item PSTK), or the book value of assets (item AT) minus total liabilities (item LT). Depending on availability, we use redemption (item PSTKRV), liquidating (item PSTKL), or par value (item PSTK) for the book value of preferred stock. 
Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.2.20,Em,Enterprise multiple,Loughran and Wellman,2011,Value-versus-growth,data_rawa['em'] = data_rawa['enteprs_v'] / data_rawa['oibdp'],"* 'me' from rawa +data_rawa['enteprs_v'] = data_rawa['me'] + data_rawa['dlc'] + data_rawa['dltt'] + data_rawa['pstkrv'] - data_rawa['che']",1,1,0,0,"Enterprise multiple, Em, is enterprise value divided by operating income before depreciation (Com- pustat annual item OIBDP). Enterprise value is the market equity plus the total debt (item DLC plus item DLTT) plus the book value of preferred stocks (item PSTKRV) minus cash and short- term investments (item CHE). At the end of June of each year t, we split stocks into deciles based on Em for the fiscal year ending in calendar year t-1. The Market equity (from CRSP) is measured at the end of December of t - 1. For firms with more than one share class, we merge the market equity for all share classes before computing Em. Firms with negative enterprise value or operating income before depreciation are excluded. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.2.21,"Emq1, Emq6","Quarterly Enterprise multiple (1-month holding period), Quarterly Enterprise multiple (6-month holding period)",Loughran and Wellman,2011,Value-versus-growth,,,,,,,"Emq, is enterprise value scaled by operating income before depreciation (Compustat quarterly item OIBDPQ). Enterprise value is the market equity plus total debt (item DLCQ plus item DLTTQ) plus the book value of preferred stocks (item PSTKQ) minus cash and short-term investments (item CHEQ). At the beginning of each month t, we split stocks into deciles on Emq for the latest fiscal quarter ending at least four months ago. The Market equity (from CRSP) is measured at the end of month t - 1. 
For firms with more than one share class, we merge the market equity for all share classes before computing Emq. Firms with negative enterprise value or operating income before depreciation are excluded. Monthly decile returns are calculated for the current month t (Emq1), from month t to t + 5 (Emq6), and from month t to t + 11 (Emq12), and the deciles are rebalanced at the beginning of t + 1. The holding period longer than one month as in Emq6 means that for a given decile in each month there exist six subdeciles, each initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Emq6 decile. For sufficient data coverage, the EMq portfolios start in January 1975." +A.2.22,Sp,Sales-to-price,"Barbee, Mukherji, and Raines",1996,Value-versus-growth,data_rawq['sp'] = data_rawq['saleq4']/data_rawq['me'],"data_rawq['saleq4'] = ttm4('saleq', data_rawq) +data_rawq['saleq4'] = np.where(data_rawq['saleq4'].isnull(), data_rawq['saley'], data_rawq['saleq4']) +* 'me' from rawq",1,0,1,0,"At the end of June of each year t, we sort stocks into deciles based on sales-to-price, Sp, which is sales (Compustat annual item SALE) for the fiscal year ending in calendar year t - 1 divided by the market equity (from CRSP) at the end of December of t - 1. For firms with more than one share class, we merge the market equity for all share classes before computing Sp. Firms with non-positive sales are excluded. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.2.23,"Sp q 1, Sp q 6, and Sp q 12",Quarterly Sales-to-price,"Barbee, Mukherji, and Raines",1996,Value-versus-growth,,,,,,,"At the beginning of each month t, we sort stocks into deciles based on quarterly sales-to-price, Spq, which is sales (Compustat quarterly item SALEQ) divided by the market equity at the end of month t - 1. 
Before 1972, we use quarterly sales from fiscal quarters ending at least four months prior to the portfolio formation. Starting from 1972, we use quarterly sales from the most recent quarterly earnings announcement dates (item RDQ). Sales are generally announced with earnings during quarterly earnings announcements (Jegadeesh and Livnat 2006). For a firm to enter the portfolio formation, we require the end of the fiscal quarter that corresponds to its most recent quarterly sales to be within six months prior to the portfolio formation. This restriction is imposed to exclude stale earnings information. To avoid potentially erroneous records, we also require the earnings announcement date to be after the corresponding fiscal quarter end. Firms with non- positive sales are excluded. For firms with more than one share class, we merge the market equity for all share classes before computing Spq. Monthly decile returns are calculated for the current month t (Spq1), from month t to t + 5 (Spq6), and from month t to t + 11 (Spq12), and the deciles are rebalanced at the beginning of t + 1. The holding period longer than one month as in Spq6 means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Spq6 decile." 
+A.2.24,Ocp,Operating Cash Flow-to-price,"Desai, Rajgopal, and Venkatachalam",2004,Value-versus-growth,"data_rawa['ocp'] = data_rawa['ocy'] / data_rawa['me'] +data_rawa['ocp'] = np.where(data_rawa['ocp']<=0, np.nan, data_rawa['ocp'] )","* 'me' from rawa +data_rawa['ocy'] = np.where(data_rawa['jdate'] < '1988-06-30', data_rawa['fopt'] - data_rawa['wcap'], data_rawa['fopt'] - data_rawa['oancf'])",1,1,0,0,"At the end of June of each year t, we sort stocks into deciles based on operating cash flows-to-price, Ocp, which is operating cash flows for the fiscal year ending in calendar year t - 1 divided by the market equity (from CRSP) at the end of December of t - 1. Operating cash flows are measured as funds from operation (Compustat annual item FOPT) minus change in working capital (item WCAP) prior to 1988, and then as net cash flows from operating activities (item OANCF) stating from 1988. For firms with more than one share class, we merge the market equity for all share classes before computing Ocp. Firms with non-positive operating cash flows are excluded. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t+1. Because the data on funds from operation start in 1971, the Ocp portfolios start in July 1972. +" +A.2.26,Ir,Intangible Return,Daniel and Titman,2006,Value-versus-growth," +#Regression and get ir +#First get unique datelist +datelist = data_rawa['jdate'].unique() +for date in datelist: + temp = data_rawa[data_rawa['jdate'] == date] + n_row = temp.shape[0] + index = temp.index + X = pd.DataFrame() + X['bm5'] = temp['bm5'] + X['rB'] = temp['rB'] + X['intercept'] = 1 + X = X[['intercept','rB','bm5']] + X = np.mat(X) + Y = np.mat(temp[['ret5']]) + #These are residuals on one date + res = (np.identity(n_row) - X.dot(X.T.dot(X).I).dot(X.T)).dot(Y) + #put residuals back into data_rawa + data_rawa.loc[index,'ir'] = res","* 'bm' from rawa +#ir +''' +#First calculate r(t-5,t). 
Then rb(t-5,t) and use Bm to perform linear regression and get residue +''' +#r(t-5,t):sum ret from t-5 to t (which is calendar year t-6 to t-1) +lag = pd.DataFrame() +for i in range(1,6): + lag['ret%s' % i] = data_rawa.groupby(['permno'])['ret'].shift(i) + +data_rawa['ret5'] = lag['ret1']+lag['ret2']+lag['ret3']+lag['ret4']+lag['ret5'] + +#bm_t-5 (bm of year t-5) +data_rawa['bm5'] = data_rawa.groupby(['permno'])['bm'].shift(5) + +#rB (five year log book return) +#Reference: jf_06 page8 by KENT DANIEL +data_rawa['rB'] = data_rawa['bm'] - data_rawa['bm5'] + data_rawa['ret5'] +",1,1,0,0,p77 +A.2.28,Ebp,Enterprise Book-to-price,"Penman, Richardson, and Tuna",2007,Value-versus-growth,data_rawa['ebp'] = (data_rawa['n_debt']+data_rawa['ber']) / (data_rawa['n_debt']+data_rawa['me']),"* 'me' from rawa +#Ebp +data_rawa['dvpa'] = np.where(data_rawa['dvpa'].isnull(), 0, data_rawa['dvpa']) +data_rawa['tstkp'] = np.where(data_rawa['tstkp'].isnull(), 0, data_rawa['tstkp']) +data_rawa['f_liab'] = data_rawa['dltt'] + data_rawa['dlc'] + data_rawa['pstk'] + data_rawa['dvpa'] - data_rawa['tstkp'] +data_rawa['f_asse'] = data_rawa['che'] +# net debt : = financial liabilities - financial assets. +data_rawa['n_debt'] = data_rawa['f_liab'] - data_rawa['f_asse'] +data_rawa['ber'] = data_rawa['ceq'] + data_rawa['tstkp'] - data_rawa['dvpa']",1,1,0,0,"Following Penman, Richardson, and Tuna (2007), we measure enterprise book-to-price, Ebp, as the ratio of the book value of net operating assets (net debt plus book equity) to the market value of net operating assets (net debt plus market equity). Net Debt-to-price, Ndp, is the ratio of net debt to the market equity. Net debt is financial liabilities minus financial assets.
We measure financial liabilities as the sum of long-term debt (Compustat annual item DLTT), debt in current liabilities (item DLC), carrying value of preferred stock (item PSTK), and preferred dividends in arrears (item DVPA, zero if missing), less preferred treasury stock (item TSTKP, zero if missing). We measure financial assets as cash and short-term investments (item CHE). Book equity is common equity (item CEQ) plus any preferred treasury stock (item TSTKP, zero if missing) less any pre- ferred dividends in arrears (item DVPA, zero if missing). Market equity is the number of common shares outstanding times share price (from CRSP). +At the end of June of each year t, we sort stocks into deciles based on Ebp, and separately, on Ndp, for the fiscal year ending in calendar year t - 1. Market equity is measured at the end of December of t - 1. For firms with more than one share class, we merge the market equity for all share classes before computing Ebp and Ndp. When forming the Ebp portfolios, we exclude firms with non-positive book or market value of net operating assets. For the Ndp portfolios, we exclude firms with non-positive net debt. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.2.3,Bmq12,Quarterly Book-to-market Equity (12-month holding period),"Rosenberg, Reid, and Lanstein ",1985,Value-versus-growth,,,,,,,p70 +A.2.9,Ep,Earnings-to-price,Basu,1983,Value-versus-growth,data_rawq['ep'] = data_rawq['ibq4']/data_rawq['me'],"data_rawq['ibq4'] = ttm4('ibq', data_rawq) +* 'me' from rawq +",1,0,1,0,"At the end of June of each year t, we split stocks into deciles based on earnings-to-price, Ep, which is income before extraordinary items (Compustat annual item IB) for the fiscal year ending in calendar year t - 1 divided by the market equity (from CRSP) at the end of December of t - 1. 
For firms with more than one share class, we merge the market equity for all share classes before com- puting Ep. Firms with non-positive earnings are excluded. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.3.1,Aci,Abnormal Corporate Investment,"Titman, Wei, and Xie",2004,Investment,data_rawa['aci'] = data_rawa['ce']/ (data_rawa['ce1']+data_rawa['ce2']+data_rawa['ce3'])-1,"data_rawa['ce'] = data_rawa['capx'] / data_rawa['sale'] +data_rawa['ce1'] = data_rawa['ce'].shift(1) +data_rawa['ce2'] = data_rawa['ce'].shift(2) +data_rawa['ce3'] = data_rawa['ce'].shift(3)",0,1,0,0,"At the end of June of year t, we measure abnormal corporate investment, Aci, as Cet-1/[(Cet-2 + Cet-3 + Cet-4)/3] - 1, in which Cet-j is capital expenditure (Compustat annual item CAPX) scaled by sales (item SALE) for the fiscal year ending in calendar year t - j. The last three-year average capital expenditure is designed to project the benchmark investment in the portfolio formation year. We exclude firms with sales less than ten million dollars. At the end of June of each year t, we sort stocks into deciles based on Aci. Monthly decile returns are computed from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.3.10,Nsi,Net Stock Issues,Pontiff and Woodgate,2008,Investment,data_rawa['nsi'] = np.log(data_rawa['sps']/data_rawa['sps_l1']),"data_rawa['sps'] = data_rawa['csho'] * data_rawa['ajex'] +data_rawa['sps_l1'] = data_rawa.groupby('permno')['sps'].shift(1)",0,1,0,0,"At the end of June of year t, we measure net stock issues, Nsi, as the natural log of the ratio of the split-adjusted shares outstanding at the fiscal year ending in calendar year t-1 to the split-adjusted shares outstanding at the fiscal year ending in t-2. The split-adjusted shares outstanding is shares outstanding (Compustat annual item CSHO) times the adjustment factor (item AJEX). 
At the end of June of each year t, we sort stocks with negative Nsi into two portfolios (1 and 2), stocks with zero Nsi into one portfolio (3), and stocks with positive Nsi into seven portfolios (4 to 10). Monthly decile returns are from July of year t to June of t+1, and the deciles are rebalanced in June of t+1." +A.3.11,dIi,% Change in Investment - % Change in Industry Investment,Abarbanell and Bushee,1998,Investment,data_rawa['dIi'] = data_rawa['dinvt'] - data_rawa['dind'],"data_rawa['e_invt'] = (data_rawa['capxv'] + data_rawa['capxv'].shift(1))/2 +data_rawa['dinvt'] = (data_rawa['capxv'] - data_rawa['e_invt']) / data_rawa['e_invt'] + +data_rawa['ind'] = data_rawa['capxv'] +s = data_rawa.groupby(['jdate', 'sic2'])['ind'].sum() +data_rawa = pd.merge(data_rawa, s, on=['jdate', 'sic2']) +# new industry investment will be named as ind_y, cause it's been grouped by ind +data_rawa['e_ind'] = (data_rawa['ind_y'] + data_rawa['ind_y'].shift(1))/2 +data_rawa['dind'] = (data_rawa['ind_y']-data_rawa['e_ind']) / data_rawa['e_ind']",0,1,0,0,"Following Abarbanell and Bushee (1998), we define the %d(.) operator as the percentage change in the variable in the parentheses from its average over the prior two years, e.g., %d(Investment) = [Investment(t) - E[Investment(t)]]/E[Investment(t)], in which E[Investment(t)] = [Investment(t-1) + Investment(t - 2)]/2. dIi is defined as %d(Investment) - %d(Industry investment), in which investment is capital expenditure in property, plant, and equipment (Compustat annual item CAPXV). Industry investment is the aggregate investment across all firms with the same two- digit SIC code. Firms with non-positive E[Investment(t)] are excluded and we require at least two firms in each industry. At the end of June of each year t, we sort stocks into deciles based on dIi for the fiscal year ending in calendar year t - 1. Monthly decile returns are calculated from July of year t to June of t+1, and the deciles are rebalanced in June of t+1." 
+A.3.14,Ivg,Inventory Growth,Belo and Lin,2011,Investment,,,,,,,"At the end of June of each year t, we sort stocks into deciles based on inventory growth, Ivg, which is the annual growth rate in inventory (Compustat annual item INVT) from the fiscal year ending in calendar year t - 2 to the fiscal year ending in t - 1. Monthly decile returns are calculated from July of year t to June of t+1, and the deciles are rebalanced in June of t+1." +A.3.15,Ivc,Inventory Changes,Thomas and Zhang,2002,Investment,data_rawa['ivc'] = data_rawa['invt'] / data_rawa['atAvg'],"data_rawa['at_l1'] = data_rawa.groupby(['permno'])['at'].shift(1) +data_rawa['atAvg'] = (data_rawa['at']+data_rawa['at_l1'])/2",1,1,0,0,"At the end of June of each year t, we sort stocks into deciles based on inventory changes, Ivc, which is the annual change in inventory (Compustat annual item INVT) scaled by the average of total assets (item AT) for the fiscal years ending in t - 2 and t - 1. We exclude firms that carry no inventory for the past two fiscal years. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." 
+A.3.16,Oa(acc),Operating Accruals,Sloan,1996,Investment,"data_rawq['acc'] = np.select(condlist, choicelist, + default=((data_rawq['actq']-data_rawq['lctq']+data_rawq['npq'])- + (data_rawq['actq_l4']-data_rawq['lctq_l4']+data_rawq['npq_l4']))/(10*data_rawq['beq'])) +","#prepare be +data_rawq['beq'] = np.where(data_rawq['seqq']>0, data_rawq['seqq']+data_rawq['txditcq']-data_rawq['pstkq'], np.nan) +data_rawq['beq'] = np.where(data_rawq['beq']<=0, np.nan, data_rawq['beq']) +# acc +data_rawq['actq_l4'] = data_rawq.groupby(['permno'])['actq'].shift(4) +data_rawq['lctq_l4'] = data_rawq.groupby(['permno'])['lctq'].shift(4) +data_rawq['npq_l4'] = data_rawq.groupby(['permno'])['npq'].shift(4) +condlist = [data_rawq['npq'].isnull(), + data_rawq['actq'].isnull() | data_rawq['lctq'].isnull()] +choicelist = [((data_rawq['actq']-data_rawq['lctq'])-(data_rawq['actq_l4']-data_rawq['lctq_l4']))/(10*data_rawq['beq']), + np.nan]",0,0,1,0,"Prior to 1988, we use the balance sheet approach in Sloan (1996) to measure operating accruals, Oa, as changes in noncash working capital minus depreciation, in which the noncash working capital is changes in noncash current assets minus changes in current liabilities less short-term debt and taxes payable. In particular, Oa equals (dCA-dCASH)-(dCL-dSTD-dTP)-DP, in which dCA is the change in current assets (Compustat annual item ACT), dCASH is the change in cash or cash equiv- alents (item CHE), dCL is the change in current liabilities (item LCT), dSTD is the change in debt included in current liabilities (item DLC), dTP is the change in income taxes payable (item TXP), and DP is depreciation and amortization (item DP). Missing changes in income taxes payable are set to zero. Starting from 1988, we follow Hribar and Collins (2002) to measure Oa using the state- ment of cash flows as net income (item NI) minus net cash flow from operations (item OANCF). 
Doing so helps mitigate measurement errors that can arise from nonoperating activities such as acquisitions and divestitures. Data from the statement of cash flows are only available since 1988. At the end of June of each year t, we sort stocks into deciles on Oa for the fiscal year ending in calendar year t - 1 scaled by total assets (item AT) for the fiscal year ending in t - 2. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1. +" +A.3.17,Ta,Total Accruals,"Richardson, Sloan, Soliman, and Tuna",2005,Investment,data_rawa['ta'] = data_rawa['dwc'] + data_rawa['dnco'] + data_rawa['dfin'],"#dwc +data_rawa['dwc'] = (data_rawa['act'] - data_rawa['che']) - (data_rawa['lct'] - data_rawa['dlc']) +* dnco +* dfin +",0,1,0,0,"Prior to 1988, we use the balance sheet approach in Richardson, Sloan, Soliman, and Tuna (2005) to measure total accruals, Ta, as dWc + dNco + dFin. dWc is the change in net non-cash working capital. Net non-cash working capital is current operating asset (Coa) minus current operating liabilities (Col), with Coa = current assets (Compustat annual item ACT) - cash and short-term investments (item CHE) and Col = current liabilities (item LCT) - debt in current liabilities (item DLC). dNco is the change in net non-current operating assets. Net non-current operating assets are non-current operating assets (Nca) minus non-current operating liabilities (Ncl), with Nca = total assets (item AT) - current assets - long-term investments (item IVAO), and Ncl = total liabilities (item LT) - current liabilities - long-term debt (item DLTT). dFin is the change in net financial assets. Net financial assets are financial assets (Fna) minus financial liabilities (Fnl), with Fna = short-term investments (item IVST) + long-term investments, and Fnl = long-term debt + debt in current liabilities + preferred stocks (item PSTK). 
Missing changes in debt in current liabilities, long-term investments, long-term debt, short-term investments, and preferred stocks are set to zero. +Starting from 1988, we use the cash flow approach to measure Ta as net income (item NI) minus total operating, investing, and financing cash flows (items OANCF, IVNCF, and FINCF) plus sales of stocks (item SSTK, zero if missing) minus stock repurchases and dividends (items PRSTKC and DV, zero if missing). Data from the statement of cash flows are only available since 1988. At the end of June of each year t, we sort stocks into deciles based on Ta for the fiscal year ending in calendar year t - 1 scaled by total assets for the fiscal year ending in t - 2. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.3.18,dCoa,changes in Current Operating Assets,"Richardson, Sloan, Soliman, and Tuna",2005,Investment,data_rawa['dcoa'] = (data_rawa['coa']-data_rawa['coa'].shift(1)) / data_rawa['at'].shift(1),"# dCoa +data_rawa['coa'] = data_rawa['act'] - data_rawa['che']",0,1,0,0,"Richardson, Sloan, Soliman, and Tuna (2005, Table 10) show that several components of total accruals also forecast returns in the cross section. dWc is the change in net non-cash working capital. Net non-cash working capital is current operating asset (Coa) minus current operating liabilities (Col), with Coa = current assets (Compustat annual item ACT) - cash and short term investments (item CHE) and Col = current liabilities (item LCT) - debt in current liabilities (item DLC). dCoa is the change in current operating asset and dCol is the change in current operating liabilities. Missing changes in debt in current liabilities are set to zero. 
At the end of June of each year t, we sort stocks into deciles based, separately, on dWc, dCoa, and dCol for the fiscal year ending in calendar year t - 1, all scaled by total assets (item AT) for the fiscal year ending in calendar year t - 2. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.3.19,dNca,changes in Non-current Operating Assets,"Richardson, Sloan, Soliman, and Tuna",2005,Investment,data_rawa['dnca'] = data_rawa['nco'] - data_rawa['nco'].shift(1),"# dNca +data_rawa['ivao_0'] = np.where(data_rawa['ivao'].isnull(), 0, data_rawa['ivao']) +data_rawa['dltt_0'] = np.where(data_rawa['dltt'].isnull(), 0, data_rawa['dltt']) +data_rawa['nca'] = data_rawa['at'] - data_rawa['act'] - data_rawa['ivao_0'] +data_rawa['ncl'] = data_rawa['lt'] - data_rawa['lct'] - data_rawa['dltt_0'] +data_rawa['nco'] = data_rawa['nca'] - data_rawa['ncl'] +",0,1,0,0,"dNco is the change in net non-current operating assets. Net non-current operating assets are non- current operating assets (Nca) minus non-current operating liabilities (Ncl), with Nca = total assets (Compustat annual item AT) - current assets (item ACT) - long-term investments (item IVAO), and Ncl = total liabilities (item LT) - current liabilities (item LCT) - long-term debt (item DLTT). dNca is the change in non-current operating assets and dNcl is the change in non-current operating liabilities. Missing changes in long-term investments and long-term debt are set to zero. At the end of June of each year t, we sort stocks into deciles based, separately, on dNco, dNca, and dNcl for the fiscal year ending in calendar year t - 1, all scaled by total assets for the fiscal year ending in calendar year t - 2. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." 
+A.3.19,dNco,Changes in Net Non-current Operating Assets,"Richardson, Sloan, Soliman, and Tuna",2005,Investment,data_rawa['dnco'] = data_rawa['nco'] - data_rawa['nco'].shift(1),"# dNco +data_rawa['nca'] = data_rawa['at'] - data_rawa['act'] - data_rawa['ivao'] +data_rawa['ncl'] = data_rawa['lt'] - data_rawa['lct'] - data_rawa['dltt'] +data_rawa['nco'] = data_rawa['nca'] - data_rawa['ncl']",0,1,0,0,"dNco is the change in net non-current operating assets. Net non-current operating assets are non- current operating assets (Nca) minus non-current operating liabilities (Ncl), with Nca = total assets (Compustat annual item AT) - current assets (item ACT) - long-term investments (item IVAO), and Ncl = total liabilities (item LT) - current liabilities (item LCT) - long-term debt (item DLTT). dNca is the change in non-current operating assets and dNcl is the change in non-current operating liabilities. Missing changes in long-term investments and long-term debt are set to zero. At the end of June of each year t, we sort stocks into deciles based, separately, on dNco, dNca, and dNcl for the fiscal year ending in calendar year t - 1, all scaled by total assets for the fiscal year ending in calendar year t - 2. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.3.2,I/A, Investment-to-assets,"Cooper, Gulen, and Schill",2008,Investment,data_rawa['ia'] = (data_rawa['at']/data_rawa['at_l1'])-1,data_rawa['at_l1'] = data_rawa.groupby(['permno'])['at'].shift(1),1,1,0,0,"At the end of June of each year t, we sort stocks into deciles based on investment-to-assets, I/A, which is measured as total assets (Compustat annual item AT) for the fiscal year ending in calendar year t-1 divided by total assets for the fiscal year ending in t-2 minus one. Monthly decile returns are computed from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." 
+A.3.20,dBe,Changes in Net Financial Assets,"Richardson, Sloan, Soliman, and Tuna",2005,Investment,data_rawa['dBe'] = (data_rawa['ceq'] - data_rawa['ceq'].shift(1)) / data_rawa['at'].shift(1),,0,1,0,0,"dFin is the change in net financial assets. Net financial assets are financial assets (Fna) minus financial liabilities (Fnl), with Fna = short-term investments (Compustat annual item IVST) + long-term investments (item IVAO), and Fnl = long-term debt (item DLTT) + debt in current liabilities (item DLC) + preferred stock (item PSTK). dSti is the change in short-term investments, dLti is the change in long-term investments, and dFnl is the change in financial liabilities. dBe is the change in book equity (item CEQ). Missing changes in debt in current liabilities, long-term investments, long-term debt, short-term investments, and preferred stocks are set to zero (at least one change has to be non-missing when constructing any variable). When constructing dSti (dLti), we exclude firms that do not have long-term (short-term) investments in the past two fiscal years. At the end of June of each year t, we sort stocks into deciles based, separately, on dFin, dSti, dLti, dFnl, and dBe for the fiscal year ending in calendar year t - 1, all scaled by total assets (item AT) for the fiscal year ending in calendar year t - 2. Monthly decile returns are calculated from July of year t to June of t+1, and the deciles are rebalanced in June of t+1." 
+A.3.20,dFin,changes in Financial Liabilities,"Richardson, Sloan, Soliman, and Tuna",2005,Investment,data_rawa['dfin'] = data_rawa['dfin'] / data_rawa['at'].shift(1),"data_rawa['fna'] = data_rawa['ivst'] + data_rawa['ivao'] +data_rawa['fnl'] = data_rawa['dltt'] + data_rawa['dlc'] + data_rawa['pstk'] + +data_rawa['d_dlc'] = data_rawa['dlc'] - data_rawa['dlc'].shift(1) +data_rawa['d_dlc'] = np.where(data_rawa['d_dlc'].isnull(), 0, data_rawa['d_dlc']) +data_rawa['d_pstk'] = data_rawa['pstk'] - data_rawa['pstk'].shift(1) +data_rawa['d_pstk'] = np.where(data_rawa['d_pstk'].isnull(), 0, data_rawa['d_pstk']) + +data_rawa['dfnl'] = (data_rawa['dltt']-data_rawa['dltt'].shift(1)) + data_rawa['d_dlc'] + data_rawa['d_pstk'] + +data_rawa['d_ivst'] = data_rawa['ivst'] - data_rawa['ivst'].shift(1) +data_rawa['d_ivst'] = np.where(data_rawa['d_ivst'].isnull(), 0, data_rawa['d_ivst']) +data_rawa['d_ivao'] = data_rawa['ivao'] - data_rawa['ivao'].shift(1) +data_rawa['d_ivao'] = np.where(data_rawa['d_ivao'].isnull(), 0, data_rawa['d_ivao']) + +data_rawa['dfna'] = data_rawa['d_ivst'] + data_rawa['d_ivao'] +data_rawa['dfin'] = data_rawa['dfna'] - data_rawa['dfnl']",0,1,0,0,"dFin is the change in net financial assets. Net financial assets are financial assets (Fna) minus financial liabilities (Fnl), with Fna = short-term investments (Compustat annual item IVST) + long-term investments (item IVAO), and Fnl = long-term debt (item DLTT) + debt in current liabilities (item DLC) + preferred stock (item PSTK). dSti is the change in short-term investments, dLti is the change in long-term investments, and dFnl is the change in financial liabilities. dBe is the change in book equity (item CEQ). Missing changes in debt in current liabilities, long-term investments, long-term debt, short-term investments, and preferred stocks are set to zero (at least one change has to be non-missing when constructing any variable). 
When constructing dSti (dLti), we exclude firms that do not have long-term (short-term) investments in the past two fiscal years. At the end of June of each year t, we sort stocks into deciles based, separately, on dFin, dSti, dLti, dFnl, and dBe for the fiscal year ending in calendar year t - 1, all scaled by total assets (item AT) for the fiscal year ending in calendar year t - 2. Monthly decile returns are calculated from July of year t to June of t+1, and the deciles are rebalanced in June of t+1." +A.3.20,dFnl,changes in Book Equity,"Richardson, Sloan, Soliman, and Tuna",2005,Investment,data_rawa['dfnl'] = data_rawa['dfnl'] / data_rawa['at'].shift(1),* dfnl in dFin,0,1,0,0,"dFin is the change in net financial assets. Net financial assets are financial assets (Fna) minus financial liabilities (Fnl), with Fna = short-term investments (Compustat annual item IVST) + long-term investments (item IVAO), and Fnl = long-term debt (item DLTT) + debt in current liabilities (item DLC) + preferred stock (item PSTK). dSti is the change in short-term investments, dLti is the change in long-term investments, and dFnl is the change in financial liabilities. dBe is the change in book equity (item CEQ). Missing changes in debt in current liabilities, long-term investments, long-term debt, short-term investments, and preferred stocks are set to zero (at least one change has to be non-missing when constructing any variable). When constructing dSti (dLti), we exclude firms that do not have long-term (short-term) investments in the past two fiscal years. At the end of June of each year t, we sort stocks into deciles based, separately, on dFin, dSti, dLti, dFnl, and dBe for the fiscal year ending in calendar year t - 1, all scaled by total assets (item AT) for the fiscal year ending in calendar year t - 2. Monthly decile returns are calculated from July of year t to June of t+1, and the deciles are rebalanced in June of t+1." 
+A.3.22,Poa,Percent operating accruals,"Hafzalla, Lundholm, and Van Winkle",2011,Investment,data_rawa['poa'] = data_rawa['oa']/data_rawa['ni'],* oa(acc),0,1,0,0,"Accruals are traditionally scaled by total assets. Hafzalla, Lundholm, and Van Winkle (2011) show that scaling accruals by the absolute value of earnings (percent accruals) is more effective in se- lecting firms for which the differences between sophisticated and naive forecasts of earnings are the most extreme. To construct the percent operating accruals (Poa) deciles, at the end of June of each year t, we sort stocks into deciles based on operating accruals scaled by the absolute value of net income (Compustat annual item NI) for the fiscal year ending in calendar year t - 1. See Appendix A.3.16 for the measurement of operating accruals. Monthly decile returns are calculated from July of year t to June of t+1, and the deciles are rebalanced in June of t+1." +A.3.23,Pta,Percent total accruals,"Hafzalla, Lundholm, and Van Winkle",2011,Investment,,,,,,,"At the end of June of each year t, we sort stocks into deciles on percent total accruals, Pta, cal- culated as total accruals scaled by the absolute value of net income (Compustat annual item NI) for the fiscal year ending in calendar year t - 1. See Appendix A.3.17 for the measurement of total accruals. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of year t + 1." +A.3.24,Pda,Percent discretionary accruals,,,Investment,,,,,,,"At the end of June of each year t, we split stocks into deciles based on percent discretionary accruals, Pda, calculated as the discretionary accruals, Dac, for the fiscal year ending in calendar year t - 1 multiplied with total assets (Compustat annual item AT) for the fiscal year ending in t - 2 scaled by the absolute value of net income (item NI) for the fiscal year ending in t - 1. See Appendix A.3.21 for the measurement of discretionary accruals. 
Monthly decile returns are calculated from July of year t to June of t+1, and the deciles are rebalanced in June of t+1." +A.3.25,Ndf,Net debt finance,"Bradshaw, Richardson, and Sloan",2006,Investment,data_rawa['ndf'] = data_rawa['dltis'] - data_rawa['dltr'] + data_rawa['dlcch'] ,,0,1,0,0,"Net external financing, Nxf, is the sum of net equity financing, Nef, and net debt financing, Ndf (Bradshaw, Richardson, and Sloan 2006). Nef is the proceeds from the sale of common and pre- ferred stocks (Compustat annual item SSTK) less cash payments for the repurchases of common and preferred stocks (item PRSTKC) less cash payments for dividends (item DV). Ndf is the cash proceeds from the issuance of long-term debt (item DLTIS) less cash payments for long-term debt reductions (item DLTR) plus the net changes in current debt (item DLCCH, zero if missing). At the end of June of each year t, we sort stocks into deciles based on Nxf, and, separately, on Nef and Ndf, for the fiscal year ending in calendar year t - 1 scaled by the average of total assets for fiscal years ending in t - 2 and t - 1. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1. Because the data on financing activities start in 1971, the portfolios start in July 1972." +A.3.3,"Ia q 6, and Ia q 12",Quarterly Investment-to-assets,"Cooper, Gulen, and Schill",2008,Investment,data_rawq['iaq'] = (data_rawq['atq']/data_rawq['atqlag'])-1,"data_rawq['atqlag'] = ttm4('atq',data_rawq)",0,0,1,0,"Quarterly investment-to-assets, Iaq, is defined as quarterly total assets (Compustat quarterly item ATQ) divided by four-quarter-lagged total assets minus one. At the beginning of each month t, we sort stocks into deciles based on Iaq for the latest fiscal quarter ending at least four months ago. 
Monthly decile returns are calculated for the current month t (Iaq1), from month t to t + 5 (Iaq6), and from month t to t + 11 (Iaq12), and the deciles are rebalanced at the beginning of month t + 1. The holding period longer than one month as in, for instance, Iaq6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Iaq6 decile." +A.3.4,dPia,Changes in PPE and Inventory-to-assets,"Lyandres, Sun, and Zhang",2008,Investment,data_rawa['dpia'] = (data_rawa['c_propty'] + data_rawa['c_invt']) / data_rawa['at'].shift(1),"data_rawa['c_propty'] = data_rawa['ppegt'] - data_rawa['ppegt'].shift(1) +data_rawa['c_invt'] = data_rawa['invt'] - data_rawa['invt'].shift(1)",0,1,0,0,"Changes in PPE and Inventory-to-assets, dPia, is defined as the annual change in gross property, plant, and equipment (Compustat annual item PPEGT) plus the annual change in inventory (item INVT) scaled by one-year-lagged total assets (item AT). At the end of June of each year t, we sort stocks into deciles based on dPia for the fiscal year ending in calendar year t-1. Monthly decile re- turns are computed from July of year t to June of t+1, and the deciles are rebalanced in June of t+1." 
+A.3.5,Noa and dNoa,(Changes in) Net Operating Assets,"Hirshleifer, Hou, Teoh, and Zhang",2004,Investment,"data_rawq['noa'] = (data_rawq['atq']-data_rawq['cheq']-data_rawq['ivaoq'])-\ + (data_rawq['atq']-data_rawq['dlcq']-data_rawq['dlttq']-data_rawq['mibq']-data_rawq['pstkq']-data_rawq['ceqq'])/data_rawq['atq_l4'] +data_rawa['dnoa'] = (data_rawa['net_op']-data_rawa['net_op'].shift(1))/ data_rawa['at'].shift(1) +","#noa +data_rawq['atq_l4'] = data_rawq.groupby(['permno'])['atq'].shift(4) +data_rawq['ivaoq'] = np.where(data_rawq['ivaoq'].isnull(), 0, 1) +data_rawq['dlcq'] = np.where(data_rawq['dlcq'].isnull(), 0, 1) +data_rawq['dlttq'] = np.where(data_rawq['dlttq'].isnull(), 0, 1) +data_rawq['mibq'] = np.where(data_rawq['mibq'].isnull(), 0, 1) +data_rawq['pstkq'] = np.where(data_rawq['pstkq'].isnull(), 0, 1) +# dNoa +data_rawa['dlc_0'] = np.where(data_rawa['dlc'].isnull(), 0, data_rawa['dlc']) +data_rawa['dltt_0'] = np.where(data_rawa['dltt'].isnull(), 0, data_rawa['dltt']) +data_rawa['mib_0'] = np.where(data_rawa['mib'].isnull(), 0, data_rawa['mib']) +data_rawa['pstk_0'] = np.where(data_rawa['pstk'].isnull(), 0, data_rawa['pstk']) + +data_rawa['op_at'] = data_rawa['at'] - data_rawa['che'] +data_rawa['op_lia'] = data_rawa['at'] - data_rawa['dlc_0'] - data_rawa['dltt_0'] - data_rawa['mib_0'] - data_rawa['pstk_0'] - data_rawa['ceq'] +data_rawa['net_op'] = data_rawa['op_at'] - data_rawa['op_lia']",1,1,1,0,"Following Hirshleifer, Hou, Teoh, and Zhang (2004), we measure net operating assets as operating assets minus operating liabilities. Operating assets are total assets (Compustat annual item AT) minus cash and short-term investment (item CHE). Operating liabilities are total assets minus debt included in current liabilities (item DLC, zero if missing), minus long-term debt (item DLTT, zero if missing), minus minority interests (item MIB, zero if missing), minus preferred stocks (item PSTK, zero if missing), and minus common equity (item CEQ). 
Noa is net operating assets scaled by one-year-lagged total assets. Changes in net operating assets, dNoa, is the annual change in net operating assets scaled by one-year-lagged total assets. At the end of June of each year t, we sort stocks into deciles based on Noa, and separately, on dNOA, for the fiscal year ending in calendar year t - 1. Monthly decile returns are computed from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.3.6,dLno,Changes in Long-term Net Operating Assets,"Fairfield, Whisenant, and Yohn",2003,Investment,"data_rawa['dlno'] = (data_rawa['ppent']-data_rawa['ppent'].shift(1)) + (data_rawa['intan']-data_rawa['intan'].shift(1)) + (data_rawa['ao']-data_rawa['ao'].shift(1)) - (data_rawa['lo']-data_rawa['lo'].shift(1)) + data_rawa['dp'] +* +data_rawa['dlno'] = data_rawa['dlno'] / data_rawa['avg_at']","* +avg_at = [] +for i in range(data_rawa.shape[0]): + avg_at.append(data_rawa.loc[0:i, 'at'].mean()) +data_rawa['avg_at'] = pd.DataFrame(avg_at)",0,1,0,0,"Following Fairfield, Whisenant, and Yohn (2003), we measure changes in long-term net operating assets as the annual change in net property, plant, and equipment (Compustat item PPENT) plus the change in intangibles (item INTAN) plus the change in other long-term assets (item AO) minus the change in other long-term liabilities (item LO) and plus depreciation and amortization expense (item DP). dLno is the change in long-term net operating assets scaled by the average of total assets (item AT) from the current and prior years. At the end of June of each year t, we sort stocks into deciles based on dLno for the fiscal year ending in calendar year t - 1. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." 
+A.3.7,Ig,Investment Growth,Xing,2008,Investment,data_rawa['ig'] = data_rawa['capx']/data_rawa['capx_l1'],data_rawa['capx_l1'] = data_rawa.groupby('permno')['capx'].shift(1),1,1,0,0,"At the end of June of each year t, we sort stocks into deciles based on investment growth, Ig, which is the growth rate in capital expenditure (Compustat annual item CAPX) from the fiscal year ending in calendar year t - 2 to the fiscal year ending in t - 1. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.3.8,2Ig,2-year Investment Growth,Anderson and Garcia-Feijoo,2006,Investment,data_rawa['2ig'] = data_rawa['capx']/data_rawa['capx_l2'],data_rawa['capx_l2'] = data_rawa.groupby('permno')['capx'].shift(2),0,1,0,0,"At the end of June of each year t, we sort stocks into deciles based on two-year investment growth, 2Ig, which is the growth rate in capital expenditure (Compustat annual item CAPX) from the fiscal year ending in calendar year t - 3 to the fiscal year ending in t - 1. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.4.1,"Roe1, Roe6",Return on Equity,"Hou, Xue, and Zhang",2015,Profitability,data_rawq['roe'] = data_rawq['ibq']/data_rawq['ceqq_l1'],data_rawq['ceqq_l1'] = data_rawq.groupby(['permno'])['ceqq'].shift(1),1,0,1,0,"Return on equity, Roe, is income before extraordinary items (Compustat quarterly item IBQ) di- vided by one-quarter-lagged book equity (Hou, Xue, and Zhang 2015). Book equity is shareholders' equity, plus balance sheet deferred taxes and investment tax credit (item TXDITCQ) if available, minus the book value of preferred stock (item PSTKQ). Depending on availability, we use stockhold- ers' equity (item SEQQ), or common equity (item CEQQ) plus the book value of preferred stock, or total assets (item ATQ) minus total liabilities (item LTQ) in that order as shareholders' equity." 
+A.4.11,"Gla q 1, Gla q 6, and Gla q 12",Quarterly Gross Profits-to-lagged Assets,,,Profitability,,,,,,,"Glaq, is quarterly total revenue (Compustat quarterly item REVTQ) minus cost of goods sold (item COGSQ) divided by one-quarter-lagged total assets (item ATQ). At the beginning of each month t, we sort stocks into deciles based on Glaq for the fiscal quarter ending at least four months ago. Monthly decile returns are calculated for month t (Glaq1), from month t to t+5 (Glaq6), and from month t to t + 11 (Glaq12). The deciles are rebalanced at the beginning of t + 1. The holding period that is longer than one month as in, for instance, Glaq6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six-month period. We take the simple average of the subdecile returns as the monthly return of the Glaq6 decile. For sufficient data coverage, the Glaq portfolios start in January 1976." +A.4.12,Ope(operprof),Operating Profits to Equity,Fama and French,2015,Profitability,data_rawa['operprof'] = (data_rawa['revt']-data_rawa['cogs']-data_rawa['xsga0']-data_rawa['xint0'])/data_rawa['ceq_l1'],"data_rawa['cogs0'] = np.where(data_rawa['cogs'].isnull(), 0, data_rawa['cogs']) +data_rawa['xint0'] = np.where(data_rawa['xint'].isnull(), 0, data_rawa['xint']) +data_rawa['xsga0'] = np.where(data_rawa['xsga'].isnull(), 0, data_rawa['xsga'])",0,1,0,0,"Following Fama and French (2015), we measure operating profitability to equity, Ope, as total rev- enue (Compustat annual item REVT) minus cost of goods sold (item COGS, zero if missing), minus selling, general, and administrative expenses (item XSGA, zero if missing), and minus interest ex- pense (item XINT, zero if missing), scaled by book equity (the denominator is current, not lagged, book equity). We require at least one of the three expense items (COGS, XSGA, and XINT) to be non-missing. 
Book equity is stockholders' book equity, plus balance sheet deferred taxes and investment tax credit (item TXDITC) if available, minus the book value of preferred stock. Stock- holders' equity is the value reported by Compustat (item SEQ), if it is available. If not, we measure stockholders' equity as the book value of common equity (item CEQ) plus the par value of preferred stock (item PSTK), or the book value of assets (item AT) minus total liabilities (item LT). Depend- ing on availability, we use redemption (item PSTKRV), liquidating (item PSTKL), or par value (item PSTK) for the book value of preferred stock. At the end of June of each year t, we sort stocks into deciles based on Ope for the fiscal year ending in calendar year t - 1. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.4.14,"Ole q 1, Ole q 6 ",Quarterly Operating Profits-to-lagged Equity,,,Profitability,,,,,,,"Quarterly operating profits-to-lagged equity, Oleq, is quarterly total revenue (Compustat quarterly item REVTQ) minus cost of goods sold (item COGSQ, zero if missing), minus selling, general, and administrative expenses (item XSGAQ, zero if missing), and minus interest expense (item XINTQ, zero if missing), scaled by one-quarter-lagged book equity. We require at least one of the three expense items (COGSQ, XSGAQ, and XINTQ) to be non-missing. Book equity is shareholders' equity, plus balance sheet deferred taxes and investment tax credit (item TXDITCQ) if available, minus the book value of preferred stock (item PSTKQ). Depending on availability, we use stockhold- ers' equity (item SEQQ), or common equity (item CEQQ) plus the book value of preferred stock, or total assets (item ATQ) minus total liabilities (item LTQ) in that order as shareholders' equity. +At the beginning of each month t, we split stocks on Oleq for the fiscal quarter ending at least four months ago. 
Monthly decile returns are calculated for month t (Oleq 1), from month t to t + 5 (Oleq6), and from month t to t + 11 (Oleq12). The deciles are rebalanced at the beginning of t + 1. The holding period longer than one month as in Oleq6 means that for a given decile in each month there exist six subdeciles, each initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Oleq6 decile. For sufficient data coverage, the Oleq portfolios start in January 1972." +A.4.15,Opa,Operating Profits-to-assets,"Linnainmaa, and Nikolaev",2015,Profitability,,,,,,,"Following Ball, Gerakos, Linnainmaa, and Nikolaev (2015), we measure operating profits-to-assets, Opa, as total revenue (Compustat annual item REVT) minus cost of goods sold (item COGS), minus selling, general, and administrative expenses (item XSGA), and plus research and develop- ment expenditures (item XRD, zero if missing), scaled by book assets (item AT, the denominator is current, not lagged, total assets). At the end of June of each year t, we sort stocks into deciles based on Opa for the fiscal year ending in calendar year t-1. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.4.17,"Ola q 1, Ola q 6, and Ola q 12",Quarterly Operating Profits-to-lagged Assets,,,Profitability,,,,,,,"Quarterly operating profits-to-lagged assets, Olaq, is quarterly total revenue (Compustat quarterly item REVTQ) minus cost of goods sold (item COGSQ), minus selling, general, and administra- tive expenses (item XSGAQ), plus research and development expenditures (item XRDQ, zero if missing), scaled by one-quarter-lagged book assets (item ATQ). At the beginning of each month t, we sort stocks into deciles based on Olaq for the fiscal quarter ending at least four months ago. 
Monthly decile returns are calculated for month t (Olaq1), from month t to t+5 (Olaq6), and from month t to t + 11 (Olaq12). The deciles are rebalanced at the beginning of t + 1. The holding period longer than one month as in Olaq6 means that for a given decile in each month there exist six subdeciles, each initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Olaq6 decile. For sufficient data coverage, the Olaq portfolios start in January 1976." +A.4.18,Cop,Cash-based Operating Profitability,"Gerakos, Linnainmaa, and Nikolaev",2016,Profitability,"data_rawa['cop'] = data_rawa['revt'] - data_rawa['cogs'] - data_rawa['xsga'] + data_rawa['xrd_0']\ + - data_rawa['d_rect_0'] - data_rawa['d_invt_0'] - data_rawa['d_xpp_0']\ + + data_rawa['d_dr_0'] + data_rawa['d_ap_0'] + data_rawa['d_xacc_0'] +data_rawa['cop'] = data_rawa['cop'] / data_rawa['at'] ",* Cla,0,1,0,0,"Following Ball, Gerakos, Linnainmaa, and Nikolaev (2016), we measure cash-based operating prof- itability, Cop, as total revenue (Compustat annual item REVT) minus cost of goods sold (item COGS), minus selling, general, and administrative expenses (item XSGA), plus research and de- velopment expenditures (item XRD, zero if missing), minus change in accounts receivable (item RECT), minus change in inventory (item INVT), minus change in prepaid expenses (item XPP), plus change in deferred revenue (item DRC plus item DRLT), plus change in trade accounts payable (item AP), and plus change in accrued expenses (item XACC), all scaled by book assets (item AT, the denominator is current, not lagged, total assets). All changes are annual changes in balance sheet items and we set missing changes to zero. At the end of June of each year t, we sort stocks into deciles based on Cop for the fiscal year ending in calendar year t - 1. 
Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.4.19,Cla,Cash-based Operating Profits-to-lagged Assets,,,Profitability,"data_rawa['cla'] = data_rawa['revt'] - data_rawa['cogs'] - data_rawa['xsga'] + data_rawa['xrd_0']\ + - data_rawa['d_rect_0'] - data_rawa['d_invt_0'] - data_rawa['d_xpp_0']\ + + data_rawa['d_dr_0'] + data_rawa['d_ap_0'] + data_rawa['d_xacc_0'] +data_rawa['cla'] = data_rawa['cla'] / data_rawa['at'].shift(1)","data_rawa['d_rect'] = data_rawa['rect'] - data_rawa['rect'].shift(1) +data_rawa['d_invt'] = data_rawa['invt'] - data_rawa['invt'].shift(1) +data_rawa['d_xpp'] = data_rawa['xpp'] - data_rawa['xpp'].shift(1) +data_rawa['d_dr'] = (data_rawa['drc']-data_rawa['drc'].shift(1)) + (data_rawa['drlt']-data_rawa['drlt'].shift(1)) +data_rawa['d_ap'] = data_rawa['ap'] - data_rawa['ap'].shift(1) +data_rawa['d_xacc'] = data_rawa['xacc'] - data_rawa['xacc'].shift(1) + +data_rawa['xrd_0'] = np.where(data_rawa['xrd'].isnull(), 0, data_rawa['xrd']) +data_rawa['d_rect_0'] = np.where(data_rawa['d_rect'].isnull(), 0, data_rawa['d_rect']) +data_rawa['d_invt_0'] = np.where(data_rawa['d_invt'].isnull(), 0, data_rawa['d_invt']) +data_rawa['d_xpp_0'] = np.where(data_rawa['d_xpp'].isnull(), 0, data_rawa['d_xpp']) +data_rawa['d_dr_0'] = np.where(data_rawa['d_dr'].isnull(), 0, data_rawa['d_dr']) +data_rawa['d_ap_0'] = np.where(data_rawa['d_ap'].isnull(), 0, data_rawa['d_ap']) +data_rawa['d_xacc_0'] = np.where(data_rawa['d_xacc'].isnull(), 0, data_rawa['d_xacc'])",0,1,0,0,"Cash-based operating profits-to-lagged assets, Cla, is total revenue (Compustat annual item REVT) minus cost of goods sold (item COGS), minus selling, general, and administrative expenses (item XSGA), plus research and development expenditures (item XRD, zero if missing), minus change in accounts receivable (item RECT), minus change in inventory (item INVT), minus change in prepaid expenses (item XPP), plus change in deferred 
revenue (item DRC plus item DRLT), plus change in trade accounts payable (item AP), and plus change in accrued expenses (item XACC), all scaled by one-year-lagged book assets (item AT). All changes are annual changes in balance sheet items and we set missing changes to zero. At the end of June of each year t, we sort stocks into deciles based on Cla for the fiscal year ending in calendar year t - 1. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.4.2,"dRoe1, dRoe6, and dRoe12",Changes in Return on Equity,"Hou, Xue, and Zhang",2015,Profitability,,,,,,,"Change in return on equity, dRoe, is return on equity minus its value from four quarters ago. See Appendix A.4.1 for the measurement of return on equity. At the beginning of each month t, we sort all stocks into deciles on their most recent past dRoe. Before 1972, we use the most recent dRoe with quarterly earnings from fiscal quarters ending at least four months ago. Starting from 1972, we use dRoe computed with quarterly earnings from the most recent quarterly earnings announcement dates (Compustat quarterly item RDQ). For a firm to enter the portfolio formation, we require the end of the fiscal quarter that corresponds to its most recent dRoe to be within six months prior to the portfolio formation. This restriction is imposed to exclude stale earnings information. To avoid potentially erroneous records, we also require the earnings announcement date to be after the corresponding fiscal quarter end. Monthly decile returns are calculated for the current month t (dRoe1), from month t to t + 5 (dRoe6), and from month t to t + 11 (dRoe12). The deciles are rebalanced monthly. The holding period that is longer than one month as in, for instance, dRoe6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six-month period. 
We take the simple average of the subdeciles returns as the monthly return of the dRoe6 decile." +A.4.20,Claq,Quarterly Cash-based Operating Profits-to-lagged Assets,,,Profitability,,,,,,,"Quarterly cash-based operating profits-to-lagged assets, Cla, is quarterly total revenue (Compustat quarterly item REVTQ) minus cost of goods sold (item COGSQ), minus selling, general, and ad- ministrative expenses (item XSGAQ), plus research and development expenditures (item XRDQ, zero if missing), minus change in accounts receivable (item RECTQ), minus change in inventory (item INVTQ), plus change in deferred revenue (item DRCQ plus item DRLTQ), and plus change in trade accounts payable (item APQ), all scaled by one-quarter-lagged book assets (item ATQ). All changes are quarterly changes in balance sheet items and we set missing changes to zero. At the beginning of each month t, we split stocks on Claq for the fiscal quarter ending at least four months ago. Monthly decile returns are calculated for month t (Claq1), from month t to t + 5 (Claq6), and from month t to t + 11 (Claq12). The deciles are rebalanced at the beginning of t + 1. The holding period longer than one month as in Claq6 means that for a given decile in each month there exist six subdeciles, each initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Claq6 decile. For sufficient data coverage, the Claq portfolios start in January 1976." +A.4.29,Tbi q 12,Quarterly Taxable Income-to-book Income,"Green, Hand, and Zhang",2013,Profitability,,,,,,,"Quarterly taxable income-to-book income, Tbiq, is quarterly pretax income (Compustat quarterly item PIQ) divided by net income (NIQ). At the beginning of each month t, we split stocks into deciles based on Tbiq calculated with accounting data from the fiscal quarter ending at least four months ago. We exclude firms with non-positive pretax income or net income. 
We calculate monthly decile returns for the current month t (Tbiq1), from month t to t + 5 (Tbiq6), and from month t to t+11 (Tbiq12). The deciles are rebalanced at the beginning of month t+1. The holding period that is longer than one month as in, for instance, Tbiq6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six-month period. We take the simple average of the subdecile returns as the monthly return of the Tbiq6 decile."
+A.4.3,Roa1,Return on Assets,"Balakrishnan, Bartov, and Faurel",2010,Profitability,data_rawq['roa'] = data_rawq['ibq']/data_rawq['atq_l1'],data_rawq['atq_l1'] = data_rawq.groupby(['permno'])['atq'].shift(1),1,0,1,0,"Return on assets, Roa, is income before extraordinary items (Compustat quarterly item IBQ) di- vided by one-quarter-lagged total assets (item ATQ). At the beginning of each month t, we sort all stocks into deciles based on Roa computed with quarterly earnings from the most recent earnings announcement dates (item RDQ). For a firm to enter the portfolio formation, we require the end of the fiscal quarter that corresponds to its most recent Roa to be within six months prior to the portfolio formation. This restriction is imposed to exclude stale earnings information. To avoid potentially erroneous records, we also require the earnings announcement date to be after the corre- sponding fiscal quarter end. Monthly decile returns are calculated for month t (Roa1), from month t to t+5 (Roa6), and from month t to t+11 (Roa12). The deciles are rebalanced at the beginning of t + 1. The holding period that is longer than one month as in, for instance, Roa6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six-month period. We take the simple average of the subdecile returns as the monthly return of the Roa6 decile. 
For sufficient data coverage, the Roa portfolios start in January 1972." +A.4.4,"dRoa1, dRoa6",Changes in Return on Assets,"Balakrishnan, Bartov, and Faurel",2010,Profitability,,,,,,,"Change in return on assets, dRoa, is return on assets minus its value from four quarters ago. See Appendix A.4.3 for the measurement of return on assets. At the beginning of each month t, we sort all stocks into deciles based on dRoa computed with quarterly earnings from the most recent earnings announcement dates (Compustat quarterly item RDQ). For a firm to enter the portfo- lio formation, we require the end of the fiscal quarter that corresponds to its most recent dRoa to be within six months prior to the portfolio formation. This restriction is imposed to exclude stale earnings information. To avoid potentially erroneous records, we also require the earnings announcement date to be after the corresponding fiscal quarter end. Monthly decile returns are calculated for month t (dRoa1), from month t to t + 5 (dRoa6), and from month t to t + 11 (dRoa12). The deciles are rebalanced at the beginning of t + 1. The holding period that is longer than one month as in, for instance, dRoa6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six-month period. We take the simple average of the subdecile returns as the monthly return of the dRoa6 decile. For sufficient data coverage, the dRoa portfolios start in January 1973." +A.4.5,Ato,Asset Turnover,Soliman,2008,Profitability,data_rawq['ato'] = data_rawq['saleq']/data_rawq['noa_l4'],"* noa +* noa_l4 from rna",1,0,1,0,"Soliman (2008) use DuPont analysis to decompose Roe as Rna + FLEV * SPREAD, in which Roe is return on equity, Rna is return on net operating assets, FLEV is financial leverage, and SPREAD is the difference between return on net operating assets and borrowing costs. 
We can further decompose Rna as Pm * Ato, in which Pm is profit margin and Ato is asset turnover. +Following Soliman (2008), we use annual sorts to form Rna, Pm, and Ato deciles. At the end of June of year t, we measure Rna as operating income after depreciation (Compustat annual item OIADP) for the fiscal year ending in calendar year t - 1 divided by net operating assets (Noa) for the fiscal year ending in t - 2. Noa is operating assets minus operating liabilities. Operating assets are total assets (item AT) minus cash and short-term investment (item CHE), and minus other investment and advances (item IVAO, zero if missing). Operating liabilities are total assets minus debt in current liabilities (item DLC, zero if missing), minus long-term debt (item DLTT, zero if missing), minus minority interests (item MIB, zero if missing), minus preferred stocks (item PSTK, zero if missing), and minus common equity (item CEQ). Pm is operating income after depreciation divided by sales (item SALE) for the fiscal year ending in calendar year t - 1. Ato is sales for the fiscal year ending in calendar year t - 1 divided by Noa for the fiscal year ending in t - 2. At the end of June of each year t, we sort stocks into three sets of deciles based on Rna, Pm, and Ato. We exclude firms with non-positive Noa for the fiscal year ending in calendar year t - 2 when forming the Rna and the Ato portfolios. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.4.5,pm,profit margin,Soliman,2008,Profitability,,,,,,,"Soliman (2008) use DuPont analysis to decompose Roe as Rna + FLEV * SPREAD, in which Roe is return on equity, Rna is return on net operating assets, FLEV is financial leverage, and SPREAD is the difference between return on net operating assets and borrowing costs. We can further decompose Rna as Pm * Ato, in which Pm is profit margin and Ato is asset turnover. 
+Following Soliman (2008), we use annual sorts to form Rna, Pm, and Ato deciles. At the end of June of year t, we measure Rna as operating income after depreciation (Compustat annual item OIADP) for the fiscal year ending in calendar year t - 1 divided by net operating assets (Noa) for the fiscal year ending in t - 2. Noa is operating assets minus operating liabilities. Operating assets are total assets (item AT) minus cash and short-term investment (item CHE), and minus other investment and advances (item IVAO, zero if missing). Operating liabilities are total assets minus debt in current liabilities (item DLC, zero if missing), minus long-term debt (item DLTT, zero if missing), minus minority interests (item MIB, zero if missing), minus preferred stocks (item PSTK, zero if missing), and minus common equity (item CEQ). Pm is operating income after depreciation divided by sales (item SALE) for the fiscal year ending in calendar year t - 1. Ato is sales for the fiscal year ending in calendar year t - 1 divided by Noa for the fiscal year ending in t - 2. At the end of June of each year t, we sort stocks into three sets of deciles based on Rna, Pm, and Ato. We exclude firms with non-positive Noa for the fiscal year ending in calendar year t - 2 when forming the Rna and the Ato portfolios. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.4.6,Cto,Capital Turnover,Haugen and Baker,1996,Profitability,data_rawa['cto'] = data_rawa['sale'] / data_rawa['at'].shift(1),,0,1,0,0,"At the end of June of each year t, we split stocks into deciles based on capital turnover, Cto, measured as sales (Compustat annual item SALE) for the fiscal year ending in calendar year t - 1 divided by total assets (item AT) for the fiscal year ending in t - 2. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." 
+A.4.7,"Rna q 1, Rna q 6, Atoq1","Quarterly Return on Net Operating Assets, Quarterly Asset Turnover",Soliman,2008,Profitability,data_rawq['rna'] = data_rawq['oiadpq']/data_rawq['noa_l4'],"* noa +data_rawq['noa_l4'] = data_rawq.groupby(['permno'])['noa'].shift(4)",1,0,1,0,"Quarterly return on net operating assets, Rnaq, is quarterly operating income after depreciation (Compustat quarterly item OIADPQ) divided by one-quarter-lagged net operating assets (Noa). Noa is operating assets minus operating liabilities. Operating assets are total assets (item ATQ) minus cash and short-term investments (item CHEQ), and minus other investment and advances (item IVAOQ, zero if missing). Operating liabilities are total assets minus debt in current liabilities (item DLCQ, zero if missing), minus long-term debt (item DLTTQ, zero if missing), minus minority interests (item MIBQ, zero if missing), minus preferred stocks (item PSTKQ, zero if missing), and minus common equity (item CEQQ). Quarterly profit margin, Pmq, is quarterly operating income after depreciation divided by quarterly sales (item SALEQ). Quarterly asset turnover, Atoq, is quarterly sales divided by one-quarter-lagged Noa. +At the beginning of each month t, we sort stocks into deciles based on Rnaq or Pmq for the latest fiscal quarter ending at least four months ago. Separately, we sort stocks into deciles based on Atoq computed with quarterly sales from the most recent earnings announcement dates (item RDQ). Sales are generally announced with earnings during quarterly earnings announcements (Je- gadeesh and Livnat 2006). For a firm to enter the portfolio formation, we require the end of the fiscal quarter that corresponds to its most recent Atoq to be within six months prior to the portfolio formation. This restriction is imposed to exclude stale information. To avoid potentially erroneous records, we also require the earnings announcement date to be after the corresponding fiscal quarter end. 
Monthly decile returns are calculated for month t (Rnaq1, Pmq1, and Atoq1), from month t to t+5 (Rnaq6, Pmq6, and Atoq6), and from month t to t+11 (Rnaq12, Pmq12, and Atoq12). The deciles are rebalanced at the beginning of t + 1. The holding period that is longer than one month as in, for instance, Atoq6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six-month period. We take the simple average of the subdecile returns as the monthly return of the Atoq6 decile. For sufficient data coverage, the Rnaq portfolios start in January 1976 and the Atoq portfolios start in January 1972. +" +A.4.8,"Cto q1, Cto q6,",Quarterly Capital Turnover,Haugen and Baker,1996,Profitability,,,,,,,"Quarterly capital turnover, Ctoq, is quarterly sales (Compustat quarterly item SALEQ) scaled by one-quarter-lagged total assets (item ATQ). At the beginning of each month t, we sort stocks into deciles based on Ctoq computed with quarterly sales from the most recent earnings announcement dates (item RDQ). Sales are generally announced with earnings during quarterly earnings announce- ments (Jegadeesh and Livnat 2006). For a firm to enter the portfolio formation, we require the end of the fiscal quarter that corresponds to its most recent Atoq to be within six months prior to the portfolio formation. This restriction is imposed to exclude stale information. To avoid potentially erroneous records, we also require the earnings announcement date to be after the corresponding fiscal quarter end. Monthly decile returns are calculated for month t (Ctoq1), from month t to t+5 (Ctoq6), and from month t to t + 11 (Ctoq12). The deciles are rebalanced at the beginning of t + 1. The holding period that is longer than one month as in, for instance, Ctoq6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six-month period. 
We take the simple average of the subdecile returns as the monthly return of the Ctoq6 decile. For sufficient data coverage, the Ctoq portfolios start in January 1972. +" +A.4.9,Gpa,Gross Profits-to-assets,Novy-Marx,2013,Profitability,,,,,,,"Following Novy-Marx (2013), we measure gross profits-to-assets, Gpa, as total revenue (Compustat annual item REVT) minus cost of goods sold (item COGS) divided by total assets (item AT, the denominator is current, not lagged, total assets). At the end of June of each year t, we sort stocks into deciles based on Gpa for the fiscal year ending in calendar year t - 1. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.5.1,Oca and Ioca,(Industry-adjusted) Organizational Capital-to-assets,Eisfeldt and Papanikolaou,2013,Intangibles,,,,,,,p101 +A.5.11,Rca,R&D Capital-to-assets,Li,2011,Intangibles,,,,,,,"Following Li (2011), we measure R&D capital, Rc, by accumulating annual R&D expenses over the past five years with a linear depreciation rate of 20%: +Rcit = XRDit + 0.8 XRDit-1 + 0.6 XRDit-2 + 0.4 XRDit-3 + 0.2 XRDit-4, (A18) +in which XRDit-j is firm i's R&D expenses (Compustat annual item XRD) in year t - j. R&D capital-to-assets, Rca, is Rc scaled by total assets (item AT). At the end of June of each year t, we sort stocks into deciles based on Rca for the fiscal year ending in calendar year t - 1. We keep only firms with positive Rc. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1. For portfolio formation at the end of June of year t, we require R&D expenses to be non-missing for the fiscal year ending in calendar year t - 1, because this value of R&D expenses receives the highest weight in Rc. Because Rc requires past five years of R&D expenses data and the accounting treatment of R&D expenses was standardized in 1975, the Rca portfolios start in July 1980." 
+A.5.2,Adm,Advertising Expense-to-market,"Chan, Lakonishok, and Sougiannis",2001,Intangibles,data_rawa['adm'] = data_rawa['xad']/data_rawa['me'],* me from rawa,1,1,0,0,"At the end of June of each year t, we sort stocks into deciles based on advertising expenses-to- market, Adm, which is advertising expenses (Compustat annual item XAD) for the fiscal year ending in calendar year t - 1 divided by the market equity (from CRSP) at the end of December of t - 1. For firms with more than one share class, we merge the market equity for all share classes before computing Adm. We keep only firms with positive advertising expenses. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1. Because sufficient XAD data start in 1972, the Adm portfolios start in July 1973." +A.5.24,Etr,Effective Tax Rate,Abarbanell and Bushee,1998,Intangibles,data_rawa['etr'] = (data_rawa['txtpi'] - (data_rawa['txtpi_l1'] + data_rawa['txtpi_l2'] + data_rawa['txtpi_l3'])/3) * data_rawa['deps'],"data_rawa['txtpi'] = data_rawa['txt'] / data_rawa['pi'] +data_rawa['txtpi_l1'] = data_rawa.groupby('permno')['txtpi'].shift(1) +data_rawa['txtpi_l2'] = data_rawa.groupby('permno')['txtpi'].shift(2) +data_rawa['txtpi_l3'] = data_rawa.groupby('permno')['txtpi'].shift(3) +data_rawa['deps'] = data_rawa['epspx']/(data_rawa['ajex'] * data_rawa['prcc_f'])",0,1,0,0,p108 +A.5.4,Rdm,R&D Expense-to-market,"Chan, Lakonishok, and Sougiannis",2001,Intangibles,data_rawq['rdm'] = data_rawq['xrdq4']/data_rawq['me'],"* me from rawq +# rd +data_rawq['xrdq4'] = ttm4('xrdq', data_rawq) +data_rawq['xrdq4'] = np.where(data_rawq['xrdq4'].isnull(), data_rawq['xrdy'], data_rawq['xrdq4'])",1,0,1,0,"At the end of June of each year t, we sort stocks into deciles based on R&D-to-market, Rdm, which is R&D expenses (Compustat annual item XRD) for the fiscal year ending in calendar year t - 1 divided by the market equity (from CRSP) at the end of December of t - 1. 
For firms with more than one share class, we merge the market equity for all share classes before computing Rdm. We keep only firms with positive R&D expenses. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1. Because the accounting treatment of R&D expenses was standardized in 1975, the Rdm portfolios start in July 1976." +A.5.46,"Alm q 1, Alm q 6, and Alm q 12",Quarterly Asset Liquidity,Ortiz-Molina and Phillips,2014,Intangibles,data_rawq['alm'] = data_rawq['ala']/(data_rawq['atq']+data_rawq['me']-data_rawq['ceqq']),"data_rawq['ala'] = data_rawq['cheq'] + 0.75*(data_rawq['actq']-data_rawq['cheq'])+\ + 0.5*(data_rawq['atq']-data_rawq['actq']-data_rawq['gdwlq']-data_rawq['intanq']) +* me from rawq",1,0,1,0,"We measure quarterly asset liquidity as cash + 0.75 * noncash current assets + 0.50 * tangible fixed assets. Cash is cash and short-term investments (Compustat quarterly item CHEQ). Noncash current assets is current assets (item ACTQ) minus cash. Tangible fixed assets is total assets (item ATQ) minus current assets (item ACTQ), minus goodwill (item GDWLQ, zero if missing), and minus intangibles (item INTANQ, zero if missing). Alaq is quarterly asset liquidity scaled by one- quarter-lagged total assets. Almq is quarterly asset liquidity scaled by one-quarter-lagged market value of assets. Market value of assets is total assets plus market equity (item PRCCQ times item CSHOQ) minus book equity (item CEQQ). +At the beginning of each month t, we sort stocks into deciles based on Alaq, and separately, on Almq for the fiscal quarter ending at least four months ago. Monthly decile returns are calculated for the current month t (Alaq1 and Almq1), from month t to t + 5 (Alaq6 and Almq6), and from month t to t+11 (Alaq12 and Almq12). The deciles are rebalanced at the beginning of month t+1. 
The holding period longer than one month as in Alaq6 means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Alaq6 decile. For sufficient data coverage, the quarterly asset liquidity portfolios start in January 1976. +" +A.5.5,"Rdm q 1, Rdm q 6, and Rdm q 12",Quarterly R&D Expense-to-market,"Chan, Lakonishok, and Sougiannis",2001,Intangibles,,,,,,,"At the beginning of each month t, we split stocks into deciles based on quarterly R&D-to-market, Rdmq, which is quarterly R&D expense (Compustat quarterly item XRDQ) for the fiscal quarter ending at least four months ago scaled by the market equity (from CRSP) at the end of t - 1. For firms with more than one share class, we merge the market equity for all share classes before computing Rdmq. We keep only firms with positive R&D expenses. We calculate decile returns for the current month t (Rdmq1), from month t to t + 5 (Rdmq6), and from month t to t + 11 (Rdmq12), and the deciles are rebalanced at the beginning of month t + 1. The holding period longer than one month as in, for instance, Rdmq6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Rdmq6 decile. Because the quarterly R&D data start in late 1989, the Rdmq portfolios start in January 1990." 
+A.5.50,"R a 1 , R n 1 , R a [2,5] , R n[2,5] , R a[6,10] , R n[6,10] , R a[11,15] , and R a[16,20]",Seasonality,Heston and Sadka,2008,Intangibles,,"* crsp_mom +#Rla +crsp_mom['rla'] = crsp_mom.groupby(['permno'])['ret'].shift(12) + +#Rln +lag = pd.DataFrame() +result = 0 +for i in range(1, 12): + lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) + result = result + lag['mom%s' % i] +crsp_mom['rln'] = result/11 + +#R[2,5]a +#R[2,5]n +lag = pd.DataFrame() +result = 0 +for i in range(13,61): + lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) + if i not in [24,36,48,60]: + result = result + lag['mom%s' % i] + +crsp_mom['r25a'] = (lag['mom24']+lag['mom36']+lag['mom48']+lag['mom60'])/4 +crsp_mom['r25n'] = result/44 + +#R[6,10]a +#R[6,10]n +lag = pd.DataFrame() +result = 0 +for i in range(61,121): + lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) + if i not in [72,84,96,108,120]: + result = result + lag['mom%s' % i] + +crsp_mom['r610a'] = (lag['mom72']+lag['mom84']+lag['mom96']+lag['mom108']+lag['mom120'])/5 +crsp_mom['r610n'] = result/55 + +#R[11,15]a +lag = pd.DataFrame() +result = 0 +for i in [132,144,156,168,180]: + lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) + result = result + lag['mom%s' % i] +crsp_mom['r1115a'] = result/5 + +#R[16,20]a +lag = pd.DataFrame() +result = 0 +for i in [192,204,216,228,240]: + lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) + result = result + lag['mom%s' % i] +crsp_mom['r1620a'] = result/5",1,0,0,0,"Following Heston and Sadka (2008), at the beginning of each month t, we sort stocks into deciles +based on various measures of past performance, including returns in month t - 12 (Ra1), average +returns from month t - 11 to t - 1 (Rn1), average returns across months t - 24,t - 36,t - 48, and +t - 60 (R[2,5]), average returns from month t - 60 to t - 13 except for lags 24, 36, 48, and 60 (R[2,5]), an +average returns across months t - 72, t - 84, t - 96, t - 
108, and t - 120 (R[6,10]a), average returns
+from month t - 120 to t - 61 except for lags 72, 84, 96, 108, and 120 (R[6,10]n), average returns across
+months t - 132, t - 144, t - 156, t - 168, and t - 180 (R[11,15]a), average returns from month t - 180
+to t - 121 except for lags 132, 144, 156, 168, and 180 (R[11,15]n), average returns across months
+t-192,t-204,t-216,t-228, and t-240 (R[16,20]a), average returns from month t-240 to t-181
+except for lags 192, 204, 216, 228, and 240 (R[16,20]n). Monthly decile returns are calculated for the
+current month t, and the deciles are rebalanced at the beginning of month t + 1."
+A.5.6,Rds q 6 and Rds q 12,Quarterly R&D Expense-to-sales,"Chan, Lakonishok, and Sougiannis",2001,Intangibles,data_rawq['rds'] = data_rawq['xrdq4']/data_rawq['saleq'],* xrdq4 from rdm,0,0,1,0,"At the end of June of each year t, we sort stocks into deciles based on R&D-to-sales, Rds, which is R&D expenses (Compustat annual item XRD) divided by sales (item SALE) for the fiscal year ending in calendar year t - 1. We keep only firms with positive R&D expenses. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1. Because the accounting treatment of R&D expenses was standardized in 1975, the Rds portfolios start in July 1976."
+A.5.8,Ol,Operating Leverage,Novy-Marx,2011,Intangibles,data_rawa['ol'] = (data_rawa['cogs'] + data_rawa['xsga'])/data_rawa['at'],,0,1,0,0,"Following Novy-Marx (2011), operating leverage, Ol, is operating costs scaled by total assets (Compustat annual item AT, the denominator is current, not lagged, total assets). Operating costs are cost of goods sold (item COGS) plus selling, general, and administrative expenses (item XSGA). At the end of June of year t, we sort stocks into deciles based on Ol for the fiscal year ending in calendar year t - 1. 
Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.5.9,"Ol q 1, Ol q 6, and Ol q 12",Quarterly Operating Leverage,Novy-Marx,2011,Intangibles,data_rawq['olq'] = (data_rawq['cogsq'] + data_rawq['xsgaq'])/data_rawq['atq'],,0,0,1,0,"At the beginning of each month t, we split stocks into deciles based on quarterly operating leverage, Olq, which is quarterly operating costs divided by assets (Compustat quarterly item ATQ) for the fiscal quarter ending at least four months ago. Operating costs are the cost of goods sold (item COGSQ) plus selling, general, and administrative expenses (item XSGAQ). We calculate decile returns for the current month t (Olq1), from month t to t + 5 (Olq6), and from month t to t + 11 (Olq12), and the deciles are rebalanced at the beginning of month t + 1. The holding period longer than one month as in, for instance, Olq6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Olq6 decile. For sufficient data coverage, the Olq portfolios start in January 1972." 
+A.6.1,Me,the market equity,Banz,1981,Frictions,"# rawq['me'] +crsp['me'] = crsp['prc'].abs() * crsp['shrout'] # calculate market equity +crsp['me'] = np.where(crsp['permno'] == crsp['permno'].shift(1), crsp['me'].fillna(method='ffill'), crsp['me']) +data_rawq['me'] = data_rawq['me']/1000 # CRSP ME +data_rawq['me'] = np.where(data_rawq['me'] == 0, np.nan, data_rawq['me']) +data_rawq = data_rawq.dropna(subset=['me'])","#rawa['me'] +crsp['me'] = crsp['prc'].abs() * crsp['shrout'] # calculate market equity +crsp['me'] = np.where(crsp['permno'] == crsp['permno'].shift(1), crsp['me'].fillna(method='ffill'), crsp['me']) +data_rawa['me'] = data_rawa['me']/1000 # CRSP ME +# there are some ME equal to zero since this company do not have price or shares data, we drop these observations +data_rawa['me'] = np.where(data_rawa['me'] == 0, np.nan, data_rawa['me']) +data_rawa = data_rawa.dropna(subset=['me']) +# rawq['me'] +crsp['me'] = crsp['prc'].abs() * crsp['shrout'] # calculate market equity +crsp['me'] = np.where(crsp['permno'] == crsp['permno'].shift(1), crsp['me'].fillna(method='ffill'), crsp['me']) +data_rawq['me'] = data_rawq['me']/1000 # CRSP ME +# there are some ME equal to zero since this company do not have price or shares data, we drop these observations +data_rawq['me'] = np.where(data_rawq['me'] == 0, np.nan, data_rawq['me']) +data_rawq = data_rawq.dropna(subset=['me'])",1,1,1,0,"Market equity, Me, is price times shares outstanding from CRSP. At the end of June of each year t, we sort stocks into deciles based on the June-end Me. Monthly decile returns are calculated from July of year t to June of t+1, and the deciles are rebalanced in June of t+1." +A.6.13,Dtv12,"dollar trading volume, 12-month holding period","Brennan, Chordia, and Subrahmanyam",1998,Frictions,,,,,,,"At the beginning of each month t, we sort stocks into deciles based on their average daily dollar trading volume, Dtv, over the prior six months from t-6 to t-1. 
We require a minimum of 50 daily observations. Dollar trading volume is share price times the number of shares traded. We adjust the trading volume of NASDAQ stocks per Gao and Ritter (2010) (see footnote 7). Monthly decile returns are calculated for the current month t (Dtv1), from month t to t+5 (Dtv6), and from month t to t + 11 (Dtv12), and the deciles are rebalanced at the beginning of month t + 1. The holding period longer than one month as in, for instance, Dtv6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Dtv6 decile." +A.6.21,Isff1,"idiosyncratic skewness estimated from the Fama-French 3-factor model, 1-month holding period",Harvey and Siddique,2000,Frictions,,,,,,,"At the beginning of each month t, we sort stocks into deciles based on idiosyncratic skewness, Isff, calculated as the skewness of the residuals from regressing a stock's excess return on the Fama- French three factors using daily observations from month t - 1. We require a minimum of 15 daily returns. Monthly decile returns are calculated for the current month t (Isff1), from month t to t + 5 (Isff6), and from month t to t + 11 (Isff12), and the deciles are rebalanced at the beginning of month t + 1. The holding period longer than one month as in Isff6 means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Isff6 decile." 
+A.6.22,Isq1,"idiosyncratic skewness estimated from the q-factor model, 1-month holding period",Harvey and Siddique,2000,Frictions,,,,,,,"At the beginning of each month t, we sort stocks into deciles based on idiosyncratic skewness, Isq, calculated as the skewness of the residuals from regressing a stock's excess return on the q-factors using daily observations from month t - 1. We require a minimum of 15 daily returns. Monthly decile returns are calculated for the current month t (Isq1), from month t to t + 5 (Isq6), and from month t to t + 11 (Isq12), and the deciles are rebalanced at the beginning of month t + 1. The holding period longer than one month as in Isq6 means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Isq6 decile. Because the q-factors start in January 1967, the Isq portfolios start in February 1967."
+A.6.24,Srev,short-term reversal,Jegadeesh,1990,Frictions,,,,,,,"At the beginning of each month t, we sort stocks into short-term reversal (Srev) deciles based on the return in month t - 1. To be included in a decile in month t, a stock must have a valid price at the end of month t - 2 and a valid return for month t - 1. Monthly decile returns are calculated for the current month t, and the deciles are rebalanced at the beginning of month t + 1."
+A.6.3,Ivff1,"idiosyncratic volatility estimated from the Fama-French 3-factor model, 1-month holding period","Ang, Hodrick, Xing, and Zhang",2006,Frictions,,,,,,,"Following Ang, Hodrick, Xing, and Zhang (2006), we calculate idiosyncratic volatility relative to the Fama-French three-factor model, Ivff, as the residual volatility from regressing a stock's excess returns on the Fama-French three factors. At the beginning of each month t, we sort stocks into deciles based on the Ivff estimated with daily returns from month t - 1. 
We require a minimum of 15 daily returns. Monthly decile returns are calculated for the current month t (Ivff1), from month t to t+5 (Ivff6), and from month t to t+11 (Ivff12), and the deciles are rebalanced at the beginning of month t + 1. The holding period that is longer than one month as in, for instance, Ivff6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six-month period. We take the simple average of the subdecile returns as the monthly return of the Ivff6 decile." +A.6.5,Ivq1,"idiosyncratic volatility estimated from the q-factor model, 1-month holding period","Ang, Hodrick, Xing, and Zhang",2006,Frictions,,,,,,,"We calculate idiosyncratic volatility per the q-factor model, Ivq, as the residual volatility from regressing a stock's excess returns on the q-factors. At the beginning of each month t, we sort stocks into deciles based on the Ivq estimated with daily returns from month t - 1. We require a minimum of 15 daily returns. Monthly decile returns are calculated for the current month t (Ivq1), from month t to t + 5 (Ivq6), and from month t to t + 11 (Ivq12), and the deciles are rebalanced at the beginning of month t + 1. The holding period that is longer than one month as in, for instance, Ivq6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six-month period. We take the simple average of the subdecile returns as the monthly return of the Ivq6 decile. Because the q-factors start in January 1967, the Ivq portfolios start in February 1967." +A.6.6,Tv1,"total volatility, 1-month holding period","Ang, Hodrick, Xing, and Zhang",2006,Frictions,,,,,,,"Following Ang, Hodrick, Xing, and Zhang (2006), at the beginning of each month t, we sort stocks into deciles based on total volatility, Tv, estimated as the volatility of a stock's daily returns from month t - 1. 
We require a minimum of 15 daily returns. Monthly decile returns are calculated for the current month t (Tv1), from month t to t + 5 (Tv6), and from month t to t + 11 (Tv12), and the deciles are rebalanced at the beginning of month t + 1. The holding period that is longer than one month as in, for instance, Tv6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six-month period. We take the simple average of the subdecile returns as the monthly return of the Tv6 decile."
+A.6.7,Sv1,"systematic volatility, 1-month holding period","Ang, Hodrick, Xing, and Zhang",2006,Frictions,,,,,,,p119
+A.6.8,Beta1,Market Beta,Fama and MacBeth,1973,Frictions,,,,,,,p119
+,agr,Asset growth,"Cooper, Gulen & Schill",2008,,data_rawq['agr'] = (data_rawq['atq']-data_rawq['atq_l4'])/data_rawq['atq_l4'],data_rawq['atq_l4'] = data_rawq.groupby(['permno'])['atq'].shift(4),1,0,1,0,Annual percent change in total assets (at).
+,baspread,Bid-ask spread rolling 3m,Amihud & Mendelson,1989,,,,,,,,Monthly average of daily bid-ask spread divided by average of daily spread.
+,beta,Beta rolling 3m,Fama & MacBeth,1973,,,,,,,,Estimated market beta from weekly returns and equal weighted market returns for 3 years ending month t-1 with at least 52 weeks of returns.
+,bm_ia,Industry-adjusted book to market,"Asness, Porter & Stevens",2000,,data_rawa['bm_ia'] = data_rawa['bm']/data_rawa['bm_ind'],"df_temp = data_rawa.groupby(['datadate', 'ffi49'], as_index=False)['bm'].mean()
+df_temp = df_temp.rename(columns={'bm': 'bm_ind'})
+data_rawa = pd.merge(data_rawa, df_temp, how='left', on=['datadate', 'ffi49'])",0,1,0,0,Industry adjusted book-to-market ratio.
+,cash,Cash holdings,Palazzo,2012,,data_rawq['cash'] = data_rawq['cheq']/data_rawq['atq'],,0,0,1,0,Cash and cash equivalents divided by average total assets. 
+,cashdebt,Cash flow to debt,Ou & Penman,1989,,"data_rawq['cashdebt'] = (ttm4('ibq', data_rawq) + ttm4('dpq', data_rawq))/((data_rawq['ltq']+data_rawq['ltq_l4'])/2)",data_rawq['ltq_l4'] = data_rawq.groupby(['permno'])['ltq'].shift(4),1,0,1,0,Earnings before depreciation and extraordinary items (ib+dp) divided by avg. total liabilities (lt). +,chcsho,Change in shares outstanding,Pontiff & Woodgate,2008,,data_rawq['chcsho'] = (data_rawq['cshoq']/data_rawq['cshoq_l4'])-1,data_rawq['cshoq_l4'] = data_rawq.groupby(['permno'])['cshoq'].shift(4),1,0,1,0,Annual percent change in shares outstanding (csho). +,chpm(chpmia),Industry-adjusted change in profit margin,Soliman,2008,,data_rawq['chpm'] = (data_rawq['ibq4']/data_rawq['saleq4'])-(data_rawq['ibq4_l1']/data_rawq['saleq4_l1']),"data_rawq['ibq4'] = ttm4('ibq', data_rawq) +data_rawq['saleq4'] = ttm4('saleq', data_rawq) +data_rawq['saleq4'] = np.where(data_rawq['saleq4'].isnull(), data_rawq['saley'], data_rawq['saleq4']) +data_rawq['ibq4_l1'] = data_rawq.groupby(['permno'])['ibq4'].shift(1) +data_rawq['saleq4_l1'] = data_rawq.groupby(['permno'])['saleq4'].shift(1)",1,0,1,0,2-digit SIC - fiscal-year mean adjusted change in income before extraordinary items (ib) divided by sales (sale). +,chtx,Change in tax expense,Thomas & Zhang,2011,,data_rawq['chtx'] = (data_rawq['txtq']-data_rawq['txtq_l4'])/data_rawq['atq_l4'],"data_rawq['txtq_l4'] = data_rawq.groupby(['permno'])['txtq'].shift(4) +data_rawq['atq_l4'] = data_rawq.groupby(['permno'])['atq'].shift(4)",1,0,1,0,Percent change in total taxes (txtq) from quarter t-4 to t. 
+,cinvest,Corporate investment,"Titman, Wei & Xie",2004,,"* data_rawq['cinvest'] = ((data_rawq['ppentq'] - data_rawq['ppentq_l1']) / data_rawq['saleq'])\ + -(data_rawq[['c_temp1', 'c_temp2', 'c_temp3']].mean(axis=1))","data_rawq['ppentq_l1'] = data_rawq.groupby(['permno'])['ppentq'].shift(1) +data_rawq['ppentq_l2'] = data_rawq.groupby(['permno'])['ppentq'].shift(2) +data_rawq['ppentq_l3'] = data_rawq.groupby(['permno'])['ppentq'].shift(3) +data_rawq['ppentq_l4'] = data_rawq.groupby(['permno'])['ppentq'].shift(4) +data_rawq['saleq_l1'] = data_rawq.groupby(['permno'])['saleq'].shift(1) +data_rawq['saleq_l2'] = data_rawq.groupby(['permno'])['saleq'].shift(2) +data_rawq['saleq_l3'] = data_rawq.groupby(['permno'])['saleq'].shift(3) + +data_rawq['c_temp1'] = (data_rawq['ppentq_l1'] - data_rawq['ppentq_l2']) / data_rawq['saleq_l1'] +data_rawq['c_temp2'] = (data_rawq['ppentq_l2'] - data_rawq['ppentq_l3']) / data_rawq['saleq_l2'] +data_rawq['c_temp3'] = (data_rawq['ppentq_l3'] - data_rawq['ppentq_l4']) / data_rawq['saleq_l3'] + +* main formula + +data_rawq['c_temp1'] = (data_rawq['ppentq_l1'] - data_rawq['ppentq_l2']) / 0.01 +data_rawq['c_temp2'] = (data_rawq['ppentq_l2'] - data_rawq['ppentq_l3']) / 0.01 +data_rawq['c_temp3'] = (data_rawq['ppentq_l3'] - data_rawq['ppentq_l4']) / 0.01 + +data_rawq['cinvest'] = np.where(data_rawq['saleq']<=0, ((data_rawq['ppentq'] - data_rawq['ppentq_l1']) / 0.01) + -(data_rawq[['c_temp1', 'c_temp2', 'c_temp3']].mean(axis=1)), data_rawq['cinvest']) + +data_rawq = data_rawq.drop(['c_temp1', 'c_temp2', 'c_temp3'], axis=1)",1,0,1,0,"Change over one quarter in net PP&E (ppentq) divided by sales (saleq) - average of this variable for prior 3 quarters; if saleq = 0, then scale by 0.01." +,depr,Depreciation / PP&E,Holthausen & Larcker,1992,,"data_rawq['depr'] = ttm4('dpq', data_rawq)/data_rawq['ppentq']",,0,0,1,0,Depreciation divided by PP&E. 
+,dolvol,Dollar trading volume,"Chordia, Subrahmanyam & Anshuman",2001,,"crsp_mom['dolvol'] = np.log(crsp_mom['vol_l2']*crsp_mom['prc_l2']).replace([np.inf, -np.inf], np.nan)",,1,0,0,0,Natural log of trading volume times price per share from month t-2. +,gma,Gross profitability,Novy-Marx,2013,,data_rawq['gma'] = (data_rawq['revtq4']-data_rawq['cogsq4'])/data_rawq['atq_l4'],"data_rawq['revtq4'] = ttm4('revtq', data_rawq) +data_rawq['cogsq4'] = ttm4('cogsq', data_rawq) +data_rawq['atq_l4'] = data_rawq.groupby(['permno'])['atq'].shift(4)",1,0,1,0,Revenues (revt) minus cost of goods sold (cogs) divided by lagged total assets (at). +,grltnoa,Growth in long-term net operating assets,"Fairfield, Whisenant & Yohn",2003,,"data_rawq['grltnoa'] = ((data_rawq['rectq']+data_rawq['invtq']+data_rawq['ppentq']+data_rawq['acoq']+data_rawq['intanq']+ + data_rawq['aoq']-data_rawq['apq']-data_rawq['lcoq']-data_rawq['loq'])- + (data_rawq['rectq_l4']+data_rawq['invtq_l4']+data_rawq['ppentq_l4']+data_rawq['acoq_l4']-data_rawq['apq_l4']-data_rawq['lcoq_l4']-data_rawq['loq_l4'])-\ + (data_rawq['rectq']-data_rawq['rectq_l4']+data_rawq['invtq']-data_rawq['invtq_l4']+data_rawq['acoq']- + (data_rawq['apq']-data_rawq['apq_l4']+data_rawq['lcoq']-data_rawq['lcoq_l4'])- + ttm4('dpq', data_rawq)))/((data_rawq['atq']+data_rawq['atq_l4'])/2)","data_rawq['rectq_l4'] = data_rawq.groupby(['permno'])['rectq'].shift(4) +data_rawq['acoq_l4'] = data_rawq.groupby(['permno'])['acoq'].shift(4) +data_rawq['apq_l4'] = data_rawq.groupby(['permno'])['apq'].shift(4) +data_rawq['lcoq_l4'] = data_rawq.groupby(['permno'])['lcoq'].shift(4) +data_rawq['loq_l4'] = data_rawq.groupby(['permno'])['loq'].shift(4) +data_rawq['invtq_l4'] = data_rawq.groupby(['permno'])['invtq'].shift(4) +data_rawq['ppentq_l4'] = data_rawq.groupby(['permno'])['ppentq'].shift(4) +data_rawq['atq_l4'] = data_rawq.groupby(['permno'])['atq'].shift(4)",1,0,1,0,Growth in long term net operating assets. 
+,herf,Industry sales concentration,Hou & Robinson,2006,,data_rawa['herf'] = (data_rawa['sale']/data_rawa['indsale'])*(data_rawa['sale']/data_rawa['indsale']),"data_rawa['sic'] = data_rawa['sic'].astype(int) +data_rawa['ffi49'] = ffi49(data_rawa) +data_rawa['ffi49'] = data_rawa['ffi49'].fillna(49) +data_rawa['ffi49'] = data_rawa['ffi49'].astype(int) +df_temp = data_rawa.groupby(['datadate', 'ffi49'], as_index=False)['sale'].sum() +df_temp = df_temp.rename(columns={'sale': 'indsale'}) +data_rawa = pd.merge(data_rawa, df_temp, how='left', on=['datadate', 'ffi49']) +* main formula +df_temp = data_rawa.groupby(['datadate', 'ffi49'], as_index=False)['herf'].sum() +data_rawa = data_rawa.drop(['herf'], axis=1) +data_rawa = pd.merge(data_rawa, df_temp, how='left', on=['datadate', 'ffi49'])",0,1,0,0,2-digit SIC - fiscal-year sales concentration (sum of squared percent of sales in industry for each company). +,hire,Employee growth rate,"Bazdresch, Belo & Lin",2014,,"data_rawa['hire'] = (data_rawa['emp'] - data_rawa['emp_l1'])/data_rawa['emp_l1'] +data_rawa['hire'] = np.where((data_rawa['emp'].isnull()) | (data_rawa['emp_l1'].isnull()), 0, data_rawa['hire'])",data_rawa['emp_l1'] = data_rawa.groupby(['permno'])['emp'].shift(1),1,1,0,0,Percent change in number of employees (emp). +,ill,Illiquidity rolling 3m,Amihud,2002,,,,,,,,Average of daily (absolute return / dollar volume). +,lev,Leverage,Bhandari,1988,,data_rawq['lev'] = data_rawq['ltq']/data_rawq['me'],* me from rawq,0,0,1,0,Total liabilities (lt) divided by fiscal year end market capitalization. +,lgr,Growth in long-term debt,"Richardson, Sloan, Soliman & Tuna",2005,,data_rawq['lgr'] = (data_rawq['ltq']/data_rawq['ltq_l4'])-1,data_rawq['ltq_l4'] = data_rawq.groupby(['permno'])['ltq'].shift(4),1,0,1,0,Annual percent change in total liabilities (lt). +,maxret,Maximum daily returns rolling 3m,"Bali, Cakici & Whitelaw",2011,,,,,,,,Maximum daily return from returns during calendar month t-1. 
+,me_ia(mve_ia),Industry-adjusted size,"Asness, Porter & Stevens",2000,,data_rawa['me_ia'] = data_rawa['me']/data_rawa['me_ind'],"* me from rawa +df_temp = data_rawa.groupby(['datadate', 'ffi49'], as_index=False)['me'].mean() +df_temp = df_temp.rename(columns={'me': 'me_ind'}) +data_rawa = pd.merge(data_rawa, df_temp, how='left', on=['datadate', 'ffi49'])",1,1,0,0,2-digit SIC industry-adjusted fiscal year-end market capitalization. +,mom12m,Momentum rolling 12m,Jegadeesh,1990,,"crsp_mom['mom12m'] = mom(1, 12, crsp_mom)",* crsp_mom,1,0,0,0,11-month cumulative returns ending one month before month end. +,mom1m,Momentum ,Jegadeesh & Titman,1993,,crsp_mom['mom1m'] = crsp_mom['ret'],* crsp_mom,1,0,0,0,1-month cumulative return. +,mom36m,Momentum rolling 36m,Jegadeesh & Titman,1993,,"crsp_mom['mom36m'] = mom(1, 36, crsp_mom)",* crsp_mom,1,0,0,0,Cumulative returns from months t-36 to t-13. +,mom60m,Momentum rolling 60m,Jegadeesh & Titman,1993,,"crsp_mom['mom60m'] = mom(12, 60, crsp_mom)",* crsp_mom,1,0,0,0, +,mom6m,Momentum rolling 6m,Jegadeesh & Titman,1993,,"crsp_mom['mom6m'] = mom(1, 6, crsp_mom)",* crsp_mom,1,0,0,0,5-month cumulative returns ending one month before month end. 
+,nincr,Number of earnings increases,"Barth, Elliott & Finn",1999,,"data_rawq['nincr'] = (data_rawq['nincr_temp1'] + + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']) + + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']) + + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']) + + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']*data_rawq['nincr_temp5']) + + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']*data_rawq['nincr_temp5']*data_rawq['nincr_temp6']) + + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']*data_rawq['nincr_temp5']*data_rawq['nincr_temp6']*data_rawq['nincr_temp7']) + + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']*data_rawq['nincr_temp5']*data_rawq['nincr_temp6']*data_rawq['nincr_temp7']*data_rawq['nincr_temp8']))","data_rawq['ibq_l1'] = data_rawq.groupby(['permno'])['ibq'].shift(1) +data_rawq['ibq_l2'] = data_rawq.groupby(['permno'])['ibq'].shift(2) +data_rawq['ibq_l3'] = data_rawq.groupby(['permno'])['ibq'].shift(3) +data_rawq['ibq_l4'] = data_rawq.groupby(['permno'])['ibq'].shift(4) +data_rawq['ibq_l5'] = data_rawq.groupby(['permno'])['ibq'].shift(5) +data_rawq['ibq_l6'] = data_rawq.groupby(['permno'])['ibq'].shift(6) +data_rawq['ibq_l7'] = data_rawq.groupby(['permno'])['ibq'].shift(7) +data_rawq['ibq_l8'] = data_rawq.groupby(['permno'])['ibq'].shift(8) + +data_rawq['nincr_temp1'] = np.where(data_rawq['ibq'] > data_rawq['ibq_l1'], 1, 0) +data_rawq['nincr_temp2'] = np.where(data_rawq['ibq_l1'] > data_rawq['ibq_l2'], 1, 0) +data_rawq['nincr_temp3'] = np.where(data_rawq['ibq_l2'] > data_rawq['ibq_l3'], 1, 0) +data_rawq['nincr_temp4'] = np.where(data_rawq['ibq_l3'] > data_rawq['ibq_l4'], 1, 0) +data_rawq['nincr_temp5'] = np.where(data_rawq['ibq_l4'] > 
data_rawq['ibq_l5'], 1, 0) +data_rawq['nincr_temp6'] = np.where(data_rawq['ibq_l5'] > data_rawq['ibq_l6'], 1, 0) +data_rawq['nincr_temp7'] = np.where(data_rawq['ibq_l6'] > data_rawq['ibq_l7'], 1, 0) +data_rawq['nincr_temp8'] = np.where(data_rawq['ibq_l7'] > data_rawq['ibq_l8'], 1, 0) + +*main formula + +data_rawq = data_rawq.drop(['ibq_l1', 'ibq_l2', 'ibq_l3', 'ibq_l4', 'ibq_l5', 'ibq_l6', 'ibq_l7', 'ibq_l8', 'nincr_temp1', + 'nincr_temp2', 'nincr_temp3', 'nincr_temp4', 'nincr_temp5', 'nincr_temp6', 'nincr_temp7', + 'nincr_temp8'], axis=1)",1,0,1,0,Number of consecutive quarters (up to eight quarters) with an increase in earnings (ibq) over same quarter in the prior year. +,op(operprof),Operating profitability,Fama and French,2015,,,,,,,, +,pscore(ps),Performance Score,Piotroski,2000,,"data_rawa['ps'] = np.where(data_rawa['pstkrv'].isnull(), data_rawa['pstkl'], data_rawa['pstkrv']) +data_rawa['ps'] = np.where(data_rawa['ps'].isnull(), data_rawa['pstk'], data_rawa['ps']) +data_rawa['ps'] = np.where(data_rawa['ps'].isnull(), 0, data_rawa['ps'])",#(pstkrv prior to pstkl prior to pstk),0,1,0,0,Sum of 9 indicator variables to form fundamental health score. +,rd_sale,R&D to sales,"Guo, Lev & Shi",2006,,data_rawq['rd_sale'] = data_rawq['xrdq4']/data_rawq['saleq4'],"data_rawq['xrdq4'] = ttm4('xrdq', data_rawq) +data_rawq['xrdq4'] = np.where(data_rawq['xrdq4'].isnull(), data_rawq['xrdy'], data_rawq['xrdq4']) +data_rawq['saleq4'] = ttm4('saleq', data_rawq) +data_rawq['saleq4'] = np.where(data_rawq['saleq4'].isnull(), data_rawq['saley'], data_rawq['saleq4'])",0,0,1,0,R&D expense divided by sales (xrd/sale). 
+,re,Revisions in analysts’ earnings forecasts,"Chan, Jegadeesh, and Lakonishok",1996,,,,,,,, +,rsup,Revenue surprise,Kama,2009,,data_rawq['rsup'] = (data_rawq['saleq'] - data_rawq['saleq_l4'])/data_rawq['me'],data_rawq['saleq_l4'] = data_rawq.groupby(['permno'])['saleq'].shift(4),1,0,1,0,Sales from quarter t minus sales from quarter t-4 (saleq) divided by fiscal-quarter- end market capitalization (cshoq * prccq). +,rvar_capm,Residual variance - CAPM rolling 3m,Daily Stock residual variance of CAPM,,,,,,,,, +,rvar_ff3,Residual variance - ff3 rolling 3m,Daily Stock residual variance of Fama French 3 factors,,,,,,,,, +,rvar_mean,return variance rolling 3m,Daily Stock return variance,,,,,,,,, +,sgr,Sales growth,"Lakonishok, Shleifer & Vishny",1994,,data_rawq['sgr'] = (data_rawq['saleq4']/data_rawq['saleq4_l4'])-1,"data_rawq['saleq4'] = ttm4('saleq', data_rawq) +data_rawq['saleq4'] = np.where(data_rawq['saleq4'].isnull(), data_rawq['saley'], data_rawq['saleq4']) + +data_rawq['saleq4_l4'] = data_rawq.groupby(['permno'])['saleq4'].shift(4)",1,0,1,0,Annual percent change in sales (sale). +,std_dolvol,Std of dollar trading volume rolling 3m,"Chordia, Subrahmanyam & Anshuman",2001,,,,,,,,Monthly standard deviation of daily dollar trading volume. +,std_turn,Std. of Share turnover rolling 3m,"Chordia, Subrahmanyam, &Anshuman",2001,,,,,,,,Monthly standard deviation of daily share turnover. +,sue,Unexpected quarterly earnings,"Rendelman, Jones & Latane",1982,,,,,,,,"Unexpected quarterly earnings divided by fiscal-quarter-end market cap. Unexpected earnings is I/B/E/S actual earnings minus median forecasted earnings if available, else it is the seasonally differenced quarterly earnings before extraordinary items from Compustat quarterly file." +,turn,Shares turnover,"Datar, Naik & Radcliffe",1998,,,,,,,,Average monthly trading volume for most recent 3 months scaled by number of shares outstanding in current month. 
+,zerotrade,Number of zero-trading days rolling 3m,Liu,2006,,,,,,,,Turnover weighted number of zero trading days for most recent 1 month. diff --git a/Chars_Description_1011.csv b/Chars_Description_1011.csv deleted file mode 100755 index 18af30e..0000000 --- a/Chars_Description_1011.csv +++ /dev/null @@ -1,125 +0,0 @@ -Num,Acronym,Description,Author,Pub Year,Category -A.2.1,Bm,Book-to-market equity,"Rosenberg, Reid, and Lanstein ",1985,Value-versus-growth -A.2.2,Bmj,Book-to-June-end market equity, Asness and Frazzini ,2013,Value-versus-growth -A.2.3,Bmq12,Quarterly Book-to-market Equity (12-month holding period),"Rosenberg, Reid, and Lanstein ",1985,Value-versus-growth -A.2.9,Ep,Earnings-to-price,Basu,1983,Value-versus-growth -A.2.10,"Ep q 1, Ep q 6, and Ep q 12","Q Quarterly Earnings-to-price(1-month holding period), uarterly Earnings-to-price(6-month holding period), Quarterly Earnings-to-price(12-month holding period), ",Basu,1983,Value-versus-growth -A.2.12,Cp,Cash flow-to-price,"Lakonishok, Shleifer, and Vishny ",1994,Value-versus-growth -A.2.13,"Cpq1, Cpq6, Cpq12","Quarterly Cash Flow-to-price (1-month holding period), Quarterly Cash Flow-to-price (6-month holding period), Quarterly Cash Flow-to-price (12-month holding period)","Lakonishok, Shleifer, and Vishny",1994,Value-versus-growth -A.2.14,Dp(dy),Dividend yield,Litzenberger and Ramaswamy,1979,Value-versus-growth -A.2.16,Op and Nop, (Net) Payout Yield,"Richardson, and Roberts",2007,Value-versus-growth -A.2.20,Em,Enterprise multiple,Loughran and Wellman,2011,Value-versus-growth -A.2.21,"Emq1, Emq6","Quarterly Enterprise multiple (1-month holding period), Quarterly Enterprise multiple (6-month holding period)",Loughran and Wellman,2011,Value-versus-growth -A.2.22,Sp,Sales-to-price,"Barbee, Mukherji, and Raines",1996,Value-versus-growth -A.2.23,"Sp q 1, Sp q 6, and Sp q 12",Quarterly Sales-to-price,"Barbee, Mukherji, and Raines",1996,Value-versus-growth -A.2.24,Ocp,Operating Cash Flow-to-price,"Desai, 
Rajgopal, and Venkatachalam",2004,Value-versus-growth -A.2.26,Ir,Intangible Return,Daniel and Titman,2006,Value-versus-growth -A.2.28,Ebp,Enterprise Book-to-price,"Penman, Richardson, and Tuna",2007,Value-versus-growth -A.3.1,Aci,Abnormal Corporate Investment,"Titman, Wei, and Xie",2004,Investment -A.3.2,I/A, Investment-to-assets,"Cooper, Gulen, and Schill",2008,Investment -A.3.3,"Ia q 6, and Ia q 12",Quarterly Investment-to-assets,"Cooper, Gulen, and Schill",2008,Investment -A.3.4,dPia,Changes in PPE and Inventory-to-assets,"Lyandres, Sun, and Zhang",2008,Investment -A.3.5,Noa and dNoa,(Changes in) Net Operating Assets,"Hirshleifer, Hou, Teoh, and Zhang",2004,Investment -A.3.6,dLno,Changes in Long-term Net Operating Assets,"Fairfield, Whisenant, and Yohn",2003,Investment -A.3.7,Ig,Investment Growth,Xing,2008,Investment -A.3.8,2Ig,2-year Investment Growth,Anderson and Garcia-Feijoo,2006,Investment -A.3.10,Nsi,Net Stock Issues,Pontiff and Woodgate,2008,Investment -A.3.11,dIi,% Change in Investment - % Change in Industry Investment,Abarbanell and Bushee,1998,Investment -A.3.14,Ivg,Inventory Growth,Belo and Lin,2011,Investment -A.3.15,Ivc,Inventory Changes,Thomas and Zhang,2002,Investment -A.3.16,Oa(acc),Operating Accruals,Sloan,1996,Investment -A.3.17,Ta,Total Accruals,"Richardson, Sloan, Soliman, and Tuna",2005,Investment -A.3.18,dCoa,changes in Current Operating Assets,"Richardson, Sloan, Soliman, and Tuna",2005,Investment -A.3.19,dNca,changes in Non-current Operating Assets,"Richardson, Sloan, Soliman, and Tuna",2005,Investment -A.3.19,dNco,Changes in Net Non-current Operating Assets,"Richardson, Sloan, Soliman, and Tuna",2005,Investment -A.3.20,dFin,Changes in Net Financial Assets,"Richardson, Sloan, Soliman, and Tuna",2005,Investment -A.3.20,dFnl,changes in Financial Liabilities,"Richardson, Sloan, Soliman, and Tuna",2005,Investment -A.3.20,dBe,changes in Book Equity,"Richardson, Sloan, Soliman, and Tuna",2005,Investment -A.3.22,Poa,Percent operating 
accruals,"Hafzalla, Lundholm, and Van Winkle",2011,Investment -A.3.23,Pta,Percent total accruals,"Hafzalla, Lundholm, and Van Winkle",2011,Investment -A.3.24,Pda,Percent discretionary accruals,,,Investment -A.3.25,Ndf,Net debt finance,"Bradshaw, Richardson, and Sloan",2006,Investment -A.4.1,"Roe1, Roe6",Return on Equity,"Hou, Xue, and Zhang",2015,Profitability -A.4.2,"dRoe1, dRoe6, and dRoe12",Changes in Return on Equity,"Hou, Xue, and Zhang",2015,Profitability -A.4.3,Roa1,Return on Assets,"Balakrishnan, Bartov, and Faurel",2010,Profitability -A.4.4,"dRoa1, dRoa6",Changes in Return on Assets,"Balakrishnan, Bartov, and Faurel",2010,Profitability -A.4.5,Ato,Asset Turnover,Soliman,2008,Profitability -A.4.6,Cto,Capital Turnover,Haugen and Baker,1996,Profitability -A.4.7,"Rna q 1, Rna q 6, Atoq1","Quarterly Return on Net Operating Assets, Quarterly Asset Turnover",Soliman,2008,Profitability -A.4.8,"Cto q1, Cto q6,",Quarterly Capital Turnover,Haugen and Baker,1996,Profitability -A.4.9,Gpa,Gross Profits-to-assets,Novy-Marx,2013,Profitability -A.4.11,"Gla q 1, Gla q 6, and Gla q 12",Quarterly Gross Profits-to-lagged Assets,,,Profitability -A.4.12,Ope,Operating Profits to Equity,Fama and French,2015,Profitability -A.4.14,"Ole q 1, Ole q 6 ",Quarterly Operating Profits-to-lagged Equity,,,Profitability -A.4.15,Opa,Operating Profits-to-assets,"Linnainmaa, and Nikolaev",2015,Profitability -A.4.17,"Ola q 1, Ola q 6, and Ola q 12",Quarterly Operating Profits-to-lagged Assets,,,Profitability -A.4.18,Cop,Cash-based Operating Profitability,"Gerakos, Linnainmaa, and Nikolaev",2016,Profitability -A.4.19,Cla,Cash-based Operating Profits-to-lagged Assets,,,Profitability -A.4.20,Claq,Quarterly Cash-based Operating Profits-to-lagged Assets,,,Profitability -A.4.29,Tbi q 12,Quarterly Taxable Income-to-book Income,"Green, Hand, and Zhang",2013,Profitability -A.5.1,Oca and Ioca,(Industry-adjusted) Organizational Capital-to-assets,Eisfeldt and Papanikolaou,2013,Intangibles 
-A.5.2,Adm,Advertising Expense-to-market,"Chan, Lakonishok, and Sougiannis",2001,Intangibles -A.5.4,Rdm,R&D Expense-to-market,"Chan, Lakonishok, and Sougiannis",2001,Intangibles -A.5.5,"Rdm q 1, Rdm q 6, and Rdm q 12",Quarterly R&D Expense-to-market,"Chan, Lakonishok, and Sougiannis",2001,Intangibles -A.5.6,Rds q 6 and Rds q 12,Quarterly R&D Expense-to-sales,"Chan, Lakonishok, and Sougiannis",2001,Intangibles -A.5.8,Ol,Operating Leverage,Novy-Marx,2011,Intangibles -A.5.9,"Ol q 1, Ol q 6, and Ol q 12",Quarterly Operating Leverage,Novy-Marx,2011,Intangibles -A.5.11,Rca,R&D Capital-to-assets,Li,2011,Intangibles -A.5.24,Etr,Effective Tax Rate,Abarbanell and Bushee,1998,Intangibles -A.5.46,"Alm q 1, Alm q 6, and Alm q 12",Quarterly Asset Liquidity,Ortiz-Molina and Phillips,2014,Intangibles -A.5.50,"R a 1 , R n 1 , R a [2,5] , R n[2,5] , R a[6,10] , R n[6,10] , R a[11,15] , and R a[16,20]",Seasonality,Heston and Sadka,2008,Intangibles -A.6.8,Beta1,Market Beta,Fama and MacBeth,1973,Frictions -A.6.13,Dtv12,"dollar trading volume, 12-month holding period","Brennan, Chordia, and Subrahmanyam",1998,Frictions -A.6.21,Isff1,"idiosyncratic skewness estimated from the Fama-French 3-factor model, 1-month holding period",Harvey and Siddique,2000,Frictions -A.6.22,Isq1,"idiosyncratic skewness estimated from the q-factor model, 1-month holding period",Harvey and Siddique,2000,Frictions -A.6.3,Ivff1,"idiosyncratic volatility estimated from the Fama-French 3-factor model, 1-month holding period","Ang, Hodrick, Xing, and Zhang",2006,Frictions -A.6.5,Ivq1,"idiosyncratic volatility estimated from the q-factor model, 1-month holding period","Ang, Hodrick, Xing, and Zhang",2006,Frictions -A.6.1,Me,the market equity,Banz,1981,Frictions -A.6.24,Srev,short-term reversal,Jegadeesh,1990,Frictions -A.6.7,Sv1,"systematic volatility, 1-month holding period","Ang, Hodrick, Xing, and Zhang",2006,Frictions -A.6.6,Tv1,"total volatility, 1-month holding period","Ang, Hodrick, Xing, and 
Zhang",2006,Frictions -A.1.2,Abr1,"cumulative abnormal returns around earnings announcement dates, 1-month holding period","Chan, Jegadeesh, and Lakonishok",1996,Momentum -A.1.2,Abr6,"cumulative abnormal returns around earnings announcement dates, 6-month holding period","Chan, Jegadeesh, and Lakonishok",1996,Momentum -A.1.2,Abr12,"cumulative abnormal returns around earnings announcement dates, 12-month holding period","Chan, Jegadeesh, and Lakonishok",1996,Momentum -,agr,Asset growth,"Cooper, Gulen & Schill",2008, -,baspread,Bid-ask spread rolling 3m,Amihud & Mendelson,1989, -,beta,Beta rolling 3m,Fama & MacBeth,1973, -,bm_ia,Industry-adjusted book to market,"Asness, Porter & Stevens",2000, -,cash,Cash holdings,Palazzo,2012, -,cashdebt,Cash flow to debt,Ou & Penman,1989, -,chcsho,Change in shares outstanding,Pontiff & Woodgate,2008, -,chpm(chpmia),Industry-adjusted change in profit margin,Soliman,2008, -,chtx,Change in tax expense,Thomas & Zhang,2011, -,cinvest,Corporate investment,"Titman, Wei & Xie",2004, -,depr,Depreciation / PP&E,Holthausen & Larcker,1992, -,dolvol,Dollar trading volume,"Chordia, Subrahmanyam & Anshuman",2001, -,gma,Gross profitability,Novy-Marx,2013, -,grltnoa,Growth in long-term net operating assets,"Fairfield, Whisenant & Yohn",2003, -,herf,Industry sales concentration,Hou & Robinson,2006, -,hire,Employee growth rate,"Bazdresch, Belo & Lin",2014, -,ill,Illiquidity rolling 3m,Amihud,2002, -,lev,Leverage,Bhandari,1988, -,lgr,Growth in long-term debt,"Richardson, Sloan, Soliman & Tuna",2005, -,maxret,Maximum daily returns rolling 3m,"Bali, Cakici & Whitelaw",2011, -,me_ia(mve_ia),Industry-adjusted size,"Asness, Porter & Stevens",2000, -,mom12m,Momentum rolling 12m,Jegadeesh,1990, -,mom1m,Momentum ,Jegadeesh & Titman,1993, -,mom36m,Momentum rolling 36m,Jegadeesh & Titman,1993, -,mom60m,Momentum rolling 60m,Jegadeesh & Titman,1993, -,mom6m,Momentum rolling 6m,Jegadeesh & Titman,1993, -,nincr,Number of earnings increases,"Barth, Elliott & 
Finn",1999, -,op(operprof),Operating profitability,Fama and French,2015, -A.4.5,pm,profit margin,Soliman,2008,Profitability -,pscore(ps),Performance Score,Piotroski,2000, -,rd_sale,R&D to sales,"Guo, Lev & Shi",2006, -,re,Revisions in analysts’ earnings forecasts,"Chan, Jegadeesh, and Lakonishok",1996, -,rsup,Revenue surprise,Kama,2009, -,rvar_capm,Residual variance - CAPM rolling 3m,Daily Stock residual variance of CAPM,, -,rvar_ff3,Residual variance - ff3 rolling 3m,Daily Stock residual variance of Fama French 3 factors,, -,rvar_mean,return variance rolling 3m,Daily Stock return variance,, -,sgr,Sales growth,"Lakonishok, Shleifer & Vishny",1994, -,std_dolvol,Std of dollar trading volume rolling 3m,"Chordia, Subrahmanyam & Anshuman",2001, -,std_turn,Std. of Share turnover rolling 3m,"Chordia, Subrahmanyam, &Anshuman",2001, -,sue,Unexpected quarterly earnings,"Rendelman, Jones & Latane",1982, -,turn,Shares turnover,"Datar, Naik & Radcliffe",1998, -,zerotrade,Number of zero-trading days rolling 3m,Liu,2006, \ No newline at end of file diff --git a/char60/.DS_Store b/char60/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 GIT binary patch literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 Date: Mon, 1 Mar 2021 20:23:22 +0800 Subject: [PATCH 10/15] update 0301 --- .DS_Store | Bin 12292 -> 10244 bytes README.md | 90 ++++- _config.yml | 1 - char60/.DS_Store | Bin 6148 -> 6148 bytes char60/Chars60_description.csv | 61 --- char60/abr.py | 236 ++++++++++++ char60/accounting_60.py | 512 +++----------------------- char60/beta.py | 0 char60/bid_ask_spread.py | 0 char60/functions.py | 61 +-- {py-iclink => char60}/iclink.py | 1 - char60/ill.py | 0 char60/impute_rank_output_bchmk_60.py | 164 +++++++-- char60/maxret_d.py | 0 char60/merge_chars_60.py | 149 ++++++++ char60/pkl_to_csv.py | 29 ++ 
char60/re.py | 120 ++++++ char60/rvar_capm.py | 168 +++++++++ char60/rvar_ff3.py | 201 ++++++++++ char60/rvar_mean.py | 150 ++++++++ char60/std_dolvol.py | 0 char60/std_turn.py | 0 char60/sue.py | 106 ++++++ char60/zerotrade.py | 0 py-dgtw/.DS_Store | Bin py-dgtw/dgtw.py | 0 py-ff3/ff3.py | 0 py-iclink/.DS_Store | Bin 6148 -> 0 bytes py-pead/pead.py | 0 pychars/.DS_Store | Bin pychars/accounting.py | 53 ++- pychars/beta.py | 0 pychars/functions.py | 2 +- pychars/hxz_abr.py | 0 pychars/hxz_re.py | 0 pychars/hxz_sue.py | 0 pychars/iclink.py | 0 pychars/impute_rank_output.py | 0 pychars/merge_chars.py | 0 pychars/rvar_capm.py | 0 pychars/rvar_ff3.py | 0 pychars/rvar_mean.py | 0 qsub/.DS_Store | Bin qsub/check_crsp.sas | 0 qsub/submit.sh | 0 setup-wrds.py | 0 46 files changed, 1475 insertions(+), 629 deletions(-) mode change 100644 => 100755 README.md delete mode 100644 _config.yml mode change 100644 => 100755 char60/.DS_Store delete mode 100644 char60/Chars60_description.csv create mode 100755 char60/abr.py mode change 100644 => 100755 char60/accounting_60.py mode change 100644 => 100755 char60/beta.py mode change 100644 => 100755 char60/bid_ask_spread.py mode change 100644 => 100755 char60/functions.py rename {py-iclink => char60}/iclink.py (99%) mode change 100644 => 100755 mode change 100644 => 100755 char60/ill.py mode change 100644 => 100755 char60/impute_rank_output_bchmk_60.py mode change 100644 => 100755 char60/maxret_d.py mode change 100644 => 100755 char60/merge_chars_60.py create mode 100755 char60/pkl_to_csv.py create mode 100755 char60/re.py create mode 100755 char60/rvar_capm.py create mode 100755 char60/rvar_ff3.py create mode 100755 char60/rvar_mean.py mode change 100644 => 100755 char60/std_dolvol.py mode change 100644 => 100755 char60/std_turn.py create mode 100755 char60/sue.py mode change 100644 => 100755 char60/zerotrade.py mode change 100644 => 100755 py-dgtw/.DS_Store mode change 100644 => 100755 py-dgtw/dgtw.py mode change 100644 => 100755 
py-ff3/ff3.py delete mode 100644 py-iclink/.DS_Store mode change 100644 => 100755 py-pead/pead.py mode change 100644 => 100755 pychars/.DS_Store mode change 100644 => 100755 pychars/accounting.py mode change 100644 => 100755 pychars/beta.py mode change 100644 => 100755 pychars/functions.py mode change 100644 => 100755 pychars/hxz_abr.py mode change 100644 => 100755 pychars/hxz_re.py mode change 100644 => 100755 pychars/hxz_sue.py mode change 100644 => 100755 pychars/iclink.py mode change 100644 => 100755 pychars/impute_rank_output.py mode change 100644 => 100755 pychars/merge_chars.py mode change 100644 => 100755 pychars/rvar_capm.py mode change 100644 => 100755 pychars/rvar_ff3.py mode change 100644 => 100755 pychars/rvar_mean.py mode change 100644 => 100755 qsub/.DS_Store mode change 100644 => 100755 qsub/check_crsp.sas mode change 100644 => 100755 qsub/submit.sh mode change 100644 => 100755 setup-wrds.py diff --git a/.DS_Store b/.DS_Store index 7329c78c8bf4823b0c71684345a5a253510ed259..d9bba7c004d819690d037ce9b0e41b3a7d588d0f 100644 GIT binary patch delta 230 zcmZokXbF&DU|?W$DortDU{C-uIe-{M3-C-V6q~3gIoUvmMH0vd3o-)585qnM43f%= z3zBm3lQtf7W1n~-b~8H%3kRdZC$Gf(E% d(Pd6+XvtoVj^46XV1ir;k+~6Q^tvuN}XVrmXF?6Vs$E@heSBo!y@9cVZ zb`~m9rKZeCckawR=f2MQX3lr#4gla_QHuk#1Asz_=jvuUJq8fnM7ihSa-OP=byR|s zdMMGg3a}3}m<0zGM$Ibi>Vi5d;`Qq(ZGb#1dh1)6v}vVDLe0Xnl-gIs0kE<{ci|CQ zAhbYefzSe>1wsq_cNXBWmkz}j?`3It4lNK`;Q!hJ{QVH6#PfukFUwrKI_ShHJeP9{ zFAloqgq@c~-8|vu%Q7bk9Vn|(E~}!u#6Ve{%A>Ac!p)auE~^8Io~a$TXLMI6D0-*z zs89z?$UHoU76>g+ZGpX1Bc~TcWkb_R<-sWdj1Yx|MM3x)c`jJ zXnCFa*oEDXz9yX+sJZIWf3F$f*nswY?k&9^@}NTsG@onV@VVBfy~f=7mcLeAqXf9r zX9%x5leaef`i&n}yjITS^;kK^%m&D49t-O3t(5eur3wo!m{=nb74(?!x&?t){f>#}O}UDD?_g~FERi17cDvet zfJG+ER9ZJxjmdtx4zkG9yquDHw_DiP$RekRuw-c}wp(`Rkx1kr-tF;}?cAH!b2OwM zyL}JPknZQQ0ZrorQd$#ymxqqlM2Eak`KoIt#NaL%fzuSTa(8MCn;{q<@$M7zm6}+an+S*+%b7yI^(zL zNO&(S_P=QI1=EI=n$wW zp+dm*Dxq{|%Pw|H3B-PIPY}-Z^9qNxxf zl2@o{fSWaS3+0=ssdpxxC?K=cN5C?;vE>nZtTH6LK?@}Fpl6cd_SJVahxEu 
zoguVkP{SFtFpmYa33U(Shwu@66hDlY@NpM?Py6t8w1hO7;}@&(=8ayT{MohRzm;Uf zhe~hss;yNsdX>@bi(d6`4s0kwx-xFTVH)SBO8cX9UDjJ;5$i1@9pwr748-6h48s5< z;RNOTA?s&pVk_Besx4$towlrtMBVq%>#smt*f!Oq?dVD)i&dAmro_nq_?hqdZ-Qxh z_gnP#pP@C&EKPoq3};qOUtr9;=&VsI@1&0xZKpu(4#OC=WRr1qYVQC{QXeK6I!pI9 z-6x5IIg}2-gm41!F}ltZH`9dFqESih9O-e{Q5v5E$M_RKcf<5F6h+A*47rRhMRj<8 z1uEw~^OYE{S#y>&-h15f)|e61ZJ<^pW9)&BO1mIm(RpD}p}WSOzwXYSSTx?>d)%!cvdD`o$s#YQB#XSnM;3We6=AtS!Ab0X_*pC4Ad>H@TkDhDddr=tw LX-#?KpS=gKA$K2X4+sE z%GD6lA?@t8^UcrKLMaic_OyRO)F7f78tcj~I*0H)wk^m@*mD zGcl^@(Jd@_=8TS=zL_c8cL4i@o~7CeaL#(^931ak**Ae20COYxaMT2E4+*0yRZ`t_Zxp?KQ5`tE^Ur40-SL-8kOW z%8mgyYalqYS!#jSzV{4x20R0E2Ke_OL}MKpU1%*o9q8l=0A0bb2%h;b1!F>^Bcltg zF#}N=3az2aTrre}V>~o=k?>z&afieS|`nAF5 z|HbP2e;MT8JOiGAf5m{RbgnvWlw{7AjHu~2NHo+1YW5HK<@2y7PQ5M$Y_z%h$?Gdl-A2T%b}u84%9H(hI5x+KtY8KJ$3PAT diff --git a/char60/Chars60_description.csv b/char60/Chars60_description.csv deleted file mode 100644 index 45fa2c2..0000000 --- a/char60/Chars60_description.csv +++ /dev/null @@ -1,61 +0,0 @@ -Chars,Name,Description -abr,Cumulative Abnormal Returns Around Earnings Announcement Dates,"We calculate cumulative abnormal stock return (Abr) around the latest quarterly earnings announcement date (Compustat quarterly item RDQ) (Chan, Jegadeesh, and Lakonishok 1996)): Abr_i = \sum_{d=-2}^{+1}r_{id}-r{md}, in which r id is stock i’s return on day d (with the earnings announced on day 0) and r md is the market index return. We cumulate returns until one (trading) day after the announcement date to account for the one-day-delayed reaction to earnings news. r md is the value-weighted market return for the Abr deciles with NYSE breakpoints and value-weighted returns, but is the equal-weighted market return with all-but-micro breakpoints and equal-weighted returns." 
-acc,Working capital accruals,Annual income before extraordinary items (ib) minus operating cash flows (oancf ) divided by average total assets (at); if oancf is missing then set to change in act change in che - change in lct + change in dlc + change in txp-dp -adm,Advertising Expense-to-market,"At the end of June of each year t, we sort stocks into deciles based on advertising expenses-tomarket, Adm, which is advertising expenses (Compustat annual item XAD) for the fiscal year ending in calendar year t − 1 divided by the market equity (from CRSP) at the end of December of t − 1. For firms with more than one share class, we merge the market equity for all share classes before computing Adm. We keep only firms with positive advertising expenses. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1. Because sufficient XAD data start in 1972, the Adm portfolios start in July 1973." -agr,Asset growth,Annual percent change in total assets (at) -alm,Asset liquidity (scaled by market assets),"Following Ortiz-Molina and Phillips (2014), we measure asset liquidity as cash + 0.75 × noncash current assets + 0.50 × tangible fixed assets. Cash is cash and short-term investments (Compustat annual item CHE). Noncash current assets is current assets (item ACT) minus cash. Tangible fixed assets is total assets (item AT) minus current assets (item ACT), minus goodwill (item GDWL, zero if missing), and minus intangibles (item INTAN, zero if missing). Alm is asset liquidity scaled by one-year-lagged market value of assets. Market value of assets is total assets plus market equity (item PRCC F times item CSHO) minus book equity (item CEQ)." -ato,Asset turnover,"Noa is operating assets minus operating liabilities. Operating assets are total assets (item ATQ) minus cash and short-term investments (item CHEQ), and minus other investment and advances (item IVAOQ, zero if missing). 
Operating liabilities are total assets minus debt in current liabilities (item DLCQ, zero if missing), minus long-term debt (item DLTTQ, zero if missing), minus minority interests (item MIBQ, zero if missing), minus preferred stocks (item PSTKQ, zero if missing), and minus common equity (item CEQQ). Quarterly profit margin, Pm q , is quarterly operating income after depreciation divided by quarterly sales (item SALEQ). Quarterly asset turnover, Ato q , is quarterly sales divided by one-quarter-lagged Noa." -baspread,Bid-ask spread rolling 3m,Monthly average of daily bid-ask spread divided by average of daily spread -beta,Beta rolling 3m,Estimated market beta from daily returns -bm,Book-to-market,Book value of equity (ceq) divided by end of fiscal year-end market capitalization -bm_ia,Industry-adjusted book to market,Industry adjusted book-to-market ratio -cash,Cash holdings,Cash and cash equivalents divided by average total assets -cashdebt,Cash flow to debt,Earnings before depreciation and extraordinary items (ib+dp) divided by avg. 
total liabilities (lt) -cfp,Cash-flow-to-price ratio,Operating cash flows divided by fiscal-year-end market capitalization -chcsho,Change in shares outstanding,Annual percent change in shares outstanding (csho) -chpm,Industry-adjusted change in profit margin,2-digit SIC - fiscal-year mean adjusted change in income before extraordinary items (ib) divided by sales (sale) -chtx,Change in tax expense,Percent change in total taxes (txtq) from quartert-4 to t -cinvest,Corporate investment,"Change over one quarter in net PP&E (ppentq) divided by sales (saleq) - average of this variable for prior 3 quarters; if saleq = 0, then scale by 0.01" -depr,Depreciation / PP&E,Depreciation divided by PP&E -dolvol,Dollar trading volume,Natural log of trading volume times price per share from month t-2 -dy,Dividend to price,Total dividends (dvt) divided by market capitalization at fiscal year-end -ep,Earnings to price,Annual income before extraordinary items (ib) divided by end of fiscal year market cap -gma,Gross profitability,Revenues (revt) minus cost of goods sold (cogs) divided by lagged total assets (at) -grltnoa,Growth in long-term net operating assets,Growth in long-term net operating assets -herf,Industry sales concentration,2-digit SIC - fiscal-year sales concentration (sum of squared percent of sales in industry for each company). -hire,Employee growth rate,Percent change in number of employees (emp) -ill,Illiquidity rolling 3m,Average of daily (absolute return / dollar volume). 
-lev,Leverage,Total liabilities (lt) divided by fiscal year-end market capitalization -lgr,Growth in long-term debt,Annual percent change in total liabilities (lt) -maxret,Maximum daily returns rolling 3m,Maximum daily return from returns during calendar montht-1 -me_ia,Industry-adjusted size,2-digit SIC industry-adjusted fiscal year-end market capitalization -mom12m,Momentum rolling 12m,11-month cumulative returns ending one month before month end -mom1m,Momentum ,1-month cumulative return -mom36m,Momentum rolling 36m,Cumulative returns from monthst-36 to t-13 -mom60m,Momentum rolling 60m,Cumulative returns from monthst-60 to t-13 -mom6m,Momentum rolling 6m,5-month cumulative returns ending one month before month end -ni,Net stock issues,"At the end of June of year t, we measure net stock issues, Nsi, as the natural log of the ratio of the split-adjusted shares outstanding at the fiscal year ending in calendar year t−1 to the split-adjusted shares outstanding at the fiscal year ending in t−2. The split-adjusted shares outstanding is shares outstanding (Compustat annual item CSHO) times the adjustment factor (item AJEX)." -nincr,Number of earnings increases,Number of consecutive quarters (up to eight quarters) with an increase in earnings (ibq) over same quarter in the prior year -noa,Net operating assets,"Following Hirshleifer, Hou, Teoh, and Zhang (2004), we measure net operating assets as operating assets minus operating liabilities. Operating assets are total assets (Compustat annual item AT) minus cash and short-term investment (item CHE). Operating liabilities are total assets minus debt included in current liabilities (item DLC, zero if missing), minus long-term debt (item DLTT, zero if missing), minus minority interests (item MIB, zero if missing), minus preferred stocks (item PSTK, zero if missing), and minus common equity (item CEQ). Noa is net operating assets scalded by one-year-lagged total assets." 
-op,Operating profitability,"Following Fama and French (2015), we measure operating profitability to equity, Ope, as total revenue (Compustat annual item REVT) minus cost of goods sold (item COGS, zero if missing), minus selling, general, and administrative expenses (item XSGA, zero if missing), and minus interest expense (item XINT, zero if missing), scaled by book equity (the denominator is current, not lagged, book equity). We require at least one of the three expense items (COGS, XSGA, and XINT) to be non-missing. Book equity is stockholders’ book equity, plus balance sheet deferred taxes and investment tax credit (item TXDITC) if available, minus the book value of preferred stock. Stockholders’ equity is the value reported by Compustat (item SEQ), if it is available. If not, we measure stockholders’ equity as the book value of common equity (item CEQ) plus the par value of preferred stock (item PSTK), or the book value of assets (item AT) minus total liabilities (item LT). Depending on availability, we use redemption (item PSTKRV), liquidating (item PSTKL), or par value (item PSTK) for the book value of preferred stock." -pctacc,Percent accruals,Same as acc except that the numerator is divided by the absolute value of ib; if ib = 0 then ib set to 0.01 for denominator. -pm,profit margin,"Soliman (2008) use DuPont analysis to decompose Roe as Rna + FLEV × SPREAD, in which Roe is return on equity, Rna is return on net operating assets, FLEV is financial leverage, and SPREAD is the difference between return on net operating assets and borrowing costs. We can further decompose Rna as Pm × Ato, in which Pm is profit margin and Ato is asset turnover. Pm is operating income after depreciation divided by sales (item SALE) for the fiscal year ending in calendar year t − 1." 
-pscore,Performance Score,Sum of 9 indicator variables to form fundamental health score -rd_sale,R&D to sales,R&D expense divided by sales (xrd/sale) -rdm,R&D to market capitalization,R&D expense divided by end-of-fiscal-year market capitalization -re,Revisions in analysts’ earnings forecasts,"Following Chan, Jegadeesh, and Lakonishok (1996), we measure earnings surprise as the revisions in analysts’ forecasts of earnings obtained from the Institutional Brokers’ Estimate System (IBES). Because analysts’ forecasts are not necessarily revised each month, we construct a six-month moving average of past changes in analysts’ forecasts: RE_{it}=\sum_{\iota=1}^6\frac{" -rna,return on net operating assets,"Following Soliman (2008), we use annual sorts to form Rna, Pm, and Ato deciles. At the end of June of year t, we measure Rna as operating income after depreciation (Compustat annual item OIADP) for the fiscal year ending in calendar year t − 1 divided by net operating assets (Noa) for the fiscal year ending in t − 2." -roa,Return on assets,"Return on assets, Roa, is income before extraordinary items (Compustat quarterly item IBQ) divided by one-quarter-lagged total assets (item ATQ). At the beginning of each month t, we sort all stocks into deciles based on Roa computed with quarterly earnings from the most recent earnings announcement dates (item RDQ). For a firm to enter the portfolio formation, we require the end of the fiscal quarter that corresponds to its most recent Roa to be within six months prior to the portfolio formation. This restriction is imposed to exclude stale earnings information. To avoid potentially erroneous records, we also require the earnings announcement date to be after the corresponding fiscal quarter end." -roe,Return on equity,"Return on equity, Roe, is income before extraordinary items (Compustat quarterly item IBQ) divided by one-quarter-lagged book equity (Hou, Xue, and Zhang 2015). 
Book equity is shareholders’ equity, plus balance sheet deferred taxes and investment tax credit (item TXDITCQ) if available, minus the book value of preferred stock (item PSTKQ). Depending on availability, we use stockholders’ equity (item SEQQ), or common equity (item CEQQ) plus the book value of preferred stock, or total assets (item ATQ) minus total liabilities (item LTQ) in that order as shareholders’ equity." -rsup,Revenue surprise,Sales from quarter t minus sales from quarter t-4 (saleq) divided by fiscal-quarter-end market capitalization (cshoq * prccq) -rvar_capm,Residual variance - CAPM rolling 3m,Daily Stock residual variance of CAPM -rvar_ff3,Residual variance - ff3 rolling 3m,Daily Stock residual variance of Fama French 3 factors -rvar_mean,return variance rolling 3m,Daily Stock return variance -seas1a,Seasonality,Monthly Stock Return Lag 11 Months -sgr,Sales growth,Annual percent change in sales (sale) -sp,Sales to price,Annual revenue (sale) divided by fiscal year-end market capitalization -std_dolvol,Std of dollar trading volume rolling 3m,Monthly standard deviation of daily dollar trading volume -std_turn,Std. of Share turnover rolling 3m,Monthly standard deviation of daily share turnover -sue,Unexpected quarterly earnings,"Unexpected quarterly earnings divided by fiscal-quarter-end market cap. 
Unexpected earnings is I/B/E/S actual earnings minus median forecasted earnings if available, else it is the seasonally differenced quarterly earnings before extraordinary items from Compustat quarterly file" -turn,Shares turnover,Average monthly trading volume for most recent 3 months scaled by number of shares outstanding in current month -zerotrade,Number of zero-trading days rolling 3m,Turnover weighted number of zero trading days for most recent 1 month \ No newline at end of file diff --git a/char60/abr.py b/char60/abr.py new file mode 100755 index 0000000..ecb5219 --- /dev/null +++ b/char60/abr.py @@ -0,0 +1,236 @@ +# Calculate HSZ Replicating Anomalies +# ABR: Cumulative abnormal stock returns around earnings announcements + +import pandas as pd +import numpy as np +import datetime as dt +import wrds +from dateutil.relativedelta import * +from pandas.tseries.offsets import * +import pickle as pkl +import sqlite3 + +################### +# Connect to WRDS # +################### +conn = wrds.Connection() + +################### +# Compustat Block # +################### +comp = conn.raw_sql(""" + select gvkey, datadate, rdq, fyearq, fqtr + from comp.fundq + where indfmt = 'INDL' + and datafmt = 'STD' + and popsrc = 'D' + and consol = 'C' + and datadate >= '01/01/1959' + """) + +comp['datadate'] = pd.to_datetime(comp['datadate']) + +print('='*10, 'comp data is ready', '='*10) +################### +# CCM Block # +################### +ccm = conn.raw_sql(""" + select gvkey, lpermno as permno, linktype, linkprim, + linkdt, linkenddt + from crsp.ccmxpf_linktable + where linktype in ('LU', 'LC') + """) + +ccm['linkdt'] = pd.to_datetime(ccm['linkdt']) +ccm['linkenddt'] = pd.to_datetime(ccm['linkenddt']) + +# if linkenddt is missing then set to today date +ccm['linkenddt'] = ccm['linkenddt'].fillna(pd.to_datetime('today')) + +ccm1 = pd.merge(comp, ccm, how='left', on=['gvkey']) +# extract month and year of rdq +ccm1['rdq'] = pd.to_datetime(ccm1['rdq']) + +# set link date 
bounds +ccm2 = ccm1[(ccm1['datadate']>=ccm1['linkdt']) & (ccm1['datadate']<=ccm1['linkenddt'])] +ccm2 = ccm2[['gvkey', 'datadate', 'rdq', 'fyearq', 'fqtr', 'permno']] + +################### +# CRSP Block # +################### + +# Report Date of Quarterly Earnings (rdq) may not be trading day, we need to get the first trading day on or after rdq +crsp_dsi = conn.raw_sql(""" + select distinct date + from crsp.dsi + where date >= '01/01/1959' + """) + +crsp_dsi['date'] = pd.to_datetime(crsp_dsi['date']) + +for i in range(6): # we only consider the condition that the day after rdq is not a trading day, which is up to 5 days + ccm2['trad_%s' % i] = ccm2['rdq'] + pd.DateOffset(days=i) # set rdq + i days to match trading day + crsp_dsi['trad_%s' % i] = crsp_dsi['date'] # set the merging key + crsp_dsi = crsp_dsi[['date', 'trad_%s' % i]] # reset trading day columns to avoid repeat merge + comp_temp = pd.merge(ccm2, crsp_dsi, how='left', on='trad_%s' % i) + comp_temp['trad_%s' % i] = comp_temp['date'] # reset rdq + i days to matched trading day + +# fill NA from rdq + 5 days to rdq + 0 days, then get trading day version of rdq +for i in range(5, 0, -1): + count = i-1 + comp_temp['trad_%s' % count] = np.where(comp_temp['trad_%s' % count].isnull(), + comp_temp['trad_%s' % i], comp_temp['trad_%s' % count]) + comp_temp['rdq_trad'] = comp_temp['trad_%s' % count] + +comp_temp = comp_temp[['gvkey', 'permno', 'datadate', 'fyearq', 'fqtr', 'rdq', 'rdq_trad']] + +print('='*10, 'crsp block is ready', '='*10) +############################# +# CRSP abnormal return # +############################# +crsp_d = conn.raw_sql(""" + select a.prc, a.ret, a.shrout, a.vol, a.cfacpr, a.cfacshr, a.permno, a.permco, a.date, + b.siccd, b.ncusip, b.shrcd, b.exchcd + from crsp.dsf as a + left join crsp.dsenames as b + on a.permno=b.permno + and b.namedt<=a.date + and a.date<=b.nameendt + where a.date >= '01/01/1959' + and b.exchcd between 1 and 3 + and b.shrcd in (10,11) + """) + +# change variable 
format to int +crsp_d[['permco', 'permno', 'shrcd', 'exchcd']] = crsp_d[['permco', 'permno', 'shrcd', 'exchcd']].astype(int) + +print('='*10, 'crsp abnormal return is ready', '='*10) + +# convert the date format +crsp_d['date'] = pd.to_datetime(crsp_d['date']) + +# add delisting return +dlret = conn.raw_sql(""" + select permno, dlret, dlstdt + from crsp.dsedelist + where dlstdt >= '01/01/1959' + """) + +dlret.permno = dlret.permno.astype(int) +dlret['dlstdt'] = pd.to_datetime(dlret['dlstdt']) + +crsp_d = pd.merge(crsp_d, dlret, how='left', left_on=['permno', 'date'], right_on=['permno', 'dlstdt']) +# return adjusted for delisting +crsp_d['retadj'] = np.where(crsp_d['dlret'].notna(), (crsp_d['ret'] + 1)*(crsp_d['dlret'] + 1) - 1, crsp_d['ret']) +crsp_d['meq'] = crsp_d['prc'].abs()*crsp_d['shrout'] # market value of equity +crsp_d = crsp_d.sort_values(by=['date', 'permno', 'meq']) + +# sprtrn +crspsp500d = conn.raw_sql(""" + select date, sprtrn + from crsp.dsi + where date >= '01/01/1959' + """) + +crspsp500d['date'] = pd.to_datetime(crspsp500d['date']) + +# abnormal return +crsp_d = pd.merge(crsp_d, crspsp500d, how='left', on='date') +crsp_d['abrd'] = crsp_d['retadj'] - crsp_d['sprtrn'] +crsp_d = crsp_d[['date', 'permno', 'ret', 'retadj', 'sprtrn', 'abrd']] + +# date count regarding to rdq +comp_temp['minus10d'] = comp_temp['rdq_trad'] - pd.Timedelta(days=10) +comp_temp['plus5d'] = comp_temp['rdq_trad'] + pd.Timedelta(days=5) + +# df = sqldf("""select a.*, b.date, b.abrd +# from comp_temp a left join crsp_d b +# on a.permno=b.permno +# and a.minus10d<=b.date +# and b.date<=a.plus5d +# order by a.permno, a.rdq_trad, b.date;""", globals()) + +sql = sqlite3.connect(':memory:') +comp_temp.to_sql('comp_temp', sql, index=False) +crsp_d.to_sql('crsp_d', sql, index=False) + +qry = """select a.*, b.date, b.abrd + from comp_temp a left join crsp_d b + on a.permno=b.permno + and a.minus10d<=b.date + and b.date<=a.plus5d + order by a.permno, a.rdq_trad, b.date;""" +df = 
pd.read_sql_query(qry, sql) +df.drop(['plus5d', 'minus10d'], axis=1, inplace=True) + +# delete missing return +df = df[df['abrd'].notna()] + +# count +df.sort_values(by=['permno', 'rdq_trad', 'date'], inplace=True) +condlist = [df['date']==df['rdq_trad'], + df['date']>df['rdq_trad'], + df['date']=0] +df_after['count'] = df_after.groupby(['permno', 'rdq_trad'])['date'].cumcount() + +df = pd.concat([df_before, df_after]) + +# calculate abr as the group sum +df = df[(df['count']>=-2) & (df['count']<=1)] + +df_temp = df.groupby(['permno', 'rdq_trad'])['abrd'].sum() +df_temp = pd.DataFrame(df_temp) +df_temp.reset_index(inplace=True) +df_temp.rename(columns={'abrd': 'abr'}, inplace=True) +df = pd.merge(df, df_temp, how='left', on=['permno', 'rdq_trad'], copy=False) # add abr back to df +df = df[df['count']==1] +df.rename(columns={'date': 'rdq_plus_1d'}, inplace=True) +df = df[['gvkey', 'permno', 'datadate', 'rdq', 'rdq_plus_1d', 'abr']] + +print('='*10, 'start populate', '='*10) + +# populate the quarterly abr to monthly +crsp_msf = conn.raw_sql(""" + select distinct date + from crsp.msf + where date >= '01/01/1959' + """) + +df['datadate'] = pd.to_datetime(df['datadate']) +df['plus12m'] = df['datadate'] + np.timedelta64(12, 'M') +df['plus12m'] = df['plus12m'] + MonthEnd(0) + +# df = sqldf("""select a.*, b.date +# from df a left join crsp_msf b +# on a.rdq_plus_1d < b.date +# and a.plus12m >= b.date +# order by a.permno, b.date, a.datadate desc;""", globals()) + +df.to_sql('df', sql, index=False) +crsp_msf.to_sql('crsp_msf', sql, index=False) + +qry = """select a.*, b.date + from df a left join crsp_msf b + on a.rdq_plus_1d < b.date + and a.plus12m >= b.date + order by a.permno, b.date, a.datadate desc;""" + +df = pd.read_sql_query(qry, sql) + +df = df.drop_duplicates(['permno', 'date']) +df['datadate'] = pd.to_datetime(df['datadate']) +df['rdq'] = pd.to_datetime(df['rdq']) +df['rdq_plus_1d'] = pd.to_datetime(df['rdq_plus_1d']) +df = df[['gvkey', 'permno', 'datadate', 
'rdq', 'rdq_plus_1d', 'abr', 'date']] + +with open('abr.pkl', 'wb') as f: + pkl.dump(df, f) \ No newline at end of file diff --git a/char60/accounting_60.py b/char60/accounting_60.py old mode 100644 new mode 100755 index 86dcd07..d32b43c --- a/char60/accounting_60.py +++ b/char60/accounting_60.py @@ -1,8 +1,6 @@ import pandas as pd import numpy as np -import datetime as dt import wrds -from dateutil.relativedelta import * from pandas.tseries.offsets import * import pickle as pkl from functions import * @@ -26,7 +24,7 @@ def ttm4(series, df): """ lag = pd.DataFrame() for i in range(1, 4): - lag['%(series)s%(lag)s' % {'series': series, 'lag': i}] = df.groupby('gvkey')['%s' % series].shift(i) + lag['%(series)s%(lag)s' % {'series': series, 'lag': i}] = df.groupby('permno')['%s' % series].shift(i) result = df['%s' % series] + lag['%s1' % series] + lag['%s2' % series] + lag['%s3' % series] return result @@ -46,7 +44,7 @@ def ttm12(series, df): lag['%s8' % series] + lag['%s9' % series] + lag['%s10' % series] + lag['%s11' % series] return result -print('TTM') + ####################################################################################################################### # Compustat Block # ####################################################################################################################### @@ -57,24 +55,23 @@ def ttm12(series, df): /*firm variables*/ /*income statement*/ f.sale, f.revt, f.cogs, f.xsga, f.dp, f.xrd, f.xad, f.ib, f.ebitda, - f.ebit, f.nopi, f.spi, f.pi, f.txp, f.ni, f.txfed, f.txfo, f.txt, f.xint, f.xpp, f.xacc, + f.ebit, f.nopi, f.spi, f.pi, f.txp, f.ni, f.txfed, f.txfo, f.txt, f.xint, /*CF statement and others*/ - f.capx, f.oancf, f.dvt, f.ob, f.gdwlia, f.gdwlip, f.gwo, f.mib, f.oiadp, f.ivao, f.ivst, + f.capx, f.oancf, f.dvt, f.ob, f.gdwlia, f.gdwlip, f.gwo, f.mib, f.oiadp, f.ivao, /*assets*/ f.rect, f.act, f.che, f.ppegt, f.invt, f.at, f.aco, f.intan, f.ao, f.ppent, f.gdwl, f.fatb, f.fatl, /*liabilities*/ f.lct, f.dlc, f.dltt, 
f.lt, f.dm, f.dcvt, f.cshrc, - f.dcpstk, f.pstk, f.ap, f.lco, f.lo, f.drc, f.drlt, f.txdi, f.dltis, f.dltr, f.dlcch, + f.dcpstk, f.pstk, f.ap, f.lco, f.lo, f.drc, f.drlt, f.txdi, /*equity and other*/ - f.ceq, f.scstkc, f.emp, f.csho, f.seq, f.txditc, f.pstkrv, f.pstkl, f.np, f.txdc, - f.dpc, f.ajex, f.tstkp, f.oibdp, f.capxv, f.dvpa, f.epspx, + f.ceq, f.scstkc, f.emp, f.csho, f.seq, f.txditc, f.pstkrv, f.pstkl, f.np, f.txdc, f.dpc, f.ajex, /*market*/ - abs(f.prcc_f) as prcc_f, abs(f.prcc_c) as prcc_c, f.dvc, f.prstkc, f.sstk, f.fopt, f.wcap + abs(f.prcc_f) as prcc_f from comp.funda as f left join comp.company as c @@ -122,7 +119,7 @@ def ttm12(series, df): comp['ceq'] = np.where(comp['ceq'] == 0, np.nan, comp['ceq']) comp['at'] = np.where(comp['at'] == 0, np.nan, comp['at']) comp = comp.dropna(subset=['at']) -print('compustat') + ####################################################################################################################### # CRSP Block # ####################################################################################################################### @@ -149,7 +146,7 @@ def ttm12(series, df): crsp['monthend'] = crsp['date'] + MonthEnd(0) # set all the date to the standard end date of month crsp = crsp.dropna(subset=['prc']) -crsp['me'] = crsp['prc'].abs() * crsp['shrout'] # calculate market equity +crsp['me'] = crsp['prc'].abs() * crsp['shrout'] # calculate market equity # if Market Equity is Nan then let return equals to 0 crsp['ret'] = np.where(crsp['me'].isnull(), 0, crsp['ret']) @@ -177,7 +174,7 @@ def ttm12(series, df): crsp2 = pd.merge(crsp1, crsp_summe, how='inner', on=['monthend', 'permco']) # sort by permno and date and also drop duplicates crsp2 = crsp2.sort_values(by=['permno', 'monthend']).drop_duplicates() -print('crsp') + ####################################################################################################################### # CCM Block # 
####################################################################################################################### @@ -201,7 +198,7 @@ def ttm12(series, df): ccm1 = pd.merge(comp, ccm, how='left', on=['gvkey']) # we can only get the accounting data after the firm public their report -# for annual data, we use 5 or 6 months lagged data +# for annual data, we use 4, 5 or 6 months lagged data ccm1['yearend'] = ccm1['datadate'] + YearEnd(0) ccm1['jdate'] = ccm1['datadate'] + MonthEnd(4) @@ -237,21 +234,17 @@ def ttm12(series, df): data_rawa = data_rawa[data_rawa['temp'].notna()] data_rawa = data_rawa.sort_values(by=['permno', 'jdate']) -print('ccm') + ####################################################################################################################### # Annual Variables # ####################################################################################################################### -# stockholders' equity -data_rawa['se'] = np.where(data_rawa['seq'].isnull(), data_rawa['ceq']+data_rawa['pstk'], data_rawa['seq']) -data_rawa['se'] = np.where(data_rawa['se'].isnull(), data_rawa['at']-data_rawa['lt'], data_rawa['se']) - -data_rawa['txditc'] = data_rawa['txditc'].fillna(0) - # preferrerd stock data_rawa['ps'] = np.where(data_rawa['pstkrv'].isnull(), data_rawa['pstkl'], data_rawa['pstkrv']) data_rawa['ps'] = np.where(data_rawa['ps'].isnull(), data_rawa['pstk'], data_rawa['ps']) data_rawa['ps'] = np.where(data_rawa['ps'].isnull(), 0, data_rawa['ps']) +data_rawa['txditc'] = data_rawa['txditc'].fillna(0) + # book equity data_rawa['be'] = data_rawa['seq'] + data_rawa['txditc'] - data_rawa['ps'] data_rawa['be'] = np.where(data_rawa['be'] > 0, data_rawa['be'], np.nan) @@ -283,9 +276,8 @@ def ttm12(series, df): # np.nan] # data_rawa['cfp'] = np.select(condlist, choicelist, default=(data_rawa['ib']+data_rawa['dp'])/data_rawa['me']) -# ep, checked from Hou and change 'ME' from compustat to crsp -#data_rawa['ep'] = data_rawa['ib']/data_rawa['me'] 
-#data_rawa['ep_n'] = data_rawa['ib'] +# ep +# data_rawa['ep'] = data_rawa['ib']/data_rawa['me'] # ni data_rawa['csho_l1'] = data_rawa.groupby(['permno'])['csho'].shift(1) @@ -295,7 +287,7 @@ def ttm12(series, df): np.log(data_rawa['csho']*data_rawa['ajex']).replace(-np.inf, 0)- np.log(data_rawa['csho_l1']*data_rawa['ajex_l1']).replace(-np.inf, 0)) -# op: the formula seems different from Hou Page 74? +# op data_rawa['cogs0'] = np.where(data_rawa['cogs'].isnull(), 0, data_rawa['cogs']) data_rawa['xint0'] = np.where(data_rawa['xint'].isnull(), 0, data_rawa['xint']) data_rawa['xsga0'] = np.where(data_rawa['xsga'].isnull(), 0, data_rawa['xsga']) @@ -305,8 +297,6 @@ def ttm12(series, df): data_rawa['op'] = np.select(condlist, choicelist, default=(data_rawa['revt'] - data_rawa['cogs0'] - data_rawa['xsga0'] - data_rawa['xint0'])/data_rawa['be']) - - # rsup data_rawa['sale_l1'] = data_rawa.groupby(['permno'])['sale'].shift(1) # data_rawa['rsup'] = (data_rawa['sale']-data_rawa['sale_l1'])/data_rawa['me'] @@ -317,9 +307,8 @@ def ttm12(series, df): # lev # data_rawa['lev'] = data_rawa['lt']/data_rawa['me'] -# sp, checked +# sp # data_rawa['sp'] = data_rawa['sale']/data_rawa['me'] -#data_rawa['sp_n'] = data_rawa['sale'] # rd_sale data_rawa['rd_sale'] = data_rawa['xrd']/data_rawa['sale'] @@ -327,7 +316,7 @@ def ttm12(series, df): # rdm # data_rawa['rdm'] = data_rawa['xrd']/data_rawa['me'] -# adm hxz adm, checked +# adm hxz adm # data_rawa['adm'] = data_rawa['xad']/data_rawa['me'] # gma @@ -369,7 +358,7 @@ def ttm12(series, df): data_rawa['txt_l1'] = data_rawa.groupby(['permno'])['txt'].shift(1) data_rawa['chtx'] = (data_rawa['txt']-data_rawa['txt_l1'])/data_rawa['at_l1'] -# noa,checked +# noa data_rawa['noa'] = ((data_rawa['at']-data_rawa['che']-data_rawa['ivao'].fillna(0))- (data_rawa['at']-data_rawa['dlc'].fillna(0)-data_rawa['dltt'].fillna(0)-data_rawa['mib'].fillna(0) -data_rawa['pstk'].fillna(0)-data_rawa['ceq'])/data_rawa['at_l1']) @@ -565,6 +554,8 @@ def ttm12(series, 
df): data_rawa['chpm'] = (data_rawa['ib']/data_rawa['sale'])-(data_rawa['ib_l1']/data_rawa['sale_l1']) # ala +data_rawa['gdwl'] = np.where(data_rawa['gdwl'].isnull(), 0, data_rawa['gdwl']) +data_rawa['intan'] = np.where(data_rawa['intan'].isnull(), 0, data_rawa['intan']) data_rawa['ala'] = data_rawa['che']+0.75*(data_rawa['act']-data_rawa['che'])-\ 0.5*(data_rawa['at']-data_rawa['act']-data_rawa['gdwl']-data_rawa['intan']) @@ -589,285 +580,6 @@ def ttm12(series, df): data_rawa = data_rawa.drop(['herf'], axis=1) data_rawa = pd.merge(data_rawa, df_temp, how='left', on=['datadate', 'ffi49']) -################################## Added on 2020.10.29 ################################## -# Bmj -data_rawa['be_per'] = data_rawa['be'] / data_rawa['csho'] -data_rawa['bmj'] = data_rawa['be_per'] / data_rawa['prc'] -############### *Q*: used prc as share price from crsp ########## - -# Cp -data_rawa['cf'] = data_rawa['ib'] + data_rawa['dp'] -#data_rawa['cp'] = data_rawa['cf'] / data_rawa['me'] - -# Dp -###### *Q* difference return with without divident - -# Dur -# me = data_rawa['me_comp'] - - -# Ebp -data_rawa['dvpa'] = np.where(data_rawa['dvpa'].isnull(), 0, data_rawa['dvpa']) -data_rawa['tstkp'] = np.where(data_rawa['tstkp'].isnull(), 0, data_rawa['tstkp']) -data_rawa['f_liab'] = data_rawa['dltt'] + data_rawa['dlc'] + data_rawa['pstk'] + data_rawa['dvpa'] - data_rawa['tstkp'] -data_rawa['f_asse'] = data_rawa['che'] -# net debt : = financial liabilities - financial assets. 
-data_rawa['n_debt'] = data_rawa['f_liab'] - data_rawa['f_asse'] -data_rawa['be'] = data_rawa['ceq'] + data_rawa['tstkp'] - data_rawa['dvpa'] -#data_rawa['ebp'] = (data_rawa['n_debt']+data_rawa['be']) / (data_rawa['n_debt']+data_rawa['me']) - - -# Em -#data_rawa['enteprs_v'] = data_rawa['me'] + data_rawa['dlc'] + data_rawa['dltt'] + data_rawa['pstkrv'] - data_rawa['che'] -#data_rawa['em'] = data_rawa['enteprs_v'] / data_rawa['oibdp'] - -############### Investment ############### -# Aci -data_rawa['ce'] = data_rawa['capx'] / data_rawa['sale'] -data_rawa['ce1'] = data_rawa['ce'].shift(1) -data_rawa['ce2'] = data_rawa['ce'].shift(2) -data_rawa['ce3'] = data_rawa['ce'].shift(3) -data_rawa['aci'] = data_rawa['ce']/ (data_rawa['ce1']+data_rawa['ce2']+data_rawa['ce3'])-1 - -# Cei -#data_rawa['lg_me'] = np.log(data_rawa['me']/data_rawa['me'].shift(6)) -#data_rawa['lg_ret'] = np.log(data_rawa['ret']*data_rawa['ret'].shift(1)*data_rawa['ret'].shift(2)*data_rawa['ret'].shift(3)*data_rawa['ret'].shift(5)*data_rawa['ret'].shift(6)) -#data_rawa['cei'] = data_rawa['lg_me'] - data_rawa['lg_ret'] - - -# Dac - - - -# dCoa -data_rawa['coa'] = data_rawa['act'] - data_rawa['che'] -data_rawa['dcoa'] = (data_rawa['coa']-data_rawa['coa'].shift(1)) / data_rawa['at'].shift(1) - - -# dBe -data_rawa['dBe'] = (data_rawa['ceq'] - data_rawa['ceq'].shift(1)) / data_rawa['at'].shift(1) - - -# dFnl & dFin -data_rawa['fna'] = data_rawa['ivst'] + data_rawa['ivao'] -data_rawa['fnl'] = data_rawa['dltt'] + data_rawa['dlc'] + data_rawa['pstk'] - -data_rawa['d_dlc'] = data_rawa['dlc'] - data_rawa['dlc'].shift(1) -data_rawa['d_dlc'] = np.where(data_rawa['d_dlc'].isnull(), 0, data_rawa['d_dlc']) -data_rawa['d_pstk'] = data_rawa['pstk'] - data_rawa['pstk'].shift(1) -data_rawa['d_pstk'] = np.where(data_rawa['d_pstk'].isnull(), 0, data_rawa['d_pstk']) - -data_rawa['dfnl'] = (data_rawa['dltt']-data_rawa['dltt'].shift(1)) + data_rawa['d_dlc'] + data_rawa['d_pstk'] - -data_rawa['d_ivst'] = data_rawa['ivst'] - 
data_rawa['ivst'].shift(1) -data_rawa['d_ivst'] = np.where(data_rawa['d_ivst'].isnull(), 0, data_rawa['d_ivst']) -data_rawa['d_ivao'] = data_rawa['ivao'] - data_rawa['ivao'].shift(1) -data_rawa['d_ivao'] = np.where(data_rawa['d_ivao'].isnull(), 0, data_rawa['d_ivao']) - -data_rawa['dfna'] = data_rawa['d_ivst'] + data_rawa['d_ivao'] -data_rawa['dfin'] = data_rawa['dfna'] - data_rawa['dfnl'] - -data_rawa['dfin'] = data_rawa['dfin'] / data_rawa['at'].shift(1) -data_rawa['dfnl'] = data_rawa['dfnl'] / data_rawa['at'].shift(1) - - - - -# dIi -data_rawa['e_invt'] = (data_rawa['capxv'] + data_rawa['capxv'].shift(1))/2 -data_rawa['dinvt'] = (data_rawa['capxv'] - data_rawa['e_invt']) / data_rawa['e_invt'] - -data_rawa['ind'] = data_rawa['capxv'] -s = data_rawa.groupby(['jdate', 'sic2'])['ind'].sum() -data_rawa = pd.merge(data_rawa, s, on=['jdate', 'sic2']) -# new industry investment will be named as ind_y, cause it's been grouped by ind -data_rawa['e_ind'] = (data_rawa['ind_y'] + data_rawa['ind_y'].shift(1))/2 -data_rawa['dind'] = (data_rawa['ind_y']-data_rawa['e_ind']) / data_rawa['e_ind'] -data_rawa['dIi'] = data_rawa['dinvt'] - data_rawa['dind'] - -# dLno -data_rawa['dlno'] = (data_rawa['ppent']-data_rawa['ppent'].shift(1)) + (data_rawa['intan']-data_rawa['intan'].shift(1)) + (data_rawa['ao']-data_rawa['ao'].shift(1)) - (data_rawa['lo']-data_rawa['lo'].shift(1)) + data_rawa['dp'] -avg_at = [] -for i in range(data_rawa.shape[0]): - avg_at.append(data_rawa.loc[0:i, 'at'].mean()) -data_rawa['avg_at'] = pd.DataFrame(avg_at) -data_rawa['dlno'] = data_rawa['dlno'] / data_rawa['avg_at'] - - -# dNco -data_rawa['nca'] = data_rawa['at'] - data_rawa['act'] - data_rawa['ivao'] -data_rawa['ncl'] = data_rawa['lt'] - data_rawa['lct'] - data_rawa['dltt'] -data_rawa['nco'] = data_rawa['nca'] - data_rawa['ncl'] -data_rawa['dnco'] = data_rawa['nco'] - data_rawa['nco'].shift(1) - - -# dNca -data_rawa['ivao_0'] = np.where(data_rawa['ivao'].isnull(), 0, data_rawa['ivao']) -data_rawa['dltt_0'] 
= np.where(data_rawa['dltt'].isnull(), 0, data_rawa['dltt']) - -data_rawa['nca'] = data_rawa['at'] - data_rawa['act'] - data_rawa['ivao_0'] -data_rawa['ncl'] = data_rawa['lt'] - data_rawa['lct'] - data_rawa['dltt_0'] -data_rawa['nco'] = data_rawa['nca'] - data_rawa['ncl'] -data_rawa['dnca'] = data_rawa['nco'] - data_rawa['nco'].shift(1) - - - -# dNoa -data_rawa['dlc_0'] = np.where(data_rawa['dlc'].isnull(), 0, data_rawa['dlc']) -data_rawa['mib_0'] = np.where(data_rawa['mib'].isnull(), 0, data_rawa['mib']) -data_rawa['pstk_0'] = np.where(data_rawa['pstk'].isnull(), 0, data_rawa['pstk']) - -data_rawa['op_at'] = data_rawa['at'] - data_rawa['che'] -data_rawa['op_lia'] = data_rawa['at'] - data_rawa['dlc_0'] - data_rawa['dltt_0'] - data_rawa['mib_0'] - data_rawa['pstk_0'] - data_rawa['ceq'] -data_rawa['net_op'] = data_rawa['op_at'] - data_rawa['op_lia'] -data_rawa['dnoa'] = (data_rawa['net_op']-data_rawa['net_op'].shift(1))/ data_rawa['at'].shift(1) - - -# dPia -data_rawa['c_propty'] = data_rawa['ppegt'] - data_rawa['ppegt'].shift(1) -data_rawa['c_invt'] = data_rawa['invt'] - data_rawa['invt'].shift(1) -data_rawa['dpia'] = (data_rawa['c_propty'] + data_rawa['c_invt']) / data_rawa['at'].shift(1) - - - - - -######### Profitability ########## -# Ato,repeated -#data_rawa['op_at'] = data_rawa['at'] - data_rawa['che'] - data_rawa['ivao_0'] -#data_rawa['op_lia'] = data_rawa['dlc_0'] - data_rawa['dltt_0'] - data_rawa['mib_0'] - data_rawa['pstk_0'] - data_rawa['ceq'] -#data_rawa['noa'] = data_rawa['op_at'] - data_rawa['op_lia'] -#data_rawa['ato'] = data_rawa['sale'] / data_rawa['noa'].shift(1) - - -# Cla -data_rawa['d_rect'] = data_rawa['rect'] - data_rawa['rect'].shift(1) -data_rawa['d_invt'] = data_rawa['invt'] - data_rawa['invt'].shift(1) -data_rawa['d_xpp'] = data_rawa['xpp'] - data_rawa['xpp'].shift(1) -data_rawa['d_dr'] = (data_rawa['drc']-data_rawa['drc'].shift(1)) + (data_rawa['drlt']-data_rawa['drlt'].shift(1)) -data_rawa['d_ap'] = data_rawa['ap'] - 
data_rawa['ap'].shift(1) -data_rawa['d_xacc'] = data_rawa['xacc'] - data_rawa['xacc'].shift(1) - -data_rawa['xrd_0'] = np.where(data_rawa['xrd'].isnull(), 0, data_rawa['xrd']) -data_rawa['d_rect_0'] = np.where(data_rawa['d_rect'].isnull(), 0, data_rawa['d_rect']) -data_rawa['d_invt_0'] = np.where(data_rawa['d_invt'].isnull(), 0, data_rawa['d_invt']) -data_rawa['d_xpp_0'] = np.where(data_rawa['d_xpp'].isnull(), 0, data_rawa['d_xpp']) -data_rawa['d_dr_0'] = np.where(data_rawa['d_dr'].isnull(), 0, data_rawa['d_dr']) -data_rawa['d_ap_0'] = np.where(data_rawa['d_ap'].isnull(), 0, data_rawa['d_ap']) -data_rawa['d_xacc_0'] = np.where(data_rawa['d_xacc'].isnull(), 0, data_rawa['d_xacc']) - -data_rawa['cla'] = data_rawa['revt'] - data_rawa['cogs'] - data_rawa['xsga'] + data_rawa['xrd_0']\ - - data_rawa['d_rect_0'] - data_rawa['d_invt_0'] - data_rawa['d_xpp_0']\ - + data_rawa['d_dr_0'] + data_rawa['d_ap_0'] + data_rawa['d_xacc_0'] -data_rawa['cla'] = data_rawa['cla'] / data_rawa['at'].shift(1) - - -# Cop -data_rawa['cop'] = data_rawa['revt'] - data_rawa['cogs'] - data_rawa['xsga'] + data_rawa['xrd_0']\ - - data_rawa['d_rect_0'] - data_rawa['d_invt_0'] - data_rawa['d_xpp_0']\ - + data_rawa['d_dr_0'] + data_rawa['d_ap_0'] + data_rawa['d_xacc_0'] -data_rawa['cop'] = data_rawa['cop'] / data_rawa['at'] - - -# Cto -data_rawa['cto'] = data_rawa['sale'] / data_rawa['at'].shift(1) - -#ir -''' -#First calculate r(t-5,t). 
Then rb(t-5,t) and use Bm to perform linear regression and get residue -''' -#r(t-5,t):sum ret from t-5 to t (which is calendar year t-6 to t-1) -lag = pd.DataFrame() -for i in range(1,6): - lag['ret%s' % i] = data_rawa.groupby(['permno'])['ret'].shift(i) - -data_rawa['ret5'] = lag['ret1']+lag['ret2']+lag['ret3']+lag['ret4']+lag['ret5'] - -#bm_t-5 (bm of year t-5) -#data_rawa['bm5'] = data_rawa.groupby(['permno'])['bm'].shift(5) - -#rB (five year log book return) -#Reference: jf_06 page8 by KENT DANIEL -#data_rawa['rB'] = data_rawa['bm'] - data_rawa['bm5'] + data_rawa['ret5'] - -#Regression and get ir -#First get unique datelist -#datelist = data_rawa['jdate'].unique() -#for date in datelist: -# temp = data_rawa['jdate' == date] -# n_row = temp.shape[0] -# index = temp.index -# X = pd.DataFrame() -# X['bm5'] = temp['bm5'] -# X['rB'] = temp['rB'] -# X['intercept'] = 1 -# X = X[['intercept','rB','bm5']] -# X = np.mat(X) -# Y = np.mat(temp[['ret5']]) - #These are residuals on one date -# res = (np.identity(n_row) - X.dot(X.T.dot(X).I).dot(X.T)).dot(Y) -# #put residuals back into data_rawa -# data_rawa.loc[index,'ir'] = res - -#nop -#data_rawa['net_p'] = data_rawa['dvc'] + data_rawa['prstkc'] + 2*data_rawa['pstkrv'] - data_rawa['sstk'] -#data_rawa['nop'] = data_rawa['net_p'] / data_rawa['me'] -#data_rawa['nop'] = np.where(data_rawa['nop']<=0, np.nan, data_rawa['nop'] ) - -#ocp -#data_rawa['ocy'] = np.where(data_rawa['jdate'] < '1988-06-30', data_rawa['fopt'] - data_rawa['wcap'], data_rawa['fopt'] - data_rawa['oancf']) -#data_rawa['ocp'] = data_rawa['ocy'] / data_rawa['me'] -#data_rawa['ocp'] = np.where(data_rawa['ocp']<=0, np.nan, data_rawa['ocp'] ) - -#dwc -data_rawa['dwc'] = (data_rawa['act'] - data_rawa['che']) - (data_rawa['lct'] - data_rawa['dlc']) -#data_rawa['dwc'] = data_rawa['dwc']/data_rawa['at_l1'] - -#I/A -data_rawa['ia'] = (data_rawa['at']/data_rawa['at_l1'])-1 - -#Ig -data_rawa['capx_l1'] = data_rawa.groupby('permno')['capx'].shift(1) -data_rawa['ig'] = 
data_rawa['capx']/data_rawa['capx_l1'] - -#2Ig -data_rawa['capx_l2'] = data_rawa.groupby('permno')['capx'].shift(2) -data_rawa['2ig'] = data_rawa['capx']/data_rawa['capx_l2'] - -#Ivc -data_rawa['atAvg'] = (data_rawa['at']+data_rawa['at_l1'])/2 -data_rawa['ivc'] = data_rawa['invt'] / data_rawa['atAvg'] - -#Ndf -data_rawa['ndf'] = data_rawa['dltis'] - data_rawa['dltr'] + data_rawa['dlcch'] - -#nsi -data_rawa['sps'] = data_rawa['csho'] * data_rawa['ajex'] -data_rawa['sps_l1'] = data_rawa.groupby('permno')['sps'].shift(1) -data_rawa['nsi'] = np.log(data_rawa['sps']/data_rawa['sps_l1']) - -#oa -data_rawa['txp'] = np.where(data_rawa['txp'].isnull(), 0, data_rawa['txp']) -data_rawa['oa'] = (data_rawa['act'] - data_rawa['che']) - (data_rawa['lct'] - data_rawa['dlc'] - data_rawa['txp']) - data_rawa['dp'] - -#Poa -data_rawa['poa'] = data_rawa['oa']/data_rawa['ni'] - -#Ta -data_rawa['ta'] = data_rawa['dwc'] + data_rawa['dnco'] + data_rawa['dfin'] - -#Ol -data_rawa['ol'] = (data_rawa['cogs'] + data_rawa['xsga'])/data_rawa['at'] - -#etr -data_rawa['txtpi'] = data_rawa['txt'] / data_rawa['pi'] -data_rawa['txtpi_l1'] = data_rawa.groupby('permno')['txtpi'].shift(1) -data_rawa['txtpi_l2'] = data_rawa.groupby('permno')['txtpi'].shift(2) -data_rawa['txtpi_l3'] = data_rawa.groupby('permno')['txtpi'].shift(3) -data_rawa['deps'] = data_rawa['epspx']/(data_rawa['ajex'] * data_rawa['prcc_f']) -data_rawa['etr'] = (data_rawa['txtpi'] - (data_rawa['txtpi_l1'] + data_rawa['txtpi_l2'] + data_rawa['txtpi_l3'])/3) * data_rawa['deps'] - -print('annual') ####################################################################################################################### # Compustat Quarterly Raw Info # ####################################################################################################################### @@ -952,7 +664,7 @@ def ttm12(series, df): data_rawq = data_rawq[data_rawq['temp'].notna()] data_rawq = data_rawq.sort_values(by=['permno', 'jdate']) -print('quarterly raw') + 
####################################################################################################################### # Quarterly Variables # ####################################################################################################################### @@ -1020,7 +732,7 @@ def ttm12(series, df): data_rawq['op'] = (ttm4('revtq', data_rawq)-ttm4('cogsq', data_rawq)-ttm4('xsgaq0', data_rawq)-ttm4('xintq0', data_rawq))/data_rawq['beq_l4'] -# csho +# chcsho data_rawq['chcsho'] = (data_rawq['cshoq']/data_rawq['cshoq_l4'])-1 # cashdebt @@ -1146,6 +858,8 @@ def ttm12(series, df): # data_rawq['scal'] = np.select(condlist, choicelist, default=data_rawq['seqq']) # ala +data_rawq['gdwlq'] = np.where(data_rawq['gdwlq'].isnull(), 0, data_rawq['gdwlq']) +data_rawq['intanq'] = np.where(data_rawq['intanq'].isnull(), 0, data_rawq['intanq']) data_rawq['ala'] = data_rawq['cheq'] + 0.75*(data_rawq['actq']-data_rawq['cheq'])+\ 0.5*(data_rawq['atq']-data_rawq['actq']-data_rawq['gdwlq']-data_rawq['intanq']) @@ -1278,25 +992,6 @@ def chars_std(start, end, df, chars): data_rawq = data_rawq.drop(['p_temp1', 'p_temp2', 'p_temp3', 'p_temp4', 'p_temp5', 'p_temp6', 'p_temp7', 'p_temp8', 'p_temp9'], axis=1) -################################## Added on 2020.10.29 ################################## -#Iaq -data_rawq['atqlag'] = ttm4('atq',data_rawq) -data_rawq['iaq'] = (data_rawq['atq']/data_rawq['atqlag'])-1 - -#Almq -data_rawq['intanq'] = np.where(data_rawq['intanq'].isnull(), 0, data_rawq['intanq']) -data_rawq['qal'] = data_rawq['cheq'] + 0.75*(data_rawq['actq']-data_rawq['cheq']) + 0.5*(data_rawq['atq'] - data_rawq['actq'] - data_rawq['intanq']) -data_rawq['mveqa'] = data_rawq['atq'] + data_rawq['mveq_f'] - data_rawq['ceqq'] -data_rawq['mveqa_1'] = data_rawq.groupby(['permno'])['mveqa'].shift(1) -data_rawq['almq'] = data_rawq['qal']/data_rawq['mveqa_1'] - -#Olq, needs atq -data_rawq['olq'] = (data_rawq['cogsq'] + data_rawq['xsgaq'])/data_rawq['atq'] - -# rds -data_rawq['rds'] = 
data_rawq['xrdq4']/data_rawq['saleq'] - -print('quarterly variables') ####################################################################################################################### # Momentum # ####################################################################################################################### @@ -1307,7 +1002,6 @@ def chars_std(start, end, df, chars): """) crsp_mom['permno'] = crsp_mom['permno'].astype(int) -crsp_mom['date'] = pd.to_datetime(crsp_mom['date']) crsp_mom['jdate'] = pd.to_datetime(crsp_mom['date']) + MonthEnd(0) crsp_mom = crsp_mom.dropna(subset=['ret', 'retx', 'prc']) @@ -1327,65 +1021,11 @@ def chars_std(start, end, df, chars): crsp_mom['ret'] = crsp_mom['ret'].fillna(0) crsp_mom['retadj'] = (1 + crsp_mom['ret']) * (1 + crsp_mom['dlret']) - 1 crsp_mom['me'] = crsp_mom['prc'].abs() * crsp_mom['shrout'] # calculate market equity -crsp_mom['retx'] = np.where(crsp_mom['me'].isnull(), 0, crsp_mom['retx']) -crsp_mom = crsp_mom.drop(['dlret', 'dlstdt'], axis=1)#delete prc,shrout - -#Seasonality - -#Rla -crsp_mom['rla'] = crsp_mom.groupby(['permno'])['ret'].shift(12) - -#Rln -lag = pd.DataFrame() -result = 0 -for i in range(1, 12): - lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) - result = result + lag['mom%s' % i] -crsp_mom['rln'] = result/11 - -#R[2,5]a -#R[2,5]n -lag = pd.DataFrame() -result = 0 -for i in range(13,61): - lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) - if i not in [24,36,48,60]: - result = result + lag['mom%s' % i] - -crsp_mom['r25a'] = (lag['mom24']+lag['mom36']+lag['mom48']+lag['mom60'])/4 -crsp_mom['r25n'] = result/44 - -#R[6,10]a -#R[6,10]n -lag = pd.DataFrame() -result = 0 -for i in range(61,121): - lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) - if i not in [72,84,96,108,120]: - result = result + lag['mom%s' % i] - -crsp_mom['r610a'] = (lag['mom72']+lag['mom84']+lag['mom96']+lag['mom108']+lag['mom120'])/5 -crsp_mom['r610n'] = result/55 - -#R[11,15]a 
-lag = pd.DataFrame() -result = 0 -for i in [132,144,156,168,180]: - lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) - result = result + lag['mom%s' % i] -crsp_mom['r1115a'] = result/5 - -#R[16,20]a -lag = pd.DataFrame() -result = 0 -for i in [192,204,216,228,240]: - lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) - result = result + lag['mom%s' % i] -crsp_mom['r1620a'] = result/5 def mom(start, end, df): """ + :param start: Order of starting lag :param end: Order of ending lag :param df: Dataframe @@ -1441,7 +1081,7 @@ def mom(start, end, df): # crsp_mom['moms12m'] = moms(1, 12, crsp_mom) # populate the chars to monthly -print('momentum') + # data_rawa data_rawa = data_rawa.drop(['date', 'ret', 'retx', 'me'], axis=1) data_rawa = pd.merge(crsp_mom, data_rawa, how='left', on=['permno', 'jdate']) @@ -1449,7 +1089,7 @@ def mom(start, end, df): data_rawa = data_rawa.groupby(['permno', 'datadate'], as_index=False).fillna(method='ffill') data_rawa = data_rawa[((data_rawa['exchcd'] == 1) | (data_rawa['exchcd'] == 2) | (data_rawa['exchcd'] == 3)) & ((data_rawa['shrcd'] == 10) | (data_rawa['shrcd'] == 11))] -print('data_rawa') + # data_rawq data_rawq = data_rawq.drop(['date', 'ret', 'retx', 'me'], axis=1) data_rawq = pd.merge(crsp_mom, data_rawq, how='left', on=['permno', 'jdate']) @@ -1457,7 +1097,7 @@ def mom(start, end, df): data_rawq = data_rawq.groupby(['permno', 'datadate'], as_index=False).fillna(method='ffill') data_rawq = data_rawq[((data_rawq['exchcd'] == 1) | (data_rawq['exchcd'] == 2) | (data_rawq['exchcd'] == 3)) & ((data_rawq['shrcd'] == 10) | (data_rawq['shrcd'] == 11))] -print('data_rawq') + ####################################################################################################################### # Monthly ME # ####################################################################################################################### @@ -1468,7 +1108,6 @@ def mom(start, end, df): # bm data_rawa['bm'] = data_rawa['be'] 
/ data_rawa['me'] -#data_rawa['bm_n'] = data_rawa['be'] # bm_ia df_temp = data_rawa.groupby(['datadate', 'ffi49'], as_index=False)['bm'].mean() @@ -1489,9 +1128,8 @@ def mom(start, end, df): np.nan] data_rawa['cfp'] = np.select(condlist, choicelist, default=(data_rawa['ib']+data_rawa['dp'])/data_rawa['me']) -# ep, checked from Hou and change 'ME' from compustat to crsp,checked +# ep data_rawa['ep'] = data_rawa['ib']/data_rawa['me'] -#data_rawa['ep_n'] = data_rawa['ib'] # rsup # data_rawa['sale_l1'] = data_rawa.groupby(['permno'])['sale'].shift(1) @@ -1500,81 +1138,22 @@ def mom(start, end, df): # lev data_rawa['lev'] = data_rawa['lt']/data_rawa['me'] -# sp, checked +# sp data_rawa['sp'] = data_rawa['sale']/data_rawa['me'] -#data_rawa['sp_n'] = data_rawa['sale'] # rdm data_rawa['rdm'] = data_rawa['xrd']/data_rawa['me'] -# adm hxz adm,checked +# adm hxz adm data_rawa['adm'] = data_rawa['xad']/data_rawa['me'] # dy data_rawa['dy'] = data_rawa['dvt']/data_rawa['me'] -# Cp -#data_rawa['cf'] = data_rawa['ib'] + data_rawa['dp'] -data_rawa['cp'] = data_rawa['cf'] / data_rawa['me'] - -# Ebp -#data_rawa['dvpa'] = np.where(data_rawa['dvpa'].isnull(), 0, data_rawa['dvpa']) -#data_rawa['tstkp'] = np.where(data_rawa['tstkp'].isnull(), 0, data_rawa['tstkp']) -#data_rawa['f_liab'] = data_rawa['dltt'] + data_rawa['dlc'] + data_rawa['pstk'] + data_rawa['dvpa'] - data_rawa['tstkp'] -#data_rawa['f_asse'] = data_rawa['che'] -# net debt : = financial liabilities - financial assets. 
-#data_rawa['n_debt'] = data_rawa['f_liab'] - data_rawa['f_asse'] -#data_rawa['be'] = data_rawa['ceq'] + data_rawa['tstkp'] - data_rawa['dvpa'] -data_rawa['ebp'] = (data_rawa['n_debt']+data_rawa['be']) / (data_rawa['n_debt']+data_rawa['me']) - -# Em -data_rawa['enteprs_v'] = data_rawa['me'] + data_rawa['dlc'] + data_rawa['dltt'] + data_rawa['pstkrv'] - data_rawa['che'] -data_rawa['em'] = data_rawa['enteprs_v'] / data_rawa['oibdp'] - -# Cei -data_rawa['lg_me'] = np.log(data_rawa['me']/data_rawa['me'].shift(6)) -data_rawa['lg_ret'] = np.log(data_rawa['ret']*data_rawa['ret'].shift(1)*data_rawa['ret'].shift(2)*data_rawa['ret'].shift(3)*data_rawa['ret'].shift(5)*data_rawa['ret'].shift(6)) -data_rawa['cei'] = data_rawa['lg_me'] - data_rawa['lg_ret'] - -#nop -data_rawa['net_p'] = data_rawa['dvc'] + data_rawa['prstkc'] + 2*data_rawa['pstkrv'] - data_rawa['sstk'] -data_rawa['nop'] = data_rawa['net_p'] / data_rawa['me'] -data_rawa['nop'] = np.where(data_rawa['nop']<=0, np.nan, data_rawa['nop'] ) - -#ocp -data_rawa['ocy'] = np.where(data_rawa['jdate'] < '1988-06-30', data_rawa['fopt'] - data_rawa['wcap'], data_rawa['fopt'] - data_rawa['oancf']) -data_rawa['ocp'] = data_rawa['ocy'] / data_rawa['me'] -data_rawa['ocp'] = np.where(data_rawa['ocp']<=0, np.nan, data_rawa['ocp'] ) - -#bm_t-5 (bm of year t-5) -data_rawa['bm5'] = data_rawa.groupby(['permno'])['bm'].shift(5) - -#rB (five year log book return) -#Reference: jf_06 page8 by KENT DANIEL -data_rawa['rB'] = data_rawa['bm'] - data_rawa['bm5'] + data_rawa['ret5'] - -#Regression and get ir -#First get unique datelist -datelist = data_rawa['jdate'].unique() -for date in datelist: - temp = data_rawa[data_rawa['jdate'] == date] - n_row = temp.shape[0] - index = temp.index - X = pd.DataFrame() - X['bm5'] = temp['bm5'] - X['rB'] = temp['rB'] - X['intercept'] = 1 - X = X[['intercept','rB','bm5']] - X = np.mat(X) - Y = np.mat(temp[['ret5']]) - #These are residuals on one date - res = (np.identity(n_row) - 
X.dot(X.T.dot(X).I).dot(X.T)).dot(Y) - #put residuals back into data_rawa - data_rawa.loc[index,'ir'] = res - # Annual Accounting Variables chars_a = data_rawa[['cusip', 'ncusip', 'gvkey', 'permno', 'exchcd', 'shrcd', 'datadate', 'jdate', - 'sic', 'retadj', 'acc', 'agr', 'bm', 'cfp', 'ep', 'ni', 'op', 'rsup', 'cash', 'chcsho', + 'sic', 'ret', 'retx', 'retadj', 'acc', 'agr', 'bm', 'cfp', 'ep', 'ni', 'op', + 'rsup', 'cash', 'chcsho', 'rd', 'cashdebt', 'pctacc', 'gma', 'lev', 'rdm', 'adm', 'sgr', 'sp', 'invest', 'roe', 'rd_sale', 'lgr', 'roa', 'depr', 'egr', 'chato', 'chtx', 'noa', 'rna', 'pm', 'ato', 'dy', 'roic', 'chinv', 'pchsale_pchinvt', 'pchsale_pchrect', 'pchgm_pchsale', 'pchsale_pchxsga', @@ -1582,13 +1161,9 @@ def mom(start, end, df): 'salecash', 'salerec', 'saleinv', 'pchsaleinv', 'realestate', 'obklg', 'chobklg', 'grltnoa', 'conv', 'chdrc', 'rdbias', 'operprof', 'capxint', 'xadint', 'chpm', 'ala', 'alm', 'mom1m', 'mom6m', 'mom12m', 'mom60m', 'mom36m', 'seas1a', 'me', 'hire', 'herf', 'bm_ia', - 'me_ia', 'bmj','cp', 'ebp', 'em', 'dp', 'aci', 'dpia', 'dBe', 'dfnl', 'dfin', 'dcoa', - 'dlno', 'dnoa', 'cla', 'cop', 'cto', 'dIi', 'dnco', 'dnca', 'ir', 'nop', 'ocp', - 'ia', 'ig','2ig','ivc','ndf','nsi','oa','poa','ta','ol','etr']] - + 'me_ia', 'turn', 'dolvol']] chars_a.reset_index(drop=True, inplace=True) -print(chars_a) -print('ME annual') + ######################################## # Quarterly # ######################################## @@ -1623,21 +1198,18 @@ def mom(start, end, df): data_rawq['sgrvol'] = chars_std(0, 15, data_rawq, 'rsup') # Quarterly Accounting Variables -chars_q = data_rawq[['gvkey', 'permno', 'datadate', 'jdate', 'sic', 'exchcd', 'shrcd','retadj' ,'acc', 'bm', 'cfp', +chars_q = data_rawq[['gvkey', 'permno', 'datadate', 'jdate', 'sic', 'exchcd', 'shrcd', + 'ret', 'retx', 'retadj', 'acc', 'bm', 'cfp', 'ep', 'agr', 'ni', 'op', 'cash', 'chcsho', 'rd', 'cashdebt', 'pctacc', 'gma', 'lev', 'rdm', 'sgr', 'sp', 'invest', 'rd_sale', 'lgr', 'roa', 
'depr', 'egr', 'roe', 'chato', 'chpm', 'chtx', 'noa', 'rna', 'pm', 'ato', 'stdcf', 'grltnoa', 'ala', 'alm', 'rsup', 'stdacc', 'sgrvol', 'roavol', 'scf', 'cinvest', 'mom1m', 'mom6m', 'mom12m', 'mom60m', 'mom36m', 'seas1a', 'me', 'pscore', 'nincr', - 'turn', 'dolvol', 'iaq', 'almq', 'olq', 'rds']] - + 'turn', 'dolvol']] chars_q.reset_index(drop=True, inplace=True) -print(chars_q) -print('ME quarterly') + with open('chars_a_60.pkl', 'wb') as f: pkl.dump(chars_a, f) -print('pkl a') + with open('chars_q_60.pkl', 'wb') as f: pkl.dump(chars_q, f) -print('pkl q') -print('Finished') \ No newline at end of file diff --git a/char60/beta.py b/char60/beta.py old mode 100644 new mode 100755 diff --git a/char60/bid_ask_spread.py b/char60/bid_ask_spread.py old mode 100644 new mode 100755 diff --git a/char60/functions.py b/char60/functions.py old mode 100644 new mode 100755 index f26d678..34cd370 --- a/char60/functions.py +++ b/char60/functions.py @@ -1,6 +1,7 @@ import pandas as pd import pickle as pkl import numpy as np +from tqdm import tqdm import re def ffi49(df): @@ -335,13 +336,13 @@ def fillna_atq(df_q, df_a): na_columns_list.append(i) # get annual columns from df_a df_temp = df_a[na_columns_list].copy() - df_temp[['permno', 'jdate']] = df_a[['permno', 'jdate']].copy() + df_temp[['permno', 'date']] = df_a[['permno', 'date']].copy() # rename annual columns in the form of 'chars_a' for na_column in na_columns_list: df_temp = df_temp.rename(columns={'%s' % na_column: '%s_a' % na_column}) df_temp = df_temp.reset_index(drop=True) # use annual chars to fill quarterly na - df_q = pd.merge(df_q, df_temp, how='left', on=['permno', 'jdate']) + df_q = pd.merge(df_q, df_temp, how='left', on=['permno', 'date']) for na_column in na_columns_list: df_q['%s' % na_column] = np.where(df_q['%s' % na_column].isnull(), df_q['%s_a' % na_column], df_q['%s' % na_column]) df_q = df_q.drop(['%s_a' % na_column], axis=1) @@ -353,9 +354,9 @@ def fillna_ind(df, method, ffi): na_columns_list = 
df.columns[df.isna().any()].tolist() for na_column in na_columns_list: if method == 'mean': - df_temp = df.groupby(['jdate', 'ffi%s' % ffi])['%s' % na_column].mean() + df_temp = df.groupby(['date', 'ffi%s' % ffi])['%s' % na_column].mean() elif method == 'median': - df_temp = df.groupby(['jdate', 'ffi%s' % ffi])['%s' % na_column].median() + df_temp = df.groupby(['date', 'ffi%s' % ffi])['%s' % na_column].median() else: None df_fill = pd.concat([df_fill, df_temp], axis=1) @@ -366,18 +367,18 @@ def fillna_ind(df, method, ffi): else: None df_fill = df_fill.reset_index() - # reset multiple index to jdate and ffi code + # reset multiple index to date and ffi code df_fill['index'] = df_fill['index'].astype(str) index_temp = df_fill['index'].str.split(',', expand=True) - index_temp.columns = ['jdate', 'ffi%s' % ffi] - index_temp['jdate'] = index_temp['jdate'].str.strip('(Timestamp(\' \')') + index_temp.columns = ['date', 'ffi%s' % ffi] + index_temp['date'] = index_temp['date'].str.strip('(Timestamp(\' \')') index_temp['ffi%s' % ffi] = index_temp['ffi%s' % ffi].str.strip(')') - df_fill[['jdate', 'ffi%s' % ffi]] = index_temp[['jdate', 'ffi%s' % ffi]] + df_fill[['date', 'ffi%s' % ffi]] = index_temp[['date', 'ffi%s' % ffi]] df_fill = df_fill.drop(['index'], axis=1) - df_fill['jdate'] = pd.to_datetime(df_fill['jdate']) + df_fill['date'] = pd.to_datetime(df_fill['date']) df_fill['ffi49'] = df_fill['ffi49'].astype(int) # fill na - df = pd.merge(df, df_fill, how='left', on=['jdate', 'ffi%s' % ffi]) + df = pd.merge(df, df_fill, how='left', on=['date', 'ffi%s' % ffi]) for na_column in na_columns_list: if method == 'mean': df['%s' % na_column] = df['%s' % na_column].fillna(df['%s_mean' % na_column]) @@ -395,9 +396,9 @@ def fillna_all(df, method): na_columns_list = df.columns[df.isna().any()].tolist() for na_column in na_columns_list: if method == 'mean': - df_temp = df.groupby(['jdate'])['%s' % na_column].mean() + df_temp = df.groupby(['date'])['%s' % na_column].mean() elif method == 
'median': - df_temp = df.groupby(['jdate'])['%s' % na_column].median() + df_temp = df.groupby(['date'])['%s' % na_column].median() else: None df_fill = pd.concat([df_fill, df_temp], axis=1) @@ -408,16 +409,16 @@ def fillna_all(df, method): else: None df_fill = df_fill.reset_index() - # reset multiple index to jdate and ffi code + # reset multiple index to date and ffi code df_fill['index'] = df_fill['index'].astype(str) index_temp = df_fill['index'].str.split(',', expand=True) - index_temp.columns = ['jdate'] - index_temp['jdate'] = index_temp['jdate'].str.strip('(Timestamp(\' \')') - df_fill[['jdate']] = index_temp[['jdate']] + index_temp.columns = ['date'] + index_temp['date'] = index_temp['date'].str.strip('(Timestamp(\' \')') + df_fill[['date']] = index_temp[['date']] df_fill = df_fill.drop(['index'], axis=1) - df_fill['jdate'] = pd.to_datetime(df_fill['jdate']) + df_fill['date'] = pd.to_datetime(df_fill['date']) # fill na - df = pd.merge(df, df_fill, how='left', on='jdate') + df = pd.merge(df, df_fill, how='left', on='date') for na_column in na_columns_list: if method == 'mean': df['%s' % na_column] = df['%s' % na_column].fillna(df['%s_mean' % na_column]) @@ -431,15 +432,21 @@ def fillna_all(df, method): def standardize(df): - df_temp = df.groupby(['jdate'], as_index=False)['gvkey'].count() - df_temp = df_temp.rename(columns={'gvkey': 'count'}) - df = pd.merge(df, df_temp, how='left', on='jdate') + # exclude the the information columns col_names = df.columns.values.tolist() - list_to_remove = ['permno', 'date', 'jdate', 'datadate', 'gvkey', 'sic', 'count', 'exchcd', 'shrcd'] + list_to_remove = ['permno', 'date', 'date', 'datadate', 'gvkey', 'sic', 'count', 'exchcd', 'shrcd', 'ffi49', 'ret', + 'retadj', 'retx', 'lag_me'] col_names = list(set(col_names).difference(set(list_to_remove))) - for col_name in col_names: - df['%s_rank' % col_name] = df.groupby(['jdate'])['%s' % col_name].rank() - df['rank_%s' % col_name] = (df['%s_rank' % col_name]-1)/(df['count']-1)*2 
- 1 - df = df.drop(['%s_rank' % col_name, '%s' % col_name], axis=1) + for col_name in tqdm(col_names): + print('processing %s' % col_name) + # count the non-missing number of factors, we only count non-missing values + unique_count = df.dropna(subset=['%s' % col_name]).groupby(['date'])['%s' % col_name].unique().apply(len) + unique_count = pd.DataFrame(unique_count).reset_index() + unique_count.columns = ['date', 'count'] + df = pd.merge(df, unique_count, how='left', on=['date']) + # ranking, and then standardize the data + df['%s_rank' % col_name] = df.groupby(['date'])['%s' % col_name].rank(method='dense') + df['rank_%s' % col_name] = (df['%s_rank' % col_name] - 1) / (df['count'] - 1) * 2 - 1 + df = df.drop(['%s_rank' % col_name, '%s' % col_name, 'count'], axis=1) df = df.fillna(0) - return df \ No newline at end of file + return df diff --git a/py-iclink/iclink.py b/char60/iclink.py old mode 100644 new mode 100755 similarity index 99% rename from py-iclink/iclink.py rename to char60/iclink.py index 311c76f..c630697 --- a/py-iclink/iclink.py +++ b/char60/iclink.py @@ -2,7 +2,6 @@ import numpy as np import datetime as dt import wrds -import psycopg2 from dateutil.relativedelta import * from pandas.tseries.offsets import * from pandasql import * diff --git a/char60/ill.py b/char60/ill.py old mode 100644 new mode 100755 diff --git a/char60/impute_rank_output_bchmk_60.py b/char60/impute_rank_output_bchmk_60.py old mode 100644 new mode 100755 index b5dbe11..dd7a242 --- a/char60/impute_rank_output_bchmk_60.py +++ b/char60/impute_rank_output_bchmk_60.py @@ -1,14 +1,21 @@ import pandas as pd import pickle as pkl import numpy as np -import wrds +from tqdm import tqdm from functions import * #################### # All Stocks # #################### +with open('chars_q_raw.pkl', 'rb') as f: + chars_q = pkl.load(f) -with open('chars_a.pkl', 'rb') as f: +chars_q = chars_q.dropna(subset=['permno']) +chars_q[['permno', 'gvkey']] = chars_q[['permno', 'gvkey']].astype(int) 
+chars_q['jdate'] = pd.to_datetime(chars_q['jdate']) +chars_q = chars_q.drop_duplicates(['permno', 'jdate']) + +with open('chars_a_raw.pkl', 'rb') as f: chars_a = pkl.load(f) chars_a = chars_a.dropna(subset=['permno']) @@ -16,65 +23,142 @@ chars_a['jdate'] = pd.to_datetime(chars_a['jdate']) chars_a = chars_a.drop_duplicates(['permno', 'jdate']) -with open('chars_q_raw.pkl', 'rb') as f: - chars_q = pkl.load(f) +# information list +obs_var_list = ['gvkey', 'permno', 'jdate', 'sic', 'ret', 'retx', 'retadj', 'exchcd', 'shrcd'] +# characteristics with quarterly and annual frequency at the same time +accounting_var_list = ['datadate', 'acc', 'bm', 'agr', 'alm', 'ato', 'cash', 'cashdebt', 'cfp', 'chcsho', 'chpm', + 'chtx', 'depr', 'ep', 'gma', 'grltnoa', 'lev', 'lgr', 'ni', 'noa', 'op', 'pctacc', 'pm', + 'rd_sale', 'rdm', 'rna', 'roa', 'roe', 'rsup', 'sgr', 'sp'] +a_var_list = ['a_'+i for i in accounting_var_list] +q_var_list = ['q_'+i for i in accounting_var_list] +# annual frequency only list +a_only_list = ['adm', 'bm_ia', 'herf', 'hire', 'me_ia'] +# quarterly frequency only list +q_only_list = ['abr', 'sue', 'cinvest', 'nincr', 'pscore', + # 'turn', 'dolvol' + ] +# monthly frequency only list +m_var_list = ['baspread', 'beta', 'ill', 'maxret', 'mom12m', 'mom1m', 'mom36m', 'mom60m', 'mom6m', 're', 'rvar_capm', + 'rvar_ff3', 'rvar_mean', 'seas1a', 'std_dolvol', 'std_turn', 'zerotrade', 'me', 'dy', + 'turn', 'dolvol' # need to rerun the accounting to put them in to char_a + ] + +df_a = chars_a[obs_var_list + accounting_var_list + a_only_list + m_var_list] +df_a.columns = obs_var_list + a_var_list + a_only_list + m_var_list +df_a = df_a.sort_values(obs_var_list) + +df_q = chars_q[obs_var_list + accounting_var_list + q_only_list] +df_q.columns = obs_var_list + q_var_list + q_only_list +# drop the same information columns for merging +df_q = df_q.drop(['sic', 'ret', 'retx', 'retadj', 'exchcd', 'shrcd'], axis=1) + +df = df_a.merge(df_q, how='left', on=['gvkey', 'jdate', 
'permno']) + +# first element in accounting_var_list is datadate +for i in tqdm(accounting_var_list[1:]): + print('processing %s' % i) + a = 'a_'+i + q = 'q_'+i + t1 = 'tmp1_'+i + t2 = 'tmp2_'+i + t3 = 'tmp3_'+i + t4 = 'tmp4_'+i + t5 = 'tmp5_'+i + + # tmp1: if the annual variable is available + df[t1] = np.where(df[a].isna(), False, True) + # tmp2: if the quarterly variable is available + df[t2] = np.where(df[q].isna(), False, True) + # tmp3: both + df[t3] = df[t1] & df[t2] + # tmp4: latest one + df[t4] = np.where(df['q_datadate'] < df['a_datadate'], df[a], df[q]) + # available one + df[t5] = np.where(df[t1], df[a], df[q]) + # final + df[i] = np.where(df[t3], df[t4], df[t5]) + df = df.drop([a, q, t1, t2, t3, t4, t5], axis=1) + +# drop the datadate of different frequency +df = df.drop(['a_datadate', 'q_datadate'], axis=1) + +# drop optional variables, you can adjust it by your selection +df = df.drop(['ret', 'retx'], axis=1) +df = df.rename(columns={'retadj': 'ret'}) # retadj is return adjusted by dividend +df['ret'] = df.groupby(['permno'])['ret'].shift(-1) # we shift return in t period to t+1 for prediction +df['date'] = df.groupby(['permno'])['jdate'].shift(-1) # date is return date, jdate is predictor date +df = df.drop(['jdate'], axis=1) # now we only keep the date of return +df = df.dropna(subset=['ret']).reset_index(drop=True) + +# save raw data +with open('chars60_raw_no_impute.pkl', 'wb') as f: + pkl.dump(df, f, protocol=4) + +# impute missing values, you can choose different func form functions.py, such as ffi49/ffi10 +df_impute = df.copy() +df_impute['sic'] = df_impute['sic'].astype(int) +df_impute['date'] = pd.to_datetime(df_impute['date']) + +df_impute['ffi49'] = ffi49(df_impute) +df_impute['ffi49'] = df_impute['ffi49'].fillna(49) # we treat na in ffi49 as 'other' +df_impute['ffi49'] = df_impute['ffi49'].astype(int) -# use annual variables to fill na of quarterly variables -chars_q = fillna_atq(df_q=chars_q, df_a=chars_a) +# there are two ways to 
impute: industrial median or mean +df_impute = fillna_ind(df_impute, method='median', ffi=49) -# adm is annual variable -adm = chars_a[['permno', 'jdate', 'adm']] -chars_q = pd.merge(chars_q, adm, how='left', on=['permno', 'jdate']) +df_impute = fillna_all(df_impute, method='median') +df_impute['re'] = df_impute['re'].fillna(0) # re use IBES database, there are lots of missing data -# impute missing values, you can choose different func form functions, such as ffi49/ffi10 -chars_q_impute = chars_q.copy() -chars_q_impute['sic'] = chars_q_impute['sic'].astype(int) -chars_q_impute['jdate'] = pd.to_datetime(chars_q_impute['jdate']) +df_impute['year'] = df_impute['date'].dt.year +df_impute = df_impute[df_impute['year'] >= 1972] +df_impute = df_impute.drop(['year'], axis=1) -chars_q_impute['ffi49'] = ffi49(chars_q_impute) -chars_q_impute['ffi49'] = chars_q_impute['ffi49'].fillna(49) # we treat na in ffi49 as 'other' -chars_q_impute['ffi49'] = chars_q_impute['ffi49'].astype(int) +with open('chars60_raw_imputed.pkl', 'wb') as f: + pkl.dump(df_impute, f, protocol=4) -# there are two ways to impute: industrial median or mean -chars_q_impute = fillna_ind(chars_q_impute, method='median', ffi=49) -# we use all stocks' mean or median to fill na that are not filled by value of ffi -chars_q_impute = fillna_all(chars_q_impute, method='median') -chars_q_impute['re'] = chars_q_impute['re'].fillna(0) # re use IBES database, there are lots of missing data +# standardize raw data +df_rank = df.copy() +df_rank['lag_me'] = df_rank['me'] +df_rank = standardize(df_rank) +df_rank['year'] = df_rank['date'].dt.year +df_rank = df_rank[df_rank['year'] >= 1972] +df_rank = df_rank.drop(['year'], axis=1) +df_rank['log_me'] = np.log(df_rank['lag_me']) -chars_q_impute['year'] = chars_q_impute['jdate'].dt.year -chars_q_impute = chars_q_impute[chars_q_impute['year'] >= 1972] -chars_q_impute = chars_q_impute.drop(['year'], axis=1) +with open('chars60_rank_no_impute.pkl', 'wb') as f: + pkl.dump(df_rank, 
f, protocol=4) -with open('chars_impute.pkl', 'wb') as f: - pkl.dump(chars_q_impute, f, protocol=4) +# standardize imputed data +df_rank = df_impute.copy() +df_rank['lag_me'] = df_rank['me'] +df_rank = standardize(df_rank) +df_rank['year'] = df_rank['date'].dt.year +df_rank = df_rank[df_rank['year'] >= 1972] +df_rank = df_rank.drop(['year'], axis=1) +df_rank['log_me'] = np.log(df_rank['lag_me']) -# standardize characteristics -chars_q_rank = standardize(chars_q) -chars_q_rank['year'] = chars_q_rank['jdate'].dt.year -chars_q_rank = chars_q_rank[chars_q_rank['year'] >= 1972] -chars_q_rank = chars_q_rank.drop(['year'], axis=1) +with open('chars60_rank_imputed.pkl', 'wb') as f: + pkl.dump(df_rank, f, protocol=4) -with open('chars_rank.pkl', 'wb') as f: - pkl.dump(chars_q_rank, f, protocol=4) #################### # SP1500 # #################### -with open('sp1500_impute_benchmark.pkl', 'rb') as f: +with open('/home/jianxinma/chars/data/sp1500_impute_benchmark.pkl', 'rb') as f: sp1500_index = pkl.load(f) -sp1500_index = sp1500_index[['gvkey', 'jdate']] +sp1500_index = sp1500_index[['gvkey', 'date']] -sp1500_impute = pd.merge(sp1500_index, chars_q_impute, how='left', on=['gvkey', 'jdate']) +sp1500_impute = pd.merge(sp1500_index, df_impute, how='left', on=['gvkey', 'date']) # for test # test = sp1500_rank.groupby(['jdate'])['gvkey'].nunique() -with open('sp1500_impute.pkl', 'wb') as f: +with open('sp1500_impute_60.pkl', 'wb') as f: pkl.dump(sp1500_impute, f, protocol=4) # standardize characteristics -sp1500_rank = pd.merge(sp1500_index, chars_q_rank, how='left', on=['gvkey', 'jdate']) +sp1500_rank = pd.merge(sp1500_index, df_rank, how='left', on=['gvkey', 'date']) -with open('sp1500_rank.pkl', 'wb') as f: - pkl.dump(sp1500_rank, f, protocol=4) \ No newline at end of file +with open('sp1500_rank_60.pkl', 'wb') as f: + pkl.dump(sp1500_rank, f, protocol=4) diff --git a/char60/maxret_d.py b/char60/maxret_d.py old mode 100644 new mode 100755 diff --git 
a/char60/merge_chars_60.py b/char60/merge_chars_60.py old mode 100644 new mode 100755 index b596b70..763e9e6 --- a/char60/merge_chars_60.py +++ b/char60/merge_chars_60.py @@ -1,7 +1,156 @@ +# Since some firms only have annual recording before 80s, we need to use annual data as merging benchmark in case +# there are some recordings are missing + import pandas as pd import pickle as pkl from pandas.tseries.offsets import * +with open('chars_a_60.pkl', 'rb') as f: + chars_a = pkl.load(f) + +chars_a = chars_a.dropna(subset=['permno']) +chars_a[['permno', 'gvkey']] = chars_a[['permno', 'gvkey']].astype(int) +chars_a['jdate'] = pd.to_datetime(chars_a['jdate']) +chars_a = chars_a.drop_duplicates(['permno', 'jdate']) + +with open('/home/jianxinma/chars/data/beta.pkl', 'rb') as f: + beta = pkl.load(f) + +beta['permno'] = beta['permno'].astype(int) +beta['jdate'] = pd.to_datetime(beta['date']) + MonthEnd(0) +beta = beta[['permno', 'jdate', 'beta']] +beta = beta.drop_duplicates(['permno', 'jdate']) + +chars_a = pd.merge(chars_a, beta, how='left', on=['permno', 'jdate']) + +with open('/home/jianxinma/chars/data/rvar_capm.pkl', 'rb') as f: + rvar_capm = pkl.load(f) + +rvar_capm['permno'] = rvar_capm['permno'].astype(int) +rvar_capm['jdate'] = pd.to_datetime(rvar_capm['date']) + MonthEnd(0) +rvar_capm = rvar_capm[['permno', 'jdate', 'rvar_capm']] +rvar_capm = rvar_capm.drop_duplicates(['permno', 'jdate']) + +chars_a = pd.merge(chars_a, rvar_capm, how='left', on=['permno', 'jdate']) + +with open('/home/jianxinma/chars/data/rvar_mean.pkl', 'rb') as f: + rvar_mean = pkl.load(f) + +rvar_mean['permno'] = rvar_mean['permno'].astype(int) +rvar_mean['jdate'] = pd.to_datetime(rvar_mean['date']) + MonthEnd(0) +rvar_mean = rvar_mean[['permno', 'jdate', 'rvar_mean']] +rvar_mean = rvar_mean.drop_duplicates(['permno', 'jdate']) + +chars_a = pd.merge(chars_a, rvar_mean, how='left', on=['permno', 'jdate']) + +with open('/home/jianxinma/chars/data/rvar_ff3.pkl', 'rb') as f: + rvar_ff3 = 
pkl.load(f) + +rvar_ff3['permno'] = rvar_ff3['permno'].astype(int) +rvar_ff3['jdate'] = pd.to_datetime(rvar_ff3['date']) + MonthEnd(0) +rvar_ff3 = rvar_ff3[['permno', 'jdate', 'rvar_ff3']] +rvar_ff3 = rvar_ff3.drop_duplicates(['permno', 'jdate']) + +chars_a = pd.merge(chars_a, rvar_ff3, how='left', on=['permno', 'jdate']) + +with open('/home/jianxinma/chars/data/sue.pkl', 'rb') as f: + sue = pkl.load(f) + +sue['permno'] = sue['permno'].astype(int) +sue['jdate'] = pd.to_datetime(sue['date']) + MonthEnd(0) +sue = sue[['permno', 'jdate', 'sue']] +sue = sue.drop_duplicates(['permno', 'jdate']) + +chars_a = pd.merge(chars_a, sue, how='left', on=['permno', 'jdate']) + +with open('/home/jianxinma/chars/data/re.pkl', 'rb') as f: + re = pkl.load(f) + +re['permno'] = re['permno'].astype(int) +re['jdate'] = pd.to_datetime(re['date']) + MonthEnd(0) +re = re[['permno', 'jdate', 're']] +re = re.drop_duplicates(['permno', 'jdate']) + +chars_a = pd.merge(chars_a, re, how='left', on=['permno', 'jdate']) + +with open('/home/jianxinma/chars/data/abr.pkl', 'rb') as f: + abr = pkl.load(f) + +abr['permno'] = abr['permno'].astype(int) +abr['jdate'] = pd.to_datetime(abr['date']) + MonthEnd(0) +abr = abr[['permno', 'jdate', 'abr']] +abr = abr.drop_duplicates(['permno', 'jdate']) + +chars_a = pd.merge(chars_a, abr, how='left', on=['permno', 'jdate']) + +with open('baspread.pkl', 'rb') as f: + baspread = pkl.load(f) + +baspread['permno'] = baspread['permno'].astype(int) +baspread['jdate'] = pd.to_datetime(baspread['date']) + MonthEnd(0) +baspread = baspread[['permno', 'jdate', 'baspread']] +baspread = baspread.drop_duplicates(['permno', 'jdate']) + +chars_a = pd.merge(chars_a, baspread, how='left', on=['permno', 'jdate']) + +with open('maxret.pkl', 'rb') as f: + maxret = pkl.load(f) + +maxret['permno'] = maxret['permno'].astype(int) +maxret['jdate'] = pd.to_datetime(maxret['date']) + MonthEnd(0) +maxret = maxret[['permno', 'jdate', 'maxret']] +maxret = maxret.drop_duplicates(['permno', 
'jdate']) + +chars_a = pd.merge(chars_a, maxret, how='left', on=['permno', 'jdate']) + +with open('std_dolvol.pkl', 'rb') as f: + std_dolvol = pkl.load(f) + +std_dolvol['permno'] = std_dolvol['permno'].astype(int) +std_dolvol['jdate'] = pd.to_datetime(std_dolvol['date']) + MonthEnd(0) +std_dolvol = std_dolvol[['permno', 'jdate', 'std_dolvol']] +std_dolvol = std_dolvol.drop_duplicates(['permno', 'jdate']) + +chars_a = pd.merge(chars_a, std_dolvol, how='left', on=['permno', 'jdate']) + +with open('ill.pkl', 'rb') as f: + ill = pkl.load(f) + +ill['permno'] = ill['permno'].astype(int) +ill['jdate'] = pd.to_datetime(ill['date']) + MonthEnd(0) +ill = ill[['permno', 'jdate', 'ill']] +ill = ill.drop_duplicates(['permno', 'jdate']) + +chars_a = pd.merge(chars_a, ill, how='left', on=['permno', 'jdate']) + +with open('std_turn.pkl', 'rb') as f: + std_turn = pkl.load(f) + +std_turn['permno'] = std_turn['permno'].astype(int) +std_turn['jdate'] = pd.to_datetime(std_turn['date']) + MonthEnd(0) +std_turn = std_turn[['permno', 'jdate', 'std_turn']] +std_turn = std_turn.drop_duplicates(['permno', 'jdate']) + +chars_a = pd.merge(chars_a, std_turn, how='left', on=['permno', 'jdate']) + +with open('zerotrade.pkl', 'rb') as f: + zerotrade = pkl.load(f) + +zerotrade['permno'] = zerotrade['permno'].astype(int) +zerotrade['jdate'] = pd.to_datetime(zerotrade['date']) + MonthEnd(0) +zerotrade = zerotrade[['permno', 'jdate', 'zerotrade']] +zerotrade = zerotrade.drop_duplicates(['permno', 'jdate']) + +chars_a = pd.merge(chars_a, zerotrade, how='left', on=['permno', 'jdate']) + +# save data +with open('chars_a_raw.pkl', 'wb') as f: + pkl.dump(chars_a, f, protocol=4) + +######################################################################################################################## +# In order to keep the naming tidy, we need to make another chars_q_raw, which is just a temporary dataframe # 
+######################################################################################################################## + with open('chars_q_60.pkl', 'rb') as f: chars_q = pkl.load(f) diff --git a/char60/pkl_to_csv.py b/char60/pkl_to_csv.py new file mode 100755 index 0000000..74cefea --- /dev/null +++ b/char60/pkl_to_csv.py @@ -0,0 +1,29 @@ +import pickle as pkl +import pandas as pd + +with open('/Users/eric/Downloads/chars_rank_60.pkl', 'rb') as f: + chars = pkl.load(f) + +print(chars.columns.values) + +chars['jdate'] = pd.to_datetime(chars['jdate']) +chars['year'] = chars['jdate'].dt.year +chars_1970s = chars[chars['year'] < 1980] +chars_1980s = chars[(chars['year'] >= 1980) & (chars['year'] < 1990)] +chars_1990s = chars[(chars['year'] >= 1990) & (chars['year'] < 2000)] +chars_2000s = chars[(chars['year'] >= 1990) & (chars['year'] < 2010)] +chars_2010s = chars[(chars['year'] >= 2000) & (chars['year'] < 2020)] + +# raw +# chars_1970s.to_csv('chars60_raw_1970s.csv', index=0) +# chars_1980s.to_csv('chars60_raw_1980s.csv', index=0) +# chars_1990s.to_csv('chars60_raw_1990s.csv', index=0) +# chars_2000s.to_csv('chars60_raw_2000s.csv', index=0) +# chars_2010s.to_csv('chars60_raw_2010s.csv', index=0) + +# rank +chars_1970s.to_csv('chars60_rank_1970s.csv', index=0) +chars_1980s.to_csv('chars60_rank_1980s.csv', index=0) +chars_1990s.to_csv('chars60_rank_1990s.csv', index=0) +chars_2000s.to_csv('chars60_rank_2000s.csv', index=0) +chars_2010s.to_csv('chars60_rank_2010s.csv', index=0) \ No newline at end of file diff --git a/char60/re.py b/char60/re.py new file mode 100755 index 0000000..7dab02f --- /dev/null +++ b/char60/re.py @@ -0,0 +1,120 @@ +# Calculate HSZ Replicating Anomalies +# RE: Revisions in analysts’ earnings forecasts + +import pandas as pd +import numpy as np +import datetime as dt +import wrds +from dateutil.relativedelta import * +from pandas.tseries.offsets import * +from pandasql import * +import pickle as pkl + +################### +# Connect to WRDS # 
+################### +conn = wrds.Connection() + +######################################################################### +# Merging IBES and CRSP by using ICLINK table. Merging last month price # +######################################################################### + +with open('iclink.pkl', 'rb')as f: + iclink = pkl.load(f) + +ibes = conn.raw_sql(""" + select + ticker, statpers, meanest, fpedats, anndats_act, curr_act, fpi, medest + from ibes.statsum_epsus + where + /* filtering IBES */ + statpers=0 + and CURCODE='USD' + and fpi in ('1','2')""") + +# filtering IBES +ibes = ibes[(ibes['medest'].notna()) & (ibes['fpedats'].notna())] +ibes = ibes[(ibes['curr_act']=='USD') | (ibes['curr_act'].isnull())] +ibes['statpers'] = pd.to_datetime(ibes['statpers']) +ibes['merge_date'] = ibes['statpers']+MonthEnd(0) + +crsp_msf = conn.raw_sql(""" + select permno, date, prc, cfacpr + from crsp.msf + """) + +crsp_msf['date'] = pd.to_datetime(crsp_msf['date']) +crsp_msf['date'] = crsp_msf['date']+MonthEnd(0) +crsp_msf['merge_date'] = crsp_msf['date']+MonthEnd(1) + +ibes_iclink = pd.merge(ibes, iclink, how='left', on='ticker') +ibes_crsp = pd.merge(ibes_iclink, crsp_msf, how='inner', on=['permno', 'merge_date']) +ibes_crsp.sort_values(by=['ticker', 'fpedats', 'statpers'], inplace=True) +ibes_crsp.reset_index(inplace=True, drop=True) + +############################### +# Merging last month forecast # +############################### +ibes_crsp['statpers_last_month'] = np.where((ibes_crsp['ticker'] == ibes_crsp['ticker'].shift(1)) & + (ibes_crsp['permno'] == ibes_crsp['permno'].shift(1)) & + (ibes_crsp['fpedats'] == ibes_crsp['fpedats'].shift(1)), + ibes_crsp['statpers'].shift(1).astype(str), np.nan) + +ibes_crsp['meanest_last_month'] = np.where((ibes_crsp['ticker'] == ibes_crsp['ticker'].shift(1)) & + (ibes_crsp['permno'] == ibes_crsp['permno'].shift(1)) & + (ibes_crsp['fpedats'] == ibes_crsp['fpedats'].shift(1)), + ibes_crsp['meanest'].shift(1), np.nan) + 
+ibes_crsp.sort_values(by=['ticker', 'permno', 'fpedats', 'statpers'], inplace=True) +ibes_crsp.reset_index(inplace=True, drop=True) + +########################### +# Drop empty "last month" # +# Calculate HXZ RE # +########################### + +ibes_crsp = ibes_crsp[ibes_crsp['statpers_last_month'].notna()] +ibes_crsp['prc_adj'] = ibes_crsp['prc']/ibes_crsp['cfacpr'] +ibes_crsp = ibes_crsp[ibes_crsp['prc_adj']>0] +ibes_crsp['monthly_revision'] = (ibes_crsp['meanest'] - ibes_crsp['meanest_last_month'])/ibes_crsp['prc_adj'] + +ibes_crsp['permno'] = ibes_crsp['permno'].astype(int) +ibes_crsp['permno'] = ibes_crsp['permno'].astype(str) +ibes_crsp['fpedats'] = ibes_crsp['fpedats'].astype(str) +ibes_crsp['permno_fpedats'] = ibes_crsp['permno'].str.cat(ibes_crsp['fpedats'], sep='-') + +ibes_crsp = ibes_crsp.drop_duplicates(['permno_fpedats', 'statpers']) +ibes_crsp['count'] = ibes_crsp.groupby('permno_fpedats').cumcount() + 1 + +######################## +# Calculate RE (CJL) # +######################## + +ibes_crsp['monthly_revision_l1'] = ibes_crsp.groupby(['permno'])['monthly_revision'].shift(1) +ibes_crsp['monthly_revision_l2'] = ibes_crsp.groupby(['permno'])['monthly_revision'].shift(2) +ibes_crsp['monthly_revision_l3'] = ibes_crsp.groupby(['permno'])['monthly_revision'].shift(3) +ibes_crsp['monthly_revision_l4'] = ibes_crsp.groupby(['permno'])['monthly_revision'].shift(4) +ibes_crsp['monthly_revision_l5'] = ibes_crsp.groupby(['permno'])['monthly_revision'].shift(5) +ibes_crsp['monthly_revision_l6'] = ibes_crsp.groupby(['permno'])['monthly_revision'].shift(6) + +condlist = [ibes_crsp['count']==4, + ibes_crsp['count']==5, + ibes_crsp['count']==6, + ibes_crsp['count']>=7] +choicelist = [(ibes_crsp['monthly_revision_l1'] + ibes_crsp['monthly_revision_l2'] + ibes_crsp['monthly_revision_l3'])/3, + (ibes_crsp['monthly_revision_l1'] + ibes_crsp['monthly_revision_l2'] + ibes_crsp['monthly_revision_l3'] + ibes_crsp['monthly_revision_l4'])/4, + 
(ibes_crsp['monthly_revision_l1'] + ibes_crsp['monthly_revision_l2'] + ibes_crsp['monthly_revision_l3'] + ibes_crsp['monthly_revision_l4'] + ibes_crsp['monthly_revision_l5'])/5, + (ibes_crsp['monthly_revision_l1'] + ibes_crsp['monthly_revision_l2'] + ibes_crsp['monthly_revision_l3'] + ibes_crsp['monthly_revision_l4'] + ibes_crsp['monthly_revision_l5'] + ibes_crsp['monthly_revision_l6'])/6] +ibes_crsp['re'] = np.select(condlist, choicelist, default=np.nan) + +ibes_crsp = ibes_crsp[ibes_crsp['count']>=4] +ibes_crsp = ibes_crsp.sort_values(by=['ticker', 'statpers', 'fpedats']) +ibes_crsp = ibes_crsp.drop_duplicates(['ticker', 'statpers']) + +ibes_crsp = ibes_crsp[['ticker', 'statpers', 'fpedats', 'anndats_act', 'curr_act', 'permno', 're']] +ibes_crsp.rename(columns={'statpers': 'date'}, inplace=True) + +with open('re.pkl', 'wb') as f: + pkl.dump(ibes_crsp, f) \ No newline at end of file diff --git a/char60/rvar_capm.py b/char60/rvar_capm.py new file mode 100755 index 0000000..fa3a01c --- /dev/null +++ b/char60/rvar_capm.py @@ -0,0 +1,168 @@ +# CAPM residual variance +# Note: Please use the latest version of pandas, this version should support returning to pd.Series after rolling +# To get a faster speed, we split the big dataframe into small ones +# Then using different process to calculate the variance +# We use 20 process to calculate variance, you can change the number of process according to your CPU situation +# You can use the following code to check your CPU situation +# import multiprocessing +# multiprocessing.cpu_count() + +import pandas as pd +import numpy as np +import datetime as dt +import wrds +from dateutil.relativedelta import * +from pandas.tseries.offsets import * +import datetime +import pickle as pkl +import multiprocessing as mp + +################### +# Connect to WRDS # +################### +conn = wrds.Connection() + +# CRSP Block +crsp = conn.raw_sql(""" + select a.permno, a.date, a.ret, (a.ret - b.rf) as exret, b.mktrf + from crsp.dsf as a + 
left join ff.factors_daily as b + on a.date=b.date + where a.date >= '01/01/1959' + """) + +# sort variables by permno and date +crsp = crsp.sort_values(by=['permno', 'date']) + +# change variable format to int +crsp['permno'] = crsp['permno'].astype(int) + +# Line up date to be end of month +crsp['date'] = pd.to_datetime(crsp['date']) + +# find the closest trading day to the end of the month +crsp['monthend'] = crsp['date'] + MonthEnd(0) +crsp['date_diff'] = crsp['monthend'] - crsp['date'] +date_temp = crsp.groupby(['permno', 'monthend'])['date_diff'].min() +date_temp = pd.DataFrame(date_temp) # convert Series to DataFrame +date_temp.reset_index(inplace=True) +date_temp.rename(columns={'date_diff': 'min_diff'}, inplace=True) +crsp = pd.merge(crsp, date_temp, how='left', on=['permno', 'monthend']) +crsp['sig'] = np.where(crsp['date_diff'] == crsp['min_diff'], 1, np.nan) + +# label every date of month end +crsp['month_count'] = crsp[crsp['sig'] == 1].groupby(['permno']).cumcount() + +# label numbers of months for a firm +month_num = crsp[crsp['sig'] == 1].groupby(['permno'])['month_count'].tail(1) +month_num = month_num.astype(int) +month_num = month_num.reset_index(drop=True) + +# mark the number of each month to each day of this month +crsp['month_count'] = crsp.groupby(['permno'])['month_count'].fillna(method='bfill') + +# crate a firm list +df_firm = crsp.drop_duplicates(['permno']) +df_firm = df_firm[['permno']] +df_firm['permno'] = df_firm['permno'].astype(int) +df_firm = df_firm.reset_index(drop=True) +df_firm = df_firm.reset_index() +df_firm = df_firm.rename(columns={'index': 'count'}) +df_firm['month_num'] = month_num + +###################### +# Calculate residual # +###################### + + +def get_res_var(df, firm_list): + """ + + :param df: stock dataframe + :param firm_list: list of firms matching stock dataframe + :return: dataframe with variance of residual + """ + for firm, count, prog in zip(firm_list['permno'], firm_list['month_num'], 
range(firm_list['permno'].count()+1)): + prog = prog + 1 + print('processing permno %s' % firm, '/', 'finished', '%.2f%%' % ((prog/firm_list['permno'].count())*100)) + for i in range(count + 1): + # if you want to change the rolling window, please change here: i - 2 means 3 months is a window. + temp = df[(df['permno'] == firm) & (i - 2 <= df['month_count']) & (df['month_count'] <= i)] + # if observations in last 3 months are less 21, we drop the rvar of this month + if temp['permno'].count() < 21: + pass + else: + rolling_window = temp['permno'].count() + index = temp.tail(1).index + X = pd.DataFrame() + X[['mktrf']] = temp[['mktrf']] + X['intercept'] = 1 + X = X[['intercept', 'mktrf']] + X = np.mat(X) + Y = np.mat(temp[['exret']]) + res = (np.identity(rolling_window) - X.dot(X.T.dot(X).I).dot(X.T)).dot(Y) + res_var = res.var(ddof=1) + df.loc[index, 'rvar'] = res_var + return df + + +def sub_df(start, end, step): + """ + + :param start: the quantile to start cutting, usually it should be 0 + :param end: the quantile to end cutting, usually it should be 1 + :param step: quantile step + :return: a dictionary including all the 'firm_list' dataframe and 'stock data' dataframe + """ + # we use dict to store different sub dataframe + temp = {} + for i, h in zip(np.arange(start, end, step), range(int((end-start)/step))): + print('processing splitting dataframe:', round(i, 2), 'to', round(i + step, 2)) + if i == 0: # to get the left point + temp['firm' + str(h)] = df_firm[df_firm['count'] <= df_firm['count'].quantile(i + step)] + temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', + on='permno').dropna(subset=['count']) + else: + temp['firm' + str(h)] = df_firm[(df_firm['count'].quantile(i) < df_firm['count']) & ( + df_firm['count'] <= df_firm['count'].quantile(i + step))] + temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', + on='permno').dropna(subset=['count']) + return temp + + +def main(start, end, step): + """ + + :param 
start: the quantile to start cutting, usually it should be 0 + :param end: the quantile to end cutting, usually it should be 1 + :param step: quantile step + :return: a dataframe with calculated variance of residual + """ + df = sub_df(start, end, step) + pool = mp.Pool() + p_dict = {} + for i in range(int((end-start)/step)): + p_dict['p' + str(i)] = pool.apply_async(get_res_var, (df['crsp%s' % i], df['firm%s' % i],)) + pool.close() + pool.join() + result = pd.DataFrame() + print('processing pd.concat') + for h in range(int((end-start)/step)): + result = pd.concat([result, p_dict['p%s' % h].get()]) + return result + + +# calculate variance of residual through rolling window +# Note: please split dataframe according to your CPU situation. For example, we split dataframe to (1-0)/0.05 = 20 sub +# dataframes here, so the function will use 20 cores to calculate variance of residual. +if __name__ == '__main__': + crsp = main(0, 1, 0.05) + +# process dataframe +crsp = crsp.dropna(subset=['rvar']) # drop NA due to rolling +crsp = crsp.rename(columns={'rvar': 'rvar_capm'}) +crsp = crsp.reset_index(drop=True) +crsp = crsp[['permno', 'date', 'rvar_capm']] + +with open('rvar_capm.pkl', 'wb') as f: + pkl.dump(crsp, f) \ No newline at end of file diff --git a/char60/rvar_ff3.py b/char60/rvar_ff3.py new file mode 100755 index 0000000..36561a0 --- /dev/null +++ b/char60/rvar_ff3.py @@ -0,0 +1,201 @@ +# Fama & French 3 factors residual variance +# Note: Please use the latest version of pandas, this version should support returning to pd.Series after rolling +# To get a faster speed, we split the big dataframe into small ones +# Then using different process to calculate the variance +# We use 20 process to calculate variance, you can change the number of process according to your CPU situation +# You can use the following code to check your CPU situation +# import multiprocessing +# multiprocessing.cpu_count() + +import pandas as pd +import numpy as np +import datetime as dt 
import wrds
from dateutil.relativedelta import *
from pandas.tseries.offsets import *
import datetime
import pickle as pkl
import multiprocessing as mp

###################
# Connect to WRDS #
###################
conn = wrds.Connection()

# CRSP Block: daily stock file joined with the FF daily factors
crsp = conn.raw_sql("""
    select a.permno, a.date, a.ret, (a.ret - b.rf) as exret, b.mktrf, b.smb, b.hml
    from crsp.dsf as a
    left join ff.factors_daily as b
    on a.date=b.date
    where a.date > '01/01/1959'
    """)

# sort variables by permno and date
crsp = crsp.sort_values(by=['permno', 'date'])

# change variable format to int
crsp['permno'] = crsp['permno'].astype(int)

# Line up date to be end of month
crsp['date'] = pd.to_datetime(crsp['date'])

# find the closest trading day to the end of the month
crsp['monthend'] = crsp['date'] + MonthEnd(0)
crsp['date_diff'] = crsp['monthend'] - crsp['date']
date_temp = crsp.groupby(['permno', 'monthend'])['date_diff'].min()
date_temp = pd.DataFrame(date_temp)  # convert Series to DataFrame
date_temp.reset_index(inplace=True)
date_temp.rename(columns={'date_diff': 'min_diff'}, inplace=True)
crsp = pd.merge(crsp, date_temp, how='left', on=['permno', 'monthend'])
crsp['sig'] = np.where(crsp['date_diff'] == crsp['min_diff'], 1, np.nan)

# label every date of month end with a running index per firm
crsp['month_count'] = crsp[crsp['sig'] == 1].groupby(['permno']).cumcount()

# number of months for each firm; keep permno so the later join is keyed by
# permno instead of relying on positional alignment after reset_index
# (month_count is a cumcount, so its per-firm max equals the last value)
month_num = (crsp[crsp['sig'] == 1].groupby(['permno'])['month_count']
             .max().astype(int).rename('month_num').reset_index())

# mark the number of each month to each day of this month
# (fillna(method='bfill') is deprecated in pandas 2.x -> use bfill())
crsp['month_count'] = crsp.groupby(['permno'])['month_count'].bfill()

# create a firm list
df_firm = crsp.drop_duplicates(['permno'])
df_firm = df_firm[['permno']]
df_firm['permno'] = df_firm['permno'].astype(int)
df_firm = df_firm.reset_index(drop=True)
df_firm = df_firm.reset_index()
df_firm = df_firm.rename(columns={'index': 'count'})
df_firm = pd.merge(df_firm, month_num, how='left', on='permno')

######################
# Calculate the beta #
######################
# The original kept a large commented-out example here that estimated the
# three factor betas with the explicit hat-matrix formula (X'MX)^(-1)X'MY over
# a 60-day rolling window.  It was dead code and has been removed; recover it
# from git history if ever needed.

######################
# Calculate residual #
######################
def get_res_var(df, firm_list):
    """
    Rolling FF3 residual variance per firm-month.

    :param df: daily stock dataframe (permno, month_count, mktrf, smb, hml, exret)
    :param firm_list: firm dataframe (permno, month_num = last month index per firm)
    :return: df with a new 'rvar' column holding the residual variance (ddof=1)
    """
    n_firm = firm_list['permno'].count()
    for prog, (firm, count) in enumerate(zip(firm_list['permno'], firm_list['month_num']), start=1):
        print('processing permno %s' % firm, '/', 'finished', '%.2f%%' % ((prog / n_firm) * 100))
        for i in range(count + 1):
            # if you want to change the rolling window, please change here:
            # i - 2 means 3 months is a window.
            temp = df[(df['permno'] == firm) & (i - 2 <= df['month_count']) & (df['month_count'] <= i)]
            # if observations in last 3 months are fewer than 21, drop the rvar of this month
            if temp['permno'].count() < 21:
                continue
            index = temp.tail(1).index  # stamp the estimate on the window's last row
            # OLS of excess return on [intercept, mktrf, smb, hml].  np.matrix
            # (np.mat / .I) is deprecated, so use ndarrays and lstsq; the
            # residual vector equals the original hat-matrix computation.
            X = np.column_stack([np.ones(len(temp)),
                                 temp[['mktrf', 'smb', 'hml']].to_numpy(dtype=float)])
            y = temp['exret'].to_numpy(dtype=float)
            beta = np.linalg.lstsq(X, y, rcond=None)[0]
            res = y - X.dot(beta)
            df.loc[index, 'rvar'] = res.var(ddof=1)
    return df


def sub_df(start, end, step):
    """
    Split the global firm list / stock data into quantile buckets.

    :param start: the quantile to start cutting, usually it should be 0
    :param end: the quantile to end cutting, usually it should be 1
    :param step: quantile step
    :return: dict of 'firm<h>' and 'crsp<h>' sub dataframes
    """
    temp = {}
    for i, h in zip(np.arange(start, end, step), range(int((end - start) / step))):
        print('processing splitting dataframe:', round(i, 2), 'to', round(i + step, 2))
        upper = df_firm['count'].quantile(i + step)
        if i == 0:  # include the left end point in the first bucket
            mask = df_firm['count'] <= upper
        else:
            mask = (df_firm['count'].quantile(i) < df_firm['count']) & (df_firm['count'] <= upper)
        # the original repeated the same merge in both branches; merge once
        temp['firm' + str(h)] = df_firm[mask]
        temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left',
                                         on='permno').dropna(subset=['count'])
    return temp


def main(start, end, step):
    """
    Fan the quantile buckets out over a process pool and concat the results.

    :param start: the quantile to start cutting, usually it should be 0
    :param end: the quantile to end cutting, usually it should be 1
    :param step: quantile step
    :return: a dataframe with the calculated variance of residual
    """
    df = sub_df(start, end, step)
    pool = mp.Pool()
    p_dict = {}
    n_jobs = int((end - start) / step)
    for i in range(n_jobs):
        p_dict['p' + str(i)] = pool.apply_async(get_res_var, (df['crsp%s' % i], df['firm%s' % i],))
    pool.close()
    pool.join()
    result = pd.DataFrame()
    print('processing pd.concat')
    for h in range(n_jobs):
        result = pd.concat([result, p_dict['p%s' % h].get()])
    return result
# calculate variance of residual through rolling window
# Note: please split dataframe according to your CPU situation. For example, we split dataframe to (1-0)/0.05 = 20 sub
# dataframes here, so the function will use 20 cores to calculate variance of residual.
if __name__ == '__main__':
    crsp = main(0, 1, 0.05)

    # Post-processing and the pickle dump now live under the __main__ guard:
    # at module level they would also execute inside every multiprocessing
    # worker that re-imports this script (spawn start method) and crash there,
    # because the workers' global 'crsp' has no 'rvar' column.
    crsp = crsp.dropna(subset=['rvar'])  # drop NA due to rolling
    crsp = crsp.rename(columns={'rvar': 'rvar_ff3'})
    crsp = crsp.reset_index(drop=True)
    crsp = crsp[['permno', 'date', 'rvar_ff3']]

    with open('rvar_ff3.pkl', 'wb') as f:
        pkl.dump(crsp, f)
# (file now ends with a newline -- the original was missing it)

# ---------------------------------------------------------------------------
# rvar_mean.py -- RVAR mean

import pandas as pd
import numpy as np
import datetime as dt
import wrds
from dateutil.relativedelta import *
from pandas.tseries.offsets import *
import datetime
import pickle as pkl
import multiprocessing as mp

###################
# Connect to WRDS #
###################
conn = wrds.Connection()

# CRSP Block
crsp = conn.raw_sql("""
    select permno, date, ret
    from crsp.dsf
    where date >= '01/01/1959'
    """)

# sort variables by permno and date
crsp = crsp.sort_values(by=['permno', 'date'])

# change variable format to int
crsp['permno'] = crsp['permno'].astype(int)

# Line up date to be end of month
crsp['date'] = pd.to_datetime(crsp['date'])

# find the closest trading day to the end of the month
crsp['monthend'] = crsp['date'] + MonthEnd(0)
crsp['date_diff'] = crsp['monthend'] - crsp['date']
date_temp = crsp.groupby(['permno', 'monthend'])['date_diff'].min()
date_temp = pd.DataFrame(date_temp)  # convert Series to DataFrame
date_temp.reset_index(inplace=True)
date_temp.rename(columns={'date_diff': 'min_diff'}, inplace=True)
crsp = pd.merge(crsp, date_temp, how='left', on=['permno', 'monthend'])
crsp['sig'] = np.where(crsp['date_diff'] == crsp['min_diff'], 1, np.nan)

# label every date of month end with a running index per firm
crsp['month_count'] = crsp[crsp['sig'] == 1].groupby(['permno']).cumcount()

# number of months for each firm; keep permno so the later join is keyed by
# permno instead of relying on positional alignment after reset_index
# (month_count is a cumcount, so its per-firm max equals the last value)
month_num = (crsp[crsp['sig'] == 1].groupby(['permno'])['month_count']
             .max().astype(int).rename('month_num').reset_index())

# mark the number of each month to each day of this month
# (fillna(method='bfill') is deprecated in pandas 2.x -> use bfill())
crsp['month_count'] = crsp.groupby(['permno'])['month_count'].bfill()

# create a firm list
df_firm = crsp.drop_duplicates(['permno'])
df_firm = df_firm[['permno']]
df_firm['permno'] = df_firm['permno'].astype(int)
df_firm = df_firm.reset_index(drop=True)
df_firm = df_firm.reset_index()
df_firm = df_firm.rename(columns={'index': 'count'})
df_firm = pd.merge(df_firm, month_num, how='left', on='permno')

######################
# Calculate variance #
######################
def get_ret_var(df, firm_list):
    """
    Attach the rolling 3-month return variance to each firm-month.

    :param df: stock dataframe
    :param firm_list: list of firms matching the stock dataframe
    :return: dataframe with the variance of returns in a 'rvar' column
    """
    total = firm_list['permno'].count()
    for done, (firm, count) in enumerate(zip(firm_list['permno'], firm_list['month_num']), start=1):
        print('processing permno %s' % firm, '/', 'finished', '%.2f%%' % ((done / total) * 100))
        for i in range(count + 1):
            # rolling window covers months i-2 .. i (i - 2 means a 3-month window)
            window = df[(df['permno'] == firm) & (i - 2 <= df['month_count']) & (df['month_count'] <= i)]
            # fewer than 21 daily observations in the last 3 months -> no rvar
            if window['permno'].count() < 21:
                continue
            last_row = window.tail(1).index
            df.loc[last_row, 'rvar'] = window['ret'].var()
    return df


def sub_df(start, end, step):
    """
    Cut the global firm list into quantile buckets of roughly equal size.

    :param start: the quantile to start cutting, usually it should be 0
    :param end: the quantile to end cutting, usually it should be 1
    :param step: quantile step
    :return: dict holding the 'firm<h>' and 'crsp<h>' sub dataframes
    """
    buckets = {}
    for q, h in zip(np.arange(start, end, step), range(int((end - start) / step))):
        print('processing splitting dataframe:', round(q, 2), 'to', round(q + step, 2))
        upper = df_firm['count'].quantile(q + step)
        if q == 0:  # the left end point is included in the first bucket
            firms = df_firm[df_firm['count'] <= upper]
        else:
            firms = df_firm[(df_firm['count'].quantile(q) < df_firm['count']) & (df_firm['count'] <= upper)]
        buckets['firm' + str(h)] = firms
        buckets['crsp' + str(h)] = pd.merge(crsp, firms, how='left', on='permno').dropna(subset=['count'])
    return buckets


def main(start, end, step):
    """
    Run get_ret_var on every bucket in its own process and stitch the
    results back together.

    :param start: the quantile to start cutting, usually it should be 0
    :param end: the quantile to end cutting, usually it should be 1
    :param step: quantile step
    :return: a dataframe with the calculated variance of returns
    """
    buckets = sub_df(start, end, step)
    n_jobs = int((end - start) / step)
    pool = mp.Pool()
    pending = {}
    for i in range(n_jobs):
        pending['p' + str(i)] = pool.apply_async(get_ret_var, (buckets['crsp%s' % i], buckets['firm%s' % i],))
    pool.close()
    pool.join()
    result = pd.DataFrame()
    print('processing pd.concat')
    for h in range(n_jobs):
        result = pd.concat([result, pending['p%s' % h].get()])
    return result
# calculate variance of returns through rolling window
# Note: please split dataframe according to your CPU situation. For example, we split dataframe to (1-0)/0.05 = 20 sub
# dataframes here, so the function will use 20 cores to calculate variance.
if __name__ == '__main__':
    crsp = main(0, 1, 0.05)

    # Post-processing and the pickle dump now live under the __main__ guard:
    # at module level they would re-run in every multiprocessing worker that
    # re-imports this script (spawn start method) and crash there, because the
    # workers' global 'crsp' has no 'rvar' column.
    crsp = crsp.dropna(subset=['rvar'])  # drop NA due to rolling
    crsp = crsp.rename(columns={'rvar': 'rvar_mean'})
    crsp = crsp.reset_index(drop=True)
    crsp = crsp[['permno', 'date', 'rvar_mean']]

    with open('rvar_mean.pkl', 'wb') as f:
        pkl.dump(crsp, f)
# (file now ends with a newline -- the original was missing it)

# ---------------------------------------------------------------------------
# sue.py -- Calculate HSZ Replicating Anomalies
# SUE: Standardized Unexpected Earnings (Earnings surprise)

import pandas as pd
import numpy as np
import datetime as dt
import wrds
from dateutil.relativedelta import *
from pandas.tseries.offsets import *
# NOTE(review): pandasql is a third-party dependency of this script
from pandasql import *
import pickle as pkl

###################
# Connect to WRDS #
###################
conn = wrds.Connection()

###################
# Compustat Block #
###################
comp = conn.raw_sql("""
    select gvkey, datadate, fyearq, fqtr, epspxq, ajexq
    from comp.fundq
    where indfmt = 'INDL'
    and datafmt = 'STD'
    and popsrc = 'D'
    and consol = 'C'
    and datadate >= '01/01/1959'
    """)

comp['datadate'] = pd.to_datetime(comp['datadate'])

###################
# CCM Block       #
###################
ccm = conn.raw_sql("""
    select gvkey, lpermno as permno, linktype, linkprim,
    linkdt, linkenddt
    from crsp.ccmxpf_linktable
    where linktype in ('LU', 'LC')
    """)
ccm['linkdt'] = pd.to_datetime(ccm['linkdt'])
ccm['linkenddt'] = pd.to_datetime(ccm['linkenddt'])
# if linkenddt is missing then set to today date
ccm['linkenddt'] = ccm['linkenddt'].fillna(pd.to_datetime('today'))

ccm1 = pd.merge(comp, ccm, how='left', on=['gvkey'])

# set link date bounds
ccm2 = ccm1[(ccm1['datadate'] >= ccm1['linkdt']) & (ccm1['datadate'] <= ccm1['linkenddt'])]
ccm2 = ccm2[['gvkey', 'permno', 'datadate', 'fyearq', 'fqtr', 'epspxq', 'ajexq']]

# the time series of epspxq/ajexq (split-adjusted EPS)
ccm2['eps'] = ccm2['epspxq'] / ccm2['ajexq']
ccm2.drop_duplicates(['permno', 'datadate'], inplace=True)

# merge lag1 to lag8, then calculate the standard deviation
ccm2 = ccm2[ccm2['eps'].notna()]
# BUG FIX: the running quarter count must be computed AFTER sorting by
# permno/datadate; the original counted rows in arbitrary SQL result order,
# so the <=6 / ==7 / ==8 burn-in thresholds below were unreliable.
ccm2.sort_values(by=['permno', 'datadate'], inplace=True)
ccm2['count'] = ccm2.groupby('permno').cumcount() + 1

# within-firm eps lags 1..8 (loop replaces the eight copy-pasted shifts)
for k in range(1, 9):
    ccm2['e%d' % k] = ccm2.groupby(['permno'])['eps'].shift(k)

condlist = [ccm2['count'] <= 6,
            ccm2['count'] == 7,
            ccm2['count'] == 8,
            ccm2['count'] >= 9]
choicelist = [np.nan,
              ccm2[['e8', 'e7', 'e6', 'e5', 'e4', 'e3']].std(axis=1),
              ccm2[['e8', 'e7', 'e6', 'e5', 'e4', 'e3', 'e2']].std(axis=1),
              ccm2[['e8', 'e7', 'e6', 'e5', 'e4', 'e3', 'e2', 'e1']].std(axis=1)]
ccm2['sue_std'] = np.select(condlist, choicelist, default=np.nan)

# earnings surprise: seasonal (4-quarter) eps change scaled by its std
ccm2['sue'] = (ccm2['eps'] - ccm2['e4']) / ccm2['sue_std']

# populate the quarterly sue to monthly
crsp_msf = conn.raw_sql("""
    select distinct date
    from crsp.msf
    where date >= '01/01/1959'
    """)

ccm2['datadate'] = pd.to_datetime(ccm2['datadate'])
# BUG FIX: datetime64 + np.timedelta64(12, 'M') is ill-defined calendar
# arithmetic (a numpy 'M' tick is an average month length); use a
# calendar-aware 12-month offset, then snap to month end as before.
ccm2['plus12m'] = ccm2['datadate'] + pd.DateOffset(months=12)
ccm2['plus12m'] = ccm2['plus12m'] + MonthEnd(0)
# interval join: attach every monthly CRSP date within (datadate, datadate+12m]
df = sqldf("""select a.*, b.date
        from ccm2 a left join crsp_msf b
        on a.datadate <= b.date
        and a.plus12m >= b.date
        order by a.permno, b.date, a.datadate desc;""", globals())

# keep the most recent statement for each permno-month
df = df.drop_duplicates(['permno', 'date'])
df['datadate'] = pd.to_datetime(df['datadate'])
df = df[['gvkey', 'permno', 'datadate', 'date', 'sue']]

with open('sue.pkl', 'wb') as f:
    pkl.dump(df, f)
# (file now ends with a newline -- the original was missing it)

# ---------------------------------------------------------------------------
# NOTE(review): the patch hunks that flipped file modes and deleted the binary
# py-iclink/.DS_Store blob sat here; the binary payload is noise and has been
# elided from this reconstruction.
# ---------------------------------------------------------------------------
# accounting_100.py (new file introduced by this patch)

import pandas as pd
import numpy as np
import datetime as dt
import wrds
from dateutil.relativedelta import *
from pandas.tseries.offsets import *
import pickle as pkl
from functions import *
###################
# Connect to WRDS #
###################
conn = wrds.Connection()

#######################################################################################################################
# TTM functions                                                                                                       #
#######################################################################################################################


def ttm4(series, df):
    """
    Trailing four quarters (current value + 3 within-firm lags).

    :param series: column name
    :param df: dataframe holding the column; lags are taken within gvkey
    :return: Series with the 4-period trailing sum (NaN if any term is missing)
    """
    # accumulate shifted copies directly instead of materialising a throwaway
    # lag DataFrame -- same NaN propagation, fewer allocations
    result = df[series].copy()
    for i in range(1, 4):
        result = result + df.groupby('gvkey')[series].shift(i)
    return result


def ttm12(series, df):
    """
    Trailing twelve months (current value + 11 within-firm lags).

    :param series: column name
    :param df: dataframe holding the column; lags are taken within permno
    :return: Series with the 12-period trailing sum (NaN if any term is missing)
    """
    result = df[series].copy()
    for i in range(1, 12):
        result = result + df.groupby('permno')[series].shift(i)
    return result


print('TTM')
#######################################################################################################################
# Compustat Block                                                                                                     #
#######################################################################################################################
comp = conn.raw_sql("""
    /*header info*/
    select c.gvkey, f.cusip, f.datadate, f.fyear, c.cik, substr(c.sic,1,2) as sic2, c.sic, c.naics,

    /*firm variables*/
    /*income statement*/
    f.sale, f.revt, f.cogs, f.xsga, f.dp, f.xrd, f.xad, f.ib, f.ebitda,
    f.ebit, f.nopi, f.spi, f.pi, f.txp, f.ni, f.txfed, f.txfo, f.txt, f.xint, f.xpp, f.xacc,

    /*CF statement and others*/
    f.capx, f.oancf, f.dvt, f.ob, f.gdwlia, f.gdwlip, f.gwo, f.mib, f.oiadp, f.ivao, f.ivst,

    /*assets*/
    f.rect, f.act, f.che, f.ppegt, f.invt, f.at, f.aco, f.intan, f.ao, f.ppent, f.gdwl, f.fatb, f.fatl,

    /*liabilities*/
    f.lct, f.dlc, f.dltt, f.lt, f.dm, f.dcvt, f.cshrc,
    f.dcpstk, f.pstk, f.ap, f.lco, f.lo, f.drc, f.drlt, f.txdi, f.dltis, f.dltr, f.dlcch,

    /*equity and other*/
    f.ceq, f.scstkc, f.emp, f.csho, f.seq, f.txditc, f.pstkrv, f.pstkl, f.np, f.txdc,
    f.dpc, f.ajex, f.tstkp, f.oibdp, f.capxv, f.dvpa, f.epspx,

    /*market*/
    abs(f.prcc_f) as prcc_f, abs(f.prcc_c) as prcc_c, f.dvc, f.prstkc, f.sstk, f.fopt, f.wcap

    from comp.funda as f
    left join comp.company as c
    on f.gvkey = c.gvkey

    /*get consolidated, standardized, industrial format statements*/
    where f.indfmt = 'INDL'
    and f.datafmt = 'STD'
    and f.popsrc = 'D'
    and f.consol = 'C'
    and f.datadate >= '01/01/1959'
    """)

# convert datadate to date fmt
comp['datadate'] = pd.to_datetime(comp['datadate'])

# sort and clean up
comp = comp.sort_values(by=['gvkey', 'datadate']).drop_duplicates()

# clean up csho
comp['csho'] = np.where(comp['csho'] == 0, np.nan, comp['csho'])

# calculate Compustat market equity
comp['mve_f'] = comp['csho'] * comp['prcc_f']

# do some clean up: several variables have lots of missing values
# deferred revenue: combine current and long-term parts when available
condlist = [comp['drc'].notna() & comp['drlt'].notna(),
            comp['drc'].notna() & comp['drlt'].isnull(),
            comp['drlt'].notna() & comp['drc'].isnull()]
choicelist = [comp['drc'] + comp['drlt'],
              comp['drc'],
              comp['drlt']]
comp['dr'] = np.select(condlist, choicelist, default=np.nan)

# convertible debt
# BUG FIX: '&' binds tighter than '>' in Python, so the original first
# condition evaluated (... & comp['dcpstk']) > comp['pstk']; the comparison
# needs its own parentheses.
condlist = [comp['dcvt'].isnull() & comp['dcpstk'].notna() & comp['pstk'].notna() &
            (comp['dcpstk'] > comp['pstk']),
            comp['dcvt'].isnull() & comp['dcpstk'].notna() & comp['pstk'].isnull()]
choicelist = [comp['dcpstk'] - comp['pstk'],
              comp['dcpstk']]
comp['dc'] = np.select(condlist, choicelist, default=np.nan)
comp['dc'] = np.where(comp['dc'].isnull(), comp['dcvt'], comp['dc'])

comp['xint0'] = np.where(comp['xint'].isnull(), 0, comp['xint'])
# BUG FIX: the original read np.where(comp['xsga'].isnull, 0, 0) -- the method
# was never called and both branches were 0, so xsga0 was always 0.  Mirror
# the xint0 pattern above instead.
comp['xsga0'] = np.where(comp['xsga'].isnull(), 0, comp['xsga'])

comp['ceq'] = np.where(comp['ceq'] == 0, np.nan, comp['ceq'])
comp['at'] = np.where(comp['at'] == 0, np.nan, comp['at'])
comp = comp.dropna(subset=['at'])
print('compustat')
#######################################################################################################################
# CRSP Block                                                                                                          #
#######################################################################################################################
# Create a CRSP Subsample with Monthly Stock and Event Variables
# Restrictions will be applied later
# Select variables from the CRSP monthly stock and event datasets
crsp = conn.raw_sql("""
    select a.prc, a.ret, a.retx, a.shrout, a.vol, a.cfacpr, a.cfacshr, a.date, a.permno, a.permco,
    b.ticker, b.ncusip, b.shrcd, b.exchcd
    from crsp.msf as a
    left join crsp.msenames as b
    on a.permno=b.permno
    and b.namedt<=a.date
    and a.date<=b.nameendt
    where a.date >= '01/01/1959'
    and b.exchcd between 1 and 3
    """)

# change variable format to int
crsp[['permco', 'permno', 'shrcd', 'exchcd']] = crsp[['permco', 'permno', 'shrcd', 'exchcd']].astype(int)
crsp['date'] = pd.to_datetime(crsp['date'])
crsp['monthend'] = crsp['date'] + MonthEnd(0)  # set all the date to the standard end date of month

crsp = crsp.dropna(subset=['prc'])
crsp['me'] = crsp['prc'].abs() * crsp['shrout']  # calculate market equity

# if Market Equity is Nan then let return equals to 0
crsp['ret'] = np.where(crsp['me'].isnull(), 0, crsp['ret'])
crsp['retx'] = np.where(crsp['me'].isnull(), 0, crsp['retx'])

# impute me within each permno
# BUG FIX: the original forward-filled the whole column and then masked on
# permno == permno.shift(1); when a firm's first rows were all missing, values
# could leak forward from the PREVIOUS firm.  A grouped ffill cannot cross
# permnos (and fillna(method='ffill') is deprecated in pandas 2.x anyway).
crsp = crsp.sort_values(by=['permno', 'date']).drop_duplicates()
crsp['me'] = crsp.groupby('permno')['me'].ffill()

# Aggregate Market Cap
'''
There are cases when the same firm (permco) has two or more securities (permno) at same date.
For the purpose of ME for the firm, we aggregated all ME for a given permco, date.
This aggregated ME will be assigned to the permno with the largest ME.
'''
# sum of me across different permno belonging to same permco a given date
crsp_summe = crsp.groupby(['monthend', 'permco'])['me'].sum().reset_index()
# largest mktcap within a permco/date
crsp_maxme = crsp.groupby(['monthend', 'permco'])['me'].max().reset_index()
# join by monthend/maxme to find the permno
crsp1 = pd.merge(crsp, crsp_maxme, how='inner', on=['monthend', 'permco', 'me'])
# drop me column and replace with the sum me
crsp1 = crsp1.drop(['me'], axis=1)
# join with sum of me to get the correct market cap info
crsp2 = pd.merge(crsp1, crsp_summe, how='inner', on=['monthend', 'permco'])
# sort by permno and date and also drop duplicates
crsp2 = crsp2.sort_values(by=['permno', 'monthend']).drop_duplicates()
print('crsp')
#######################################################################################################################
# CCM Block                                                                                                           #
#######################################################################################################################
# merge CRSP and Compustat
# reference: https://wrds-www.wharton.upenn.edu/pages/support/applications/linking-databases/linking-crsp-and-compustat/
ccm = conn.raw_sql("""
    select gvkey, lpermno as permno, linktype, linkprim,
    linkdt, linkenddt
    from crsp.ccmxpf_linktable
    where substr(linktype,1,1)='L'
    and (linkprim ='C' or linkprim='P')
    """)

ccm['linkdt'] = pd.to_datetime(ccm['linkdt'])
ccm['linkenddt'] = pd.to_datetime(ccm['linkenddt'])

# if linkenddt is missing then set to today date
ccm['linkenddt'] = ccm['linkenddt'].fillna(pd.to_datetime('today'))

# merge ccm and comp
ccm1 = pd.merge(comp, ccm, how='left', on=['gvkey'])

# we can only get the accounting data after the firm public their report
# for annual data, we use 5 or 6 months lagged data
ccm1['yearend'] = ccm1['datadate'] + YearEnd(0)
ccm1['jdate'] = ccm1['datadate'] + MonthEnd(4)

# set link date bounds
ccm2 = ccm1[(ccm1['jdate'] >= ccm1['linkdt']) & (ccm1['jdate'] <= ccm1['linkenddt'])]

# link comp and crsp
crsp2 = crsp2.rename(columns={'monthend': 'jdate'})
data_rawa = pd.merge(crsp2, ccm2, how='inner', on=['permno', 'jdate'])

# filter exchcd & shrcd
data_rawa = data_rawa[((data_rawa['exchcd'] == 1) | (data_rawa['exchcd'] == 2) | (data_rawa['exchcd'] == 3)) &
                      ((data_rawa['shrcd'] == 10) | (data_rawa['shrcd'] == 11))]

# process Market Equity
'''
Note: me is CRSP market equity, mve_f is Compustat market equity. Please choose the me below.
'''
data_rawa['me'] = data_rawa['me']/1000  # CRSP ME
# data_rawa['me'] = data_rawa['mve_f']  # Compustat ME

# there are some ME equal to zero since this company do not have price or shares data, we drop these observations
data_rawa['me'] = np.where(data_rawa['me'] == 0, np.nan, data_rawa['me'])
data_rawa = data_rawa.dropna(subset=['me'])

# count single stock years
# data_rawa['count'] = data_rawa.groupby(['gvkey']).cumcount()

# deal with the duplicates: keep one link per datadate/permno, then the last
# statement per permno/yearend
# NOTE(review): groupby(...).nth semantics changed in pandas 2.x -- verify
# these two filters still pick the intended rows on your pandas version.
data_rawa.loc[data_rawa.groupby(['datadate', 'permno', 'linkprim'], as_index=False).nth([0]).index, 'temp'] = 1
data_rawa = data_rawa[data_rawa['temp'].notna()]
data_rawa.loc[data_rawa.groupby(['permno', 'yearend', 'datadate'], as_index=False).nth([-1]).index, 'temp'] = 1
data_rawa = data_rawa[data_rawa['temp'].notna()]

data_rawa = data_rawa.sort_values(by=['permno', 'jdate'])
print('ccm')
#######################################################################################################################
# Annual Variables                                                                                                    #
#######################################################################################################################
# stockholders' equity with the standard fallback chain seq -> ceq+pstk -> at-lt
data_rawa['se'] = np.where(data_rawa['seq'].isnull(), data_rawa['ceq']+data_rawa['pstk'], data_rawa['seq'])
data_rawa['se'] = np.where(data_rawa['se'].isnull(), data_rawa['at']-data_rawa['lt'], data_rawa['se'])

data_rawa['txditc'] = data_rawa['txditc'].fillna(0)

# preferred stock
data_rawa['ps'] = np.where(data_rawa['pstkrv'].isnull(), data_rawa['pstkl'], data_rawa['pstkrv'])
data_rawa['ps'] = np.where(data_rawa['ps'].isnull(), data_rawa['pstk'], data_rawa['ps'])
data_rawa['ps'] = np.where(data_rawa['ps'].isnull(), 0, data_rawa['ps'])

# book equity
# BUG FIX: use the fallback-chain 'se' computed above; the original used raw
# seq here, which silently dropped firms where seq is missing but ceq+pstk or
# at-lt is available, and left 'se' as dead code.
data_rawa['be'] = data_rawa['se'] + data_rawa['txditc'] - data_rawa['ps']
data_rawa['be'] = np.where(data_rawa['be'] > 0, data_rawa['be'], np.nan)

# acc (lagged inputs)
data_rawa['act_l1'] = data_rawa.groupby(['permno'])['act'].shift(1)
data_rawa['lct_l1'] = data_rawa.groupby(['permno'])['lct'].shift(1)
# BUG FIX: lagged notes payable must also be a within-firm lag; the original
# default branch used a plain np.shift(1) that crossed firm boundaries.
data_rawa['np_l1'] = data_rawa.groupby(['permno'])['np'].shift(1)

# acc: working-capital accruals scaled by 10*be
# BUG FIX (parentheses): the original first branch divided only the lagged
# term by 10*be; the whole change must be scaled, matching branch 2.
condlist = [data_rawa['np'].isnull(),
            data_rawa['act'].isnull() | data_rawa['lct'].isnull()]
choicelist = [((data_rawa['act'] - data_rawa['lct']) -
               (data_rawa['act_l1'] - data_rawa['lct_l1'])) / (10 * data_rawa['be']),
              (data_rawa['ib'] - data_rawa['oancf']) / (10 * data_rawa['be'])]
data_rawa['acc'] = np.select(condlist, choicelist,
                             default=((data_rawa['act'] - data_rawa['lct'] + data_rawa['np']) -
                                      (data_rawa['act_l1'] - data_rawa['lct_l1'] + data_rawa['np_l1'])) /
                                     (10 * data_rawa['be']))

# agr: asset growth
data_rawa['at_l1'] = data_rawa.groupby(['permno'])['at'].shift(1)
data_rawa['agr'] = (data_rawa['at'] - data_rawa['at_l1']) / data_rawa['at_l1']

# bm
# data_rawa['bm'] = data_rawa['be'] / data_rawa['me']

# cfp
# condlist = [data_rawa['dp'].isnull(),
#             data_rawa['ib'].isnull()]
# choicelist = [data_rawa['ib']/data_rawa['me'],
#               np.nan]
# data_rawa['cfp'] = np.select(condlist, choicelist, default=(data_rawa['ib']+data_rawa['dp'])/data_rawa['me'])

# ep, checked from Hou and change 'ME' from compustat to crsp
# data_rawa['ep'] = data_rawa['ib']/data_rawa['me']
# data_rawa['ep_n'] = data_rawa['ib']

# ni: net share issuance (log growth of split-adjusted shares)
# NOTE(review): this OVERWRITES the Compustat net-income column 'ni' selected
# in the query above; any later use of 'ni' (e.g. roa) therefore sees the
# issuance characteristic, not net income -- confirm this is intended.
data_rawa['csho_l1'] = data_rawa.groupby(['permno'])['csho'].shift(1)
data_rawa['ajex_l1'] = data_rawa.groupby(['permno'])['ajex'].shift(1)
data_rawa['ni'] = np.where(data_rawa['gvkey'] != data_rawa['gvkey'].shift(1),
                           np.nan,
                           np.log(data_rawa['csho']*data_rawa['ajex']).replace(-np.inf, 0) -
                           np.log(data_rawa['csho_l1']*data_rawa['ajex_l1']).replace(-np.inf, 0))

# op: operating profitability (the formula seems different from Hou Page 74?)
data_rawa['cogs0'] = np.where(data_rawa['cogs'].isnull(), 0, data_rawa['cogs'])
data_rawa['xint0'] = np.where(data_rawa['xint'].isnull(), 0, data_rawa['xint'])
data_rawa['xsga0'] = np.where(data_rawa['xsga'].isnull(), 0, data_rawa['xsga'])

condlist = [data_rawa['revt'].isnull(), data_rawa['be'].isnull()]
choicelist = [np.nan, np.nan]
data_rawa['op'] = np.select(condlist, choicelist,
                            default=(data_rawa['revt'] - data_rawa['cogs0'] - data_rawa['xsga0'] -
                                     data_rawa['xint0']) / data_rawa['be'])

# rsup
data_rawa['sale_l1'] = data_rawa.groupby(['permno'])['sale'].shift(1)
# data_rawa['rsup'] = (data_rawa['sale']-data_rawa['sale_l1'])/data_rawa['me']

# cash
data_rawa['cash'] = data_rawa['che'] / data_rawa['at']

# lev
# data_rawa['lev'] = data_rawa['lt']/data_rawa['me']

# sp, checked
# data_rawa['sp'] = data_rawa['sale']/data_rawa['me']
# data_rawa['sp_n'] = data_rawa['sale']

# rd_sale
data_rawa['rd_sale'] = data_rawa['xrd'] / data_rawa['sale']

# rdm
# data_rawa['rdm'] = data_rawa['xrd']/data_rawa['me']

# adm hxz adm, checked
# data_rawa['adm'] = data_rawa['xad']/data_rawa['me']

# gma: gross profitability
data_rawa['gma'] = (data_rawa['revt'] - data_rawa['cogs']) / data_rawa['at_l1']

# chcsho
data_rawa['chcsho'] = (data_rawa['csho'] / data_rawa['csho_l1']) - 1

# lgr: liability growth
data_rawa['lt_l1'] = data_rawa.groupby(['permno'])['lt'].shift(1)
data_rawa['lgr'] = (data_rawa['lt'] / data_rawa['lt_l1']) - 1

# pctacc: percent accruals
data_rawa['che_l1'] = data_rawa.groupby(['permno'])['che'].shift(1)
data_rawa['dlc_l1'] = data_rawa.groupby(['permno'])['dlc'].shift(1)
data_rawa['txp_l1'] = data_rawa.groupby(['permno'])['txp'].shift(1)

# balance-sheet accruals: dCA - dCash - (dCL - dSTD - dTP) - dp
# BUG FIX: the original mis-nested the parentheses (flipping the sign of
# lagged dlc and folding dp into the txp term), attached the denominator to
# only part of the expression, and listed the "oancf missing AND ib == 0"
# case last where np.select could never reach it.  Conditions are now ordered
# most-specific first.
bs_accrual = ((data_rawa['act'] - data_rawa['act_l1']) - (data_rawa['che'] - data_rawa['che_l1']) -
              ((data_rawa['lct'] - data_rawa['lct_l1']) - (data_rawa['dlc'] - data_rawa['dlc_l1']) -
               (data_rawa['txp'] - data_rawa['txp_l1'])) - data_rawa['dp'])
condlist = [data_rawa['oancf'].isnull() & (data_rawa['ib'] == 0),
            data_rawa['ib'] == 0,
            data_rawa['oancf'].isnull()]
choicelist = [bs_accrual / 0.01,
              (data_rawa['ib'] - data_rawa['oancf']) / 0.01,
              bs_accrual / data_rawa['ib'].abs()]
data_rawa['pctacc'] = np.select(condlist, choicelist,
                                default=(data_rawa['ib'] - data_rawa['oancf']) / data_rawa['ib'].abs())

# sgr: sales growth
data_rawa['sgr'] = (data_rawa['sale'] / data_rawa['sale_l1']) - 1

# chato: change in asset turnover
# NOTE(review): the second term scales last year's sales by (at + at_l2)/2;
# the usual definition uses (at_l1 + at_l2)/2 -- kept as written, confirm.
data_rawa['at_l2'] = data_rawa.groupby(['permno'])['at'].shift(2)
data_rawa['chato'] = (data_rawa['sale'] / ((data_rawa['at'] + data_rawa['at_l1']) / 2)) - \
                     (data_rawa['sale_l1'] / ((data_rawa['at'] + data_rawa['at_l2']) / 2))

# chtx: change in taxes
data_rawa['txt_l1'] = data_rawa.groupby(['permno'])['txt'].shift(1)
data_rawa['chtx'] = (data_rawa['txt'] - data_rawa['txt_l1']) / data_rawa['at_l1']

# noa: net operating assets over lagged assets
# BUG FIX (parentheses): the original divided only the operating-liabilities
# term by at_l1; the whole OA - OL difference must be scaled.
data_rawa['noa'] = ((data_rawa['at'] - data_rawa['che'] - data_rawa['ivao'].fillna(0)) -
                    (data_rawa['at'] - data_rawa['dlc'].fillna(0) - data_rawa['dltt'].fillna(0) -
                     data_rawa['mib'].fillna(0) - data_rawa['pstk'].fillna(0) - data_rawa['ceq'])) / \
                   data_rawa['at_l1']

# rna: return on net operating assets
data_rawa['noa_l1'] = data_rawa.groupby(['permno'])['noa'].shift(1)
data_rawa['rna'] = data_rawa['oiadp'] / data_rawa['noa_l1']

# pm: profit margin
data_rawa['pm'] = data_rawa['oiadp'] / data_rawa['sale']

# ato: asset turnover
data_rawa['ato'] = data_rawa['sale'] / data_rawa['noa_l1']

# depr
data_rawa['depr'] = data_rawa['dp'] / data_rawa['ppent']

# invest: capex + inventory growth over lagged assets
data_rawa['ppent_l1'] = data_rawa.groupby(['permno'])['ppent'].shift(1)
data_rawa['invt_l1'] = data_rawa.groupby(['permno'])['invt'].shift(1)
# BUG FIX: the gross branch compared ppegt against lagged ppeNt; it needs the
# lag of ppegt itself.
data_rawa['ppegt_l1'] = data_rawa.groupby(['permno'])['ppegt'].shift(1)
data_rawa['invest'] = np.where(data_rawa['ppegt'].isnull(),
                               ((data_rawa['ppent'] - data_rawa['ppent_l1']) +
                                (data_rawa['invt'] - data_rawa['invt_l1'])) / data_rawa['at_l1'],
                               ((data_rawa['ppegt'] - data_rawa['ppegt_l1']) +
                                (data_rawa['invt'] - data_rawa['invt_l1'])) / data_rawa['at_l1'])
data_rawa.groupby(['permno'])['ceq'].shift(1) +data_rawa['egr'] = ((data_rawa['ceq']-data_rawa['ceq_l1'])/data_rawa['ceq_l1']) + +# cashdebt +data_rawa['cashdebt'] = (data_rawa['ib']+data_rawa['dp'])/((data_rawa['lt']+data_rawa['lt_l1'])/2) + +# rd +# if ((xrd/at)-(lag(xrd/lag(at))))/(lag(xrd/lag(at))) >.05 then rd=1 else rd=0 +data_rawa['xrd/at_l1'] = data_rawa['xrd']/data_rawa['at_l1'] +data_rawa['xrd/at_l1_l1'] = data_rawa.groupby(['permno'])['xrd/at_l1'].shift(1) +data_rawa['rd'] = np.where(((data_rawa['xrd']/data_rawa['at'])- + (data_rawa['xrd/at_l1_l1']))/data_rawa['xrd/at_l1_l1']>0.05, 1, 0) + +# roa +data_rawa['roa'] = data_rawa['ni']/((data_rawa['at']+data_rawa['at_l1'])/2) + +# roe +data_rawa['roe'] = data_rawa['ib']/data_rawa['ceq_l1'] + +# dy +# data_rawa['dy'] = data_rawa['dvt']/data_rawa['me'] + +################## Added on 2020.07.28 ################## + +# roic +data_rawa['roic'] = (data_rawa['ebit'] - data_rawa['nopi'])/(data_rawa['ceq'] + data_rawa['lt'] - data_rawa['che']) + +# chinv +data_rawa['chinv'] = (data_rawa['invt'] - data_rawa['invt_l1'])/((data_rawa['at'] + data_rawa['at_l2'])/2) + +# pchsale_pchinvt +data_rawa['pchsale_pchinvt'] = ((data_rawa['sale'] - data_rawa['sale_l1'])/data_rawa['sale_l1'])\ + - ((data_rawa['invt']-data_rawa['invt_l1'])/data_rawa['invt_l1']) + +# pchsale_pchrect +data_rawa['rect_l1'] = data_rawa.groupby(['permno'])['rect'].shift(1) +data_rawa['pchsale_pchrect'] = ((data_rawa['sale']-data_rawa['sale_l1'])/data_rawa['sale_l1'])\ + - ((data_rawa['rect']-data_rawa['rect_l1'])/data_rawa['rect_l1']) + +# pchgm_pchsale +data_rawa['cogs_l1'] = data_rawa.groupby(['permno'])['cogs'].shift(1) +data_rawa['pchgm_pchsale'] = (((data_rawa['sale']-data_rawa['cogs']) + - (data_rawa['sale_l1']-data_rawa['cogs_l1']))/(data_rawa['sale_l1']-data_rawa['cogs_l1']))\ + - ((data_rawa['sale']-data_rawa['sale_l1'])/data_rawa['sale']) + +# pchsale_pchxsga +data_rawa['xsga_l1'] = data_rawa.groupby(['permno'])['xsga'].shift(1) 
# ---------------------------------------------------------------------------
# Annual characteristics, part 2 (expense/liquidity ratios, industry
# concentration, and the 2020.10.29 additions through the Ebp inputs).
# ---------------------------------------------------------------------------
data_rawa['pchsale_pchxsga'] = ((data_rawa['sale']-data_rawa['sale_l1'])/data_rawa['sale_l1'])\
                               - ((data_rawa['xsga']-data_rawa['xsga_l1'])/data_rawa['xsga_l1'])

# pchdepr: %change in the depreciation rate (dp/ppent).
# FIX: the base of the percent change used current ppent; the lagged rate is
# dp_l1/ppent_l1.
data_rawa['dp_l1'] = data_rawa.groupby(['permno'])['dp'].shift(1)
data_rawa['pchdepr'] = ((data_rawa['dp']/data_rawa['ppent'])-(data_rawa['dp_l1']/data_rawa['ppent_l1']))\
                       / (data_rawa['dp_l1']/data_rawa['ppent_l1'])

# chadv: log growth in advertising (+1 keeps zero-spend firms defined)
data_rawa['xad_l1'] = data_rawa.groupby(['permno'])['xad'].shift(1)
data_rawa['chadv'] = np.log(data_rawa['xad'] + 1) - np.log(data_rawa['xad_l1'] + 1)

# pchcapx
data_rawa['capx_l1'] = data_rawa.groupby(['permno'])['capx'].shift(1)
data_rawa['pchcapx'] = (data_rawa['capx']-data_rawa['capx_l1'])/data_rawa['capx_l1']

# grcapx
data_rawa['capx_l2'] = data_rawa.groupby(['permno'])['capx'].shift(2)
data_rawa['grcapx'] = (data_rawa['capx']-data_rawa['capx_l2'])/data_rawa['capx_l2']

# grGW: goodwill growth; 0 when no goodwill, 1 when goodwill exists but the
# ratio is undefined
data_rawa['gdwl_l1'] = data_rawa.groupby(['permno'])['gdwl'].shift(1)
data_rawa['grGW'] = (data_rawa['gdwl']-data_rawa['gdwl_l1'])/data_rawa['gdwl']
condlist = [(data_rawa['gdwl']==0) | (data_rawa['gdwl'].isnull()),
            (data_rawa['gdwl'].notna()) & (data_rawa['gdwl'] != 0) & (data_rawa['grGW'].isnull())]
choicelist = [0, 1]
data_rawa['grGW'] = np.select(condlist, choicelist, default=data_rawa['grGW'])

# currat
data_rawa['currat'] = data_rawa['act']/data_rawa['lct']

# pchcurrat
data_rawa['pchcurrat'] = ((data_rawa['act']/data_rawa['lct'])-(data_rawa['act_l1']/data_rawa['lct_l1']))\
                         /(data_rawa['act_l1']/data_rawa['lct_l1'])

# quick
data_rawa['quick'] = (data_rawa['act']-data_rawa['invt'])/data_rawa['lct']

# pchquick
data_rawa['pchquick'] = ((data_rawa['act']-data_rawa['invt'])/data_rawa['lct']
                         -(data_rawa['act_l1']-data_rawa['invt_l1'])/data_rawa['lct_l1'])\
                        /((data_rawa['act_l1']-data_rawa['invt_l1'])/data_rawa['lct_l1'])

# salecash
data_rawa['salecash'] = data_rawa['sale']/data_rawa['che']

# salerec
data_rawa['salerec'] = data_rawa['sale']/data_rawa['rect']

# saleinv
data_rawa['saleinv'] = data_rawa['sale']/data_rawa['invt']

# pchsaleinv
data_rawa['pchsaleinv'] = ((data_rawa['sale']/data_rawa['invt'])-(data_rawa['sale_l1']/data_rawa['invt_l1']))\
                          /(data_rawa['sale_l1']/data_rawa['invt_l1'])

# realestate: buildings + capitalized leases over gross (or net) property
data_rawa['realestate'] = (data_rawa['fatb']+data_rawa['fatl'])/data_rawa['ppegt']
data_rawa['realestate'] = np.where(data_rawa['ppegt'].isnull(),
                                   (data_rawa['fatb']+data_rawa['fatl'])/data_rawa['ppent'], data_rawa['realestate'])

# obklg
data_rawa['obklg'] = data_rawa['ob']/((data_rawa['at']+data_rawa['at_l1'])/2)

# chobklg
data_rawa['ob_l1'] = data_rawa.groupby(['permno'])['ob'].shift(1)
data_rawa['chobklg'] = (data_rawa['ob'] - data_rawa['ob_l1'])/((data_rawa['at']+data_rawa['at_l1'])/2)

# grltnoa: growth in long-term net operating assets
data_rawa['aco_l1'] = data_rawa.groupby(['permno'])['aco'].shift(1)
data_rawa['intan_l1'] = data_rawa.groupby(['permno'])['intan'].shift(1)
data_rawa['ao_l1'] = data_rawa.groupby(['permno'])['ao'].shift(1)
data_rawa['ap_l1'] = data_rawa.groupby(['permno'])['ap'].shift(1)
data_rawa['lco_l1'] = data_rawa.groupby(['permno'])['lco'].shift(1)
data_rawa['lo_l1'] = data_rawa.groupby(['permno'])['lo'].shift(1)
data_rawa['rect_l1'] = data_rawa.groupby(['permno'])['rect'].shift(1)

data_rawa['grltnoa'] = ((data_rawa['rect']+data_rawa['invt']+data_rawa['ppent']+data_rawa['aco']+data_rawa['intan']+
                         data_rawa['ao']-data_rawa['ap']-data_rawa['lco']-data_rawa['lo'])
                        -(data_rawa['rect_l1']+data_rawa['invt_l1']+data_rawa['ppent_l1']+data_rawa['aco_l1']
                          +data_rawa['intan_l1']+data_rawa['ao_l1']-data_rawa['ap_l1']-data_rawa['lco_l1']
                          -data_rawa['lo_l1'])
                        -(data_rawa['rect']-data_rawa['rect_l1']+data_rawa['invt']-data_rawa['invt_l1']
                          +data_rawa['aco']-data_rawa['aco_l1']
                          -(data_rawa['ap']-data_rawa['ap_l1']+data_rawa['lco']-data_rawa['lco_l1'])-data_rawa['dp']))\
                       /((data_rawa['at']+data_rawa['at_l1'])/2)

# conv: convertible debt over long-term debt.
# FIX: the funda query pulls `dcvt`; there is no `dc` column, so the
# original raised a KeyError.
data_rawa['conv'] = data_rawa['dcvt']/data_rawa['dltt']

# chdrc: change in deferred revenue over average assets.
# FIX: 'dr' was never constructed; build it as drc + drlt, keeping the value
# when only one component is reported (NaN only when both are missing).
data_rawa['dr'] = data_rawa[['drc', 'drlt']].sum(axis=1, min_count=1)
data_rawa['dr_l1'] = data_rawa.groupby(['permno'])['dr'].shift(1)
data_rawa['chdrc'] = (data_rawa['dr']-data_rawa['dr_l1'])/((data_rawa['at']+data_rawa['at_l1'])/2)

# rdbias
data_rawa['xrd_l1'] = data_rawa.groupby(['permno'])['xrd'].shift(1)
data_rawa['rdbias'] = (data_rawa['xrd']/data_rawa['xrd_l1'])-1-data_rawa['ib']/data_rawa['ceq_l1']

# operprof
data_rawa['operprof'] = (data_rawa['revt']-data_rawa['cogs']-data_rawa['xsga0']-data_rawa['xint0'])/data_rawa['ceq_l1']

# cfroa: cash-flow return on assets; fall back to ib + dp pre-oancf
data_rawa['cfroa'] = data_rawa['oancf']/((data_rawa['at']+data_rawa['at_l1'])/2)
data_rawa['cfroa'] = np.where(data_rawa['oancf'].isnull(),
                              (data_rawa['ib'] + data_rawa['dp'])/((data_rawa['at']+data_rawa['at_l1'])/2),
                              data_rawa['cfroa'])

# xrdint
data_rawa['xrdint'] = data_rawa['xrd']/((data_rawa['at']+data_rawa['at_l1'])/2)

# capxint
data_rawa['capxint'] = data_rawa['capx']/((data_rawa['at']+data_rawa['at_l1'])/2)

# xadint
data_rawa['xadint'] = data_rawa['xad']/((data_rawa['at']+data_rawa['at_l1'])/2)

# chpm
data_rawa['ib_l1'] = data_rawa.groupby(['permno'])['ib'].shift(1)
data_rawa['chpm'] = (data_rawa['ib']/data_rawa['sale'])-(data_rawa['ib_l1']/data_rawa['sale_l1'])

# ala: asset liquidity = cash + 0.75*(other current assets) +
# 0.5*(tangible fixed assets).
# FIX: the tangible-fixed-asset term was subtracted; it carries a +0.5
# weight (the quarterly version of ala below already uses +0.5).
data_rawa['ala'] = data_rawa['che']+0.75*(data_rawa['act']-data_rawa['che'])+\
                   0.5*(data_rawa['at']-data_rawa['act']-data_rawa['gdwl']-data_rawa['intan'])

# alm
data_rawa['alm'] = data_rawa['ala']/(data_rawa['at']+data_rawa['prcc_f']*data_rawa['csho']-data_rawa['ceq'])

# hire
data_rawa['emp_l1'] = data_rawa.groupby(['permno'])['emp'].shift(1)
data_rawa['hire'] = (data_rawa['emp'] - data_rawa['emp_l1'])/data_rawa['emp_l1']
data_rawa['hire'] = np.where((data_rawa['emp'].isnull()) | (data_rawa['emp_l1'].isnull()), 0, data_rawa['hire'])

# herf: industry (FF49) sales Herfindahl, recomputed per datadate
data_rawa['sic'] = data_rawa['sic'].astype(int)
data_rawa['ffi49'] = ffi49(data_rawa)
data_rawa['ffi49'] = data_rawa['ffi49'].fillna(49)
data_rawa['ffi49'] = data_rawa['ffi49'].astype(int)
df_temp = data_rawa.groupby(['datadate', 'ffi49'], as_index=False)['sale'].sum()
df_temp = df_temp.rename(columns={'sale': 'indsale'})
data_rawa = pd.merge(data_rawa, df_temp, how='left', on=['datadate', 'ffi49'])
data_rawa['herf'] = (data_rawa['sale']/data_rawa['indsale'])*(data_rawa['sale']/data_rawa['indsale'])
df_temp = data_rawa.groupby(['datadate', 'ffi49'], as_index=False)['herf'].sum()
data_rawa = data_rawa.drop(['herf'], axis=1)
data_rawa = pd.merge(data_rawa, df_temp, how='left', on=['datadate', 'ffi49'])

################################## Added on 2020.10.29 ##################################
# Bmj
data_rawa['be_per'] = data_rawa['be'] / data_rawa['csho']
# FIX: CRSP prc is negative when it is a bid/ask average; use its absolute
# value (the rest of the file already uses abs(prc) for market equity).
data_rawa['bmj'] = data_rawa['be_per'] / data_rawa['prc'].abs()
############### *Q*: used prc as share price from crsp ##########

# Cp
data_rawa['cf'] = data_rawa['ib'] + data_rawa['dp']
#data_rawa['cp'] = data_rawa['cf'] / data_rawa['me']

# Dp
###### *Q* difference return with without divident

# Dur
# me = data_rawa['me_comp']


# Ebp
data_rawa['dvpa'] = np.where(data_rawa['dvpa'].isnull(), 0, data_rawa['dvpa'])
data_rawa['tstkp'] = np.where(data_rawa['tstkp'].isnull(), 0, data_rawa['tstkp'])
data_rawa['f_liab'] = data_rawa['dltt'] + data_rawa['dlc'] + data_rawa['pstk'] + data_rawa['dvpa'] - data_rawa['tstkp']
data_rawa['f_asse'] = data_rawa['che']
# net debt : = financial liabilities - financial assets.
# ---------------------------------------------------------------------------
# Annual characteristics, part 3 (Ebp, investment and profitability additions)
# plus the Compustat quarterly pull and CRSP/CCM merge.
# FIX (systematic): all first-differences below used bare Series.shift(),
# which on a stacked firm panel pulls the previous *firm's* last observation
# into each firm's first rows.  Every lag is now taken within permno via
# groupby('permno'), matching the rest of the file.
# ---------------------------------------------------------------------------
data_rawa['n_debt'] = data_rawa['f_liab'] - data_rawa['f_asse']
data_rawa['ber'] = data_rawa['ceq'] + data_rawa['tstkp'] - data_rawa['dvpa']
#data_rawa['ebp'] = (data_rawa['n_debt']+data_rawa['ber']) / (data_rawa['n_debt']+data_rawa['me'])


# Em
#data_rawa['enteprs_v'] = data_rawa['me'] + data_rawa['dlc'] + data_rawa['dltt'] + data_rawa['pstkrv'] - data_rawa['che']
#data_rawa['em'] = data_rawa['enteprs_v'] / data_rawa['oibdp']

############### Investment ###############
# Aci: abnormal capex relative to the prior three-year average capex/sales
data_rawa['ce'] = data_rawa['capx'] / data_rawa['sale']
data_rawa['ce1'] = data_rawa.groupby(['permno'])['ce'].shift(1)
data_rawa['ce2'] = data_rawa.groupby(['permno'])['ce'].shift(2)
data_rawa['ce3'] = data_rawa.groupby(['permno'])['ce'].shift(3)
data_rawa['aci'] = data_rawa['ce']/ (data_rawa['ce1']+data_rawa['ce2']+data_rawa['ce3'])-1

# Cei
#data_rawa['lg_me'] = np.log(data_rawa['me']/data_rawa['me'].shift(6))
#data_rawa['lg_ret'] = np.log(data_rawa['ret']*data_rawa['ret'].shift(1)*data_rawa['ret'].shift(2)*data_rawa['ret'].shift(3)*data_rawa['ret'].shift(5)*data_rawa['ret'].shift(6))
#data_rawa['cei'] = data_rawa['lg_me'] - data_rawa['lg_ret']


# Dac



# dCoa: change in current operating assets over lagged assets
data_rawa['coa'] = data_rawa['act'] - data_rawa['che']
data_rawa['dcoa'] = (data_rawa['coa']-data_rawa.groupby(['permno'])['coa'].shift(1)) / data_rawa['at_l1']


# dBe: change in book equity over lagged assets
data_rawa['dBe'] = (data_rawa['ceq'] - data_rawa['ceq_l1']) / data_rawa['at_l1']


# dFnl & dFin: changes in financial liabilities / net financial assets
data_rawa['fna'] = data_rawa['ivst'] + data_rawa['ivao']
data_rawa['fnl'] = data_rawa['dltt'] + data_rawa['dlc'] + data_rawa['pstk']

data_rawa['d_dlc'] = data_rawa['dlc'] - data_rawa['dlc_l1']
data_rawa['d_dlc'] = np.where(data_rawa['d_dlc'].isnull(), 0, data_rawa['d_dlc'])
data_rawa['d_pstk'] = data_rawa['pstk'] - data_rawa.groupby(['permno'])['pstk'].shift(1)
data_rawa['d_pstk'] = np.where(data_rawa['d_pstk'].isnull(), 0, data_rawa['d_pstk'])

data_rawa['dfnl'] = (data_rawa['dltt']-data_rawa.groupby(['permno'])['dltt'].shift(1)) + data_rawa['d_dlc'] + data_rawa['d_pstk']

data_rawa['d_ivst'] = data_rawa['ivst'] - data_rawa.groupby(['permno'])['ivst'].shift(1)
data_rawa['d_ivst'] = np.where(data_rawa['d_ivst'].isnull(), 0, data_rawa['d_ivst'])
data_rawa['d_ivao'] = data_rawa['ivao'] - data_rawa.groupby(['permno'])['ivao'].shift(1)
data_rawa['d_ivao'] = np.where(data_rawa['d_ivao'].isnull(), 0, data_rawa['d_ivao'])

data_rawa['dfna'] = data_rawa['d_ivst'] + data_rawa['d_ivao']
data_rawa['dfin'] = data_rawa['dfna'] - data_rawa['dfnl']

data_rawa['dfin'] = data_rawa['dfin'] / data_rawa['at_l1']
data_rawa['dfnl'] = data_rawa['dfnl'] / data_rawa['at_l1']




# dIi: firm investment growth minus industry (sic2) investment growth
data_rawa['e_invt'] = (data_rawa['capxv'] + data_rawa.groupby(['permno'])['capxv'].shift(1))/2
data_rawa['dinvt'] = (data_rawa['capxv'] - data_rawa['e_invt']) / data_rawa['e_invt']

data_rawa['ind'] = data_rawa['capxv']
# FIX: the groupby sum is a MultiIndex Series; merging it with on= failed
# (the keys live in its index) and the default inner join would drop rows.
# Aggregate to a frame, name the column ind_y (the name downstream code
# expects), and left-join.
ind_sum = data_rawa.groupby(['jdate', 'sic2'], as_index=False)['ind'].sum().rename(columns={'ind': 'ind_y'})
data_rawa = pd.merge(data_rawa, ind_sum, how='left', on=['jdate', 'sic2'])
# new industry investment is named ind_y, cause it's been grouped by ind
data_rawa['e_ind'] = (data_rawa['ind_y'] + data_rawa.groupby(['permno'])['ind_y'].shift(1))/2
data_rawa['dind'] = (data_rawa['ind_y']-data_rawa['e_ind']) / data_rawa['e_ind']
data_rawa['dIi'] = data_rawa['dinvt'] - data_rawa['dind']

# dLno: change in long-term net operating assets over average assets
data_rawa['dlno'] = (data_rawa['ppent']-data_rawa['ppent_l1']) \
                    + (data_rawa['intan']-data_rawa['intan_l1']) \
                    + (data_rawa['ao']-data_rawa['ao_l1']) \
                    - (data_rawa['lo']-data_rawa['lo_l1']) + data_rawa['dp']
# FIX: the original built avg_at with an O(n^2) loop taking the expanding
# mean of *all* firms' assets from row 0; the intended scale is the firm's
# own average total assets, (at + at_l1)/2.
data_rawa['avg_at'] = (data_rawa['at'] + data_rawa['at_l1'])/2
data_rawa['dlno'] = data_rawa['dlno'] / data_rawa['avg_at']


# dNco (raw items; NaN in ivao/dltt propagates here by design — the filled
# variant follows in dNca)
data_rawa['nca'] = data_rawa['at'] - data_rawa['act'] - data_rawa['ivao']
data_rawa['ncl'] = data_rawa['lt'] - data_rawa['lct'] - data_rawa['dltt']
data_rawa['nco'] = data_rawa['nca'] - data_rawa['ncl']
data_rawa['dnco'] = data_rawa['nco'] - data_rawa.groupby(['permno'])['nco'].shift(1)


# dNca: same construction with missing ivao/dltt treated as zero
data_rawa['ivao_0'] = np.where(data_rawa['ivao'].isnull(), 0, data_rawa['ivao'])
data_rawa['dltt_0'] = np.where(data_rawa['dltt'].isnull(), 0, data_rawa['dltt'])

data_rawa['nca'] = data_rawa['at'] - data_rawa['act'] - data_rawa['ivao_0']
data_rawa['ncl'] = data_rawa['lt'] - data_rawa['lct'] - data_rawa['dltt_0']
data_rawa['nco'] = data_rawa['nca'] - data_rawa['ncl']
data_rawa['dnca'] = data_rawa['nco'] - data_rawa.groupby(['permno'])['nco'].shift(1)



# dNoa: change in net operating assets over lagged assets
data_rawa['dlc_0'] = np.where(data_rawa['dlc'].isnull(), 0, data_rawa['dlc'])
data_rawa['mib_0'] = np.where(data_rawa['mib'].isnull(), 0, data_rawa['mib'])
data_rawa['pstk_0'] = np.where(data_rawa['pstk'].isnull(), 0, data_rawa['pstk'])

data_rawa['op_at'] = data_rawa['at'] - data_rawa['che']
data_rawa['op_lia'] = data_rawa['at'] - data_rawa['dlc_0'] - data_rawa['dltt_0'] - data_rawa['mib_0'] - data_rawa['pstk_0'] - data_rawa['ceq']
data_rawa['net_op'] = data_rawa['op_at'] - data_rawa['op_lia']
data_rawa['dnoa'] = (data_rawa['net_op']-data_rawa.groupby(['permno'])['net_op'].shift(1))/ data_rawa['at_l1']


# dPia: change in property + inventory over lagged assets
data_rawa['c_propty'] = data_rawa['ppegt'] - data_rawa.groupby(['permno'])['ppegt'].shift(1)
data_rawa['c_invt'] = data_rawa['invt'] - data_rawa['invt_l1']
data_rawa['dpia'] = (data_rawa['c_propty'] + data_rawa['c_invt']) / data_rawa['at_l1']





######### Profitability ##########
# Ato,repeated
#data_rawa['op_at'] = data_rawa['at'] - data_rawa['che'] - data_rawa['ivao_0']
#data_rawa['op_lia'] = data_rawa['dlc_0'] - data_rawa['dltt_0'] - data_rawa['mib_0'] - data_rawa['pstk_0'] - data_rawa['ceq']
#data_rawa['noa'] = data_rawa['op_at'] - data_rawa['op_lia']
#data_rawa['ato'] = data_rawa['sale'] / data_rawa['noa'].shift(1)


# Cla: cash-based operating profitability (accrual adjustments treated as
# zero when missing)
data_rawa['d_rect'] = data_rawa['rect'] - data_rawa['rect_l1']
data_rawa['d_invt'] = data_rawa['invt'] - data_rawa['invt_l1']
data_rawa['d_xpp'] = data_rawa['xpp'] - data_rawa.groupby(['permno'])['xpp'].shift(1)
data_rawa['d_dr'] = (data_rawa['drc']-data_rawa.groupby(['permno'])['drc'].shift(1)) \
                    + (data_rawa['drlt']-data_rawa.groupby(['permno'])['drlt'].shift(1))
data_rawa['d_ap'] = data_rawa['ap'] - data_rawa['ap_l1']
data_rawa['d_xacc'] = data_rawa['xacc'] - data_rawa.groupby(['permno'])['xacc'].shift(1)

data_rawa['xrd_0'] = np.where(data_rawa['xrd'].isnull(), 0, data_rawa['xrd'])
data_rawa['d_rect_0'] = np.where(data_rawa['d_rect'].isnull(), 0, data_rawa['d_rect'])
data_rawa['d_invt_0'] = np.where(data_rawa['d_invt'].isnull(), 0, data_rawa['d_invt'])
data_rawa['d_xpp_0'] = np.where(data_rawa['d_xpp'].isnull(), 0, data_rawa['d_xpp'])
data_rawa['d_dr_0'] = np.where(data_rawa['d_dr'].isnull(), 0, data_rawa['d_dr'])
data_rawa['d_ap_0'] = np.where(data_rawa['d_ap'].isnull(), 0, data_rawa['d_ap'])
data_rawa['d_xacc_0'] = np.where(data_rawa['d_xacc'].isnull(), 0, data_rawa['d_xacc'])

data_rawa['cla'] = data_rawa['revt'] - data_rawa['cogs'] - data_rawa['xsga'] + data_rawa['xrd_0']\
                   - data_rawa['d_rect_0'] - data_rawa['d_invt_0'] - data_rawa['d_xpp_0']\
                   + data_rawa['d_dr_0'] + data_rawa['d_ap_0'] + data_rawa['d_xacc_0']
data_rawa['cla'] = data_rawa['cla'] / data_rawa['at_l1']


# Cop: same numerator scaled by current assets
data_rawa['cop'] = data_rawa['revt'] - data_rawa['cogs'] - data_rawa['xsga'] + data_rawa['xrd_0']\
                   - data_rawa['d_rect_0'] - data_rawa['d_invt_0'] - data_rawa['d_xpp_0']\
                   + data_rawa['d_dr_0'] + data_rawa['d_ap_0'] + data_rawa['d_xacc_0']
data_rawa['cop'] = data_rawa['cop'] / data_rawa['at']


# Cto
data_rawa['cto'] = data_rawa['sale'] / data_rawa['at_l1']

#ir
'''
#First calculate r(t-5,t). Then rb(t-5,t) and use Bm to perform linear regression and get residue
'''
#r(t-5,t):sum ret from t-5 to t (which is calendar year t-6 to t-1)
lag = pd.DataFrame()
for i in range(1,6):
    lag['ret%s' % i] = data_rawa.groupby(['permno'])['ret'].shift(i)

data_rawa['ret5'] = lag['ret1']+lag['ret2']+lag['ret3']+lag['ret4']+lag['ret5']

#bm_t-5 (bm of year t-5)
#data_rawa['bm5'] = data_rawa.groupby(['permno'])['bm'].shift(5)

#rB (five year log book return)
#Reference: jf_06 page8 by KENT DANIEL
#data_rawa['rB'] = data_rawa['bm'] - data_rawa['bm5'] + data_rawa['ret5']

#Regression and get ir
#First get unique datelist
#datelist = data_rawa['jdate'].unique()
#for date in datelist:
#    temp = data_rawa['jdate' == date]
#    n_row = temp.shape[0]
#    index = temp.index
#    X = pd.DataFrame()
#    X['bm5'] = temp['bm5']
#    X['rB'] = temp['rB']
#    X['intercept'] = 1
#    X = X[['intercept','rB','bm5']]
#    X = np.mat(X)
#    Y = np.mat(temp[['ret5']])
#    #These are residuals on one date
#    res = (np.identity(n_row) - X.dot(X.T.dot(X).I).dot(X.T)).dot(Y)
#    #put residuals back into data_rawa
#    data_rawa.loc[index,'ir'] = res

#nop
#data_rawa['net_p'] = data_rawa['dvc'] + data_rawa['prstkc'] + 2*data_rawa['pstkrv'] - data_rawa['sstk']
#data_rawa['nop'] = data_rawa['net_p'] / data_rawa['me']
#data_rawa['nop'] = np.where(data_rawa['nop']<=0, np.nan, data_rawa['nop'] )

#ocp
#data_rawa['ocy'] = np.where(data_rawa['jdate'] < '1988-06-30', data_rawa['fopt'] - data_rawa['wcap'], data_rawa['fopt'] - data_rawa['oancf'])
#data_rawa['ocp'] = data_rawa['ocy'] / data_rawa['me']
#data_rawa['ocp'] = np.where(data_rawa['ocp']<=0, np.nan, data_rawa['ocp'] )

#dwc
data_rawa['dwc'] = (data_rawa['act'] - data_rawa['che']) - (data_rawa['lct'] - data_rawa['dlc'])
#data_rawa['dwc'] = data_rawa['dwc']/data_rawa['at_l1']

#I/A
data_rawa['ia'] = (data_rawa['at']/data_rawa['at_l1'])-1

#Ig
data_rawa['capx_l1'] = data_rawa.groupby('permno')['capx'].shift(1)
data_rawa['ig'] = data_rawa['capx']/data_rawa['capx_l1']

#2Ig
data_rawa['capx_l2'] = data_rawa.groupby('permno')['capx'].shift(2)
data_rawa['2ig'] = data_rawa['capx']/data_rawa['capx_l2']

#Ivc
# NOTE(review): this scales the inventory *level* by average assets;
# Ivc in the literature usually uses the inventory *change* — confirm intended.
data_rawa['atAvg'] = (data_rawa['at']+data_rawa['at_l1'])/2
data_rawa['ivc'] = data_rawa['invt'] / data_rawa['atAvg']

#Ndf
# NOTE(review): dltis/dltr/dlcch come from the funda query above, which
# contains a typo in its select list ("f.dltr." instead of "f.dltr,") —
# verify those columns actually load.
data_rawa['ndf'] = data_rawa['dltis'] - data_rawa['dltr'] + data_rawa['dlcch']

#nsi
data_rawa['sps'] = data_rawa['csho'] * data_rawa['ajex']
data_rawa['sps_l1'] = data_rawa.groupby('permno')['sps'].shift(1)
data_rawa['nsi'] = np.log(data_rawa['sps']/data_rawa['sps_l1'])

#oa
data_rawa['txp'] = np.where(data_rawa['txp'].isnull(), 0, data_rawa['txp'])
data_rawa['oa'] = (data_rawa['act'] - data_rawa['che']) - (data_rawa['lct'] - data_rawa['dlc'] - data_rawa['txp']) - data_rawa['dp']

#Poa
data_rawa['poa'] = data_rawa['oa']/data_rawa['ni']

#Ta
# NOTE(review): dwc and dnco are unscaled here while dfin is scaled by
# at_l1 — confirm the intended scaling of total accruals.
data_rawa['ta'] = data_rawa['dwc'] + data_rawa['dnco'] + data_rawa['dfin']

#Ol
data_rawa['ol'] = (data_rawa['cogs'] + data_rawa['xsga'])/data_rawa['at']

#etr
data_rawa['txtpi'] = data_rawa['txt'] / data_rawa['pi']
data_rawa['txtpi_l1'] = data_rawa.groupby('permno')['txtpi'].shift(1)
data_rawa['txtpi_l2'] = data_rawa.groupby('permno')['txtpi'].shift(2)
data_rawa['txtpi_l3'] = data_rawa.groupby('permno')['txtpi'].shift(3)
data_rawa['deps'] = data_rawa['epspx']/(data_rawa['ajex'] * data_rawa['prcc_f'])
data_rawa['etr'] = (data_rawa['txtpi'] - (data_rawa['txtpi_l1'] + data_rawa['txtpi_l2'] + data_rawa['txtpi_l3'])/3) * data_rawa['deps']

print('annual')
#######################################################################################################################
#                                            Compustat Quarterly Raw Info                                             #
#######################################################################################################################
comp = conn.raw_sql("""
    /*header info*/
    select c.gvkey, f.cusip, f.datadate, f.fyearq, substr(c.sic,1,2) as sic2, c.sic, f.fqtr, f.rdq,

    /*income statement*/
    f.ibq, f.saleq, f.txtq, f.revtq, f.cogsq, f.xsgaq, f.revty, f.cogsy, f.saley,

    /*balance sheet items*/
    f.atq, f.actq, f.cheq, f.lctq, f.dlcq, f.ppentq, f.ppegtq,

    /*others*/
    abs(f.prccq) as prccq, abs(f.prccq)*f.cshoq as mveq_f, f.ceqq, f.seqq, f.pstkq, f.ltq,
    f.pstkrq, f.gdwlq, f.intanq, f.mibq, f.oiadpq, f.ivaoq,

    /* v3 my formula add*/
    f.ajexq, f.cshoq, f.txditcq, f.npq, f.xrdy, f.xrdq, f.dpq, f.xintq, f.invtq, f.scstkcy, f.niq,
    f.oancfy, f.dlttq, f.rectq, f.acoq, f.apq, f.lcoq, f.loq, f.aoq

    from comp.fundq as f
    left join comp.company as c
    on f.gvkey = c.gvkey

    /*get consolidated, standardized, industrial format statements*/
    where f.indfmt = 'INDL'
    and f.datafmt = 'STD'
    and f.popsrc = 'D'
    and f.consol = 'C'
    and f.datadate >= '01/01/1959'
    """)

# comp['cusip6'] = comp['cusip'].str.strip().str[0:6]
comp = comp.dropna(subset=['ibq'])

# sort and clean up
comp = comp.sort_values(by=['gvkey', 'datadate']).drop_duplicates()
comp['cshoq'] = np.where(comp['cshoq'] == 0, np.nan, comp['cshoq'])
comp['ceqq'] = np.where(comp['ceqq'] == 0, np.nan, comp['ceqq'])
comp['atq'] = np.where(comp['atq'] == 0, np.nan, comp['atq'])
comp = comp.dropna(subset=['atq'])

# convert datadate to date fmt
comp['datadate'] = pd.to_datetime(comp['datadate'])

# merge ccm and comp
ccm1 = pd.merge(comp, ccm, how='left', on=['gvkey'])
ccm1['yearend'] = ccm1['datadate'] + YearEnd(0)
ccm1['jdate'] = ccm1['datadate'] + MonthEnd(3)  # we change quarterly lag here
# ccm1['jdate'] = ccm1['datadate']+MonthEnd(4)

# set link date bounds
ccm2 = ccm1[(ccm1['jdate'] >= ccm1['linkdt']) & (ccm1['jdate'] <= ccm1['linkenddt'])]

# merge ccm2 and crsp2
# crsp2['jdate'] = crsp2['monthend']
data_rawq = pd.merge(crsp2, ccm2, how='inner', on=['permno', 'jdate'])

# filter exchcd & shrcd
data_rawq = data_rawq[((data_rawq['exchcd'] == 1) | (data_rawq['exchcd'] == 2) | (data_rawq['exchcd'] == 3)) &
                      ((data_rawq['shrcd'] == 10) | (data_rawq['shrcd'] == 11))]
+ +# process Market Equity +''' +Note: me is CRSP market equity, mveq_f is Compustat market equity. Please choose the me below. +''' +data_rawq['me'] = data_rawq['me']/1000 # CRSP ME +# data_rawq['me'] = data_rawq['mveq_f'] # Compustat ME + +# there are some ME equal to zero since this company do not have price or shares data, we drop these observations +data_rawq['me'] = np.where(data_rawq['me'] == 0, np.nan, data_rawq['me']) +data_rawq = data_rawq.dropna(subset=['me']) + +# count single stock years +# data_rawq['count'] = data_rawq.groupby(['gvkey']).cumcount() + +# deal with the duplicates +data_rawq.loc[data_rawq.groupby(['datadate', 'permno', 'linkprim'], as_index=False).nth([0]).index, 'temp'] = 1 +data_rawq = data_rawq[data_rawq['temp'].notna()] +data_rawq.loc[data_rawq.groupby(['permno', 'yearend', 'datadate'], as_index=False).nth([-1]).index, 'temp'] = 1 +data_rawq = data_rawq[data_rawq['temp'].notna()] + +data_rawq = data_rawq.sort_values(by=['permno', 'jdate']) +print('quarterly raw') +####################################################################################################################### +# Quarterly Variables # +####################################################################################################################### +# prepare be +data_rawq['beq'] = np.where(data_rawq['seqq']>0, data_rawq['seqq']+data_rawq['txditcq']-data_rawq['pstkq'], np.nan) +data_rawq['beq'] = np.where(data_rawq['beq']<=0, np.nan, data_rawq['beq']) + +# dy +# data_rawq['me_l1'] = data_rawq.groupby(['permno'])['me'].shift(1) +# data_rawq['retdy'] = data_rawq['ret'] - data_rawq['retx'] +# data_rawq['mdivpay'] = data_rawq['retdy']*data_rawq['me_l1'] +# +# data_rawq['dy'] = ttm12(series='mdivpay', df=data_rawq)/data_rawq['me'] + +# chtx +data_rawq['txtq_l4'] = data_rawq.groupby(['permno'])['txtq'].shift(4) +data_rawq['atq_l4'] = data_rawq.groupby(['permno'])['atq'].shift(4) +data_rawq['chtx'] = (data_rawq['txtq']-data_rawq['txtq_l4'])/data_rawq['atq_l4'] + 
# ---------------------------------------------------------------------------
# Quarterly characteristics, part 1.  *_l1/_l4/_l8 columns are 1/4/8-quarter
# firm-level lags; ttm4 (from functions) sums the trailing four quarters.
# ---------------------------------------------------------------------------
# roa
data_rawq['atq_l1'] = data_rawq.groupby(['permno'])['atq'].shift(1)
data_rawq['roa'] = data_rawq['ibq']/data_rawq['atq_l1']

# cash
data_rawq['cash'] = data_rawq['cheq']/data_rawq['atq']

# acc
# NOTE(review): the 10*beq scale is unusual (annual acc uses average assets)
# — confirm it is intentional.
data_rawq['actq_l4'] = data_rawq.groupby(['permno'])['actq'].shift(4)
data_rawq['lctq_l4'] = data_rawq.groupby(['permno'])['lctq'].shift(4)
data_rawq['npq_l4'] = data_rawq.groupby(['permno'])['npq'].shift(4)
condlist = [data_rawq['npq'].isnull(),
            data_rawq['actq'].isnull() | data_rawq['lctq'].isnull()]
choicelist = [((data_rawq['actq']-data_rawq['lctq'])-(data_rawq['actq_l4']-data_rawq['lctq_l4']))/(10*data_rawq['beq']),
              np.nan]
data_rawq['acc'] = np.select(condlist, choicelist,
                             default=((data_rawq['actq']-data_rawq['lctq']+data_rawq['npq'])-
                                      (data_rawq['actq_l4']-data_rawq['lctq_l4']+data_rawq['npq_l4']))/(10*data_rawq['beq']))

# bm
# data_rawq['bm'] = data_rawq['beq']/data_rawq['me']

# cfp
data_rawq['ibq4'] = ttm4('ibq', data_rawq)
data_rawq['dpq4'] = ttm4('dpq', data_rawq)
# data_rawq['cfp'] = np.where(data_rawq['dpq'].isnull(),
#                             data_rawq['ibq4']/data_rawq['me'],
#                             (data_rawq['ibq4']+data_rawq['dpq4'])/data_rawq['me'])

# ep
# data_rawq['ep'] = data_rawq['ibq4']/data_rawq['me']

# agr
data_rawq['agr'] = (data_rawq['atq']-data_rawq['atq_l4'])/data_rawq['atq_l4']

# ni: split-adjusted share growth
# NOTE(review): only the current-period log is clamped with replace(-inf, 0)
# — confirm the asymmetric treatment of the lagged term is intended.
data_rawq['cshoq_l4'] = data_rawq.groupby(['permno'])['cshoq'].shift(4)
data_rawq['ajexq_l4'] = data_rawq.groupby(['permno'])['ajexq'].shift(4)
data_rawq['ni'] = np.where(data_rawq['cshoq'].isnull(), np.nan,
                           np.log(data_rawq['cshoq']*data_rawq['ajexq']).replace(-np.inf, 0)-np.log(data_rawq['cshoq_l4']*data_rawq['ajexq_l4']))

# op
data_rawq['xintq0'] = np.where(data_rawq['xintq'].isnull(), 0, data_rawq['xintq'])
data_rawq['xsgaq0'] = np.where(data_rawq['xsgaq'].isnull(), 0, data_rawq['xsgaq'])
data_rawq['beq_l4'] = data_rawq.groupby(['permno'])['beq'].shift(4)

data_rawq['op'] = (ttm4('revtq', data_rawq)-ttm4('cogsq', data_rawq)-ttm4('xsgaq0', data_rawq)-ttm4('xintq0', data_rawq))/data_rawq['beq_l4']

# csho
data_rawq['chcsho'] = (data_rawq['cshoq']/data_rawq['cshoq_l4'])-1

# cashdebt
data_rawq['ltq_l4'] = data_rawq.groupby(['permno'])['ltq'].shift(4)
data_rawq['cashdebt'] = (ttm4('ibq', data_rawq) + ttm4('dpq', data_rawq))/((data_rawq['ltq']+data_rawq['ltq_l4'])/2)

# rd
data_rawq['xrdq4'] = ttm4('xrdq', data_rawq)
data_rawq['xrdq4'] = np.where(data_rawq['xrdq4'].isnull(), data_rawq['xrdy'], data_rawq['xrdq4'])

data_rawq['xrdq4/atq_l4'] = data_rawq['xrdq4']/data_rawq['atq_l4']
data_rawq['xrdq4/atq_l4_l4'] = data_rawq.groupby(['permno'])['xrdq4/atq_l4'].shift(4)
data_rawq['rd'] = np.where(((data_rawq['xrdq4']/data_rawq['atq'])-data_rawq['xrdq4/atq_l4_l4'])/data_rawq['xrdq4/atq_l4_l4']>0.05, 1, 0)

# pctacc
condlist = [data_rawq['npq'].isnull(),
            data_rawq['actq'].isnull() | data_rawq['lctq'].isnull()]
choicelist = [((data_rawq['actq']-data_rawq['lctq'])-(data_rawq['actq_l4']-data_rawq['lctq_l4']))/abs(ttm4('ibq', data_rawq)), np.nan]
data_rawq['pctacc'] = np.select(condlist, choicelist,
                                default=((data_rawq['actq']-data_rawq['lctq']+data_rawq['npq'])-(data_rawq['actq_l4']-data_rawq['lctq_l4']+data_rawq['npq_l4']))/
                                abs(ttm4('ibq', data_rawq)))

# gma
data_rawq['revtq4'] = ttm4('revtq', data_rawq)
data_rawq['cogsq4'] = ttm4('cogsq', data_rawq)
data_rawq['gma'] = (data_rawq['revtq4']-data_rawq['cogsq4'])/data_rawq['atq_l4']

# lev
# data_rawq['lev'] = data_rawq['ltq']/data_rawq['me']

# rdm
# data_rawq['rdm'] = data_rawq['xrdq4']/data_rawq['me']

# sgr
data_rawq['saleq4'] = ttm4('saleq', data_rawq)
data_rawq['saleq4'] = np.where(data_rawq['saleq4'].isnull(), data_rawq['saley'], data_rawq['saleq4'])

data_rawq['saleq4_l4'] = data_rawq.groupby(['permno'])['saleq4'].shift(4)
data_rawq['sgr'] = (data_rawq['saleq4']/data_rawq['saleq4_l4'])-1

# sp
# data_rawq['sp'] = data_rawq['saleq4']/data_rawq['me']

# invest
data_rawq['ppentq_l4'] = data_rawq.groupby(['permno'])['ppentq'].shift(4)
data_rawq['invtq_l4'] = data_rawq.groupby(['permno'])['invtq'].shift(4)
data_rawq['ppegtq_l4'] = data_rawq.groupby(['permno'])['ppegtq'].shift(4)

data_rawq['invest'] = np.where(data_rawq['ppegtq'].isnull(), ((data_rawq['ppentq']-data_rawq['ppentq_l4'])+
                                                              (data_rawq['invtq']-data_rawq['invtq_l4']))/data_rawq['atq_l4'],
                               ((data_rawq['ppegtq']-data_rawq['ppegtq_l4'])+(data_rawq['invtq']-data_rawq['invtq_l4']))/data_rawq['atq_l4'])

# rd_sale
data_rawq['rd_sale'] = data_rawq['xrdq4']/data_rawq['saleq4']

# lgr
data_rawq['lgr'] = (data_rawq['ltq']/data_rawq['ltq_l4'])-1

# depr
data_rawq['depr'] = ttm4('dpq', data_rawq)/data_rawq['ppentq']

# egr
data_rawq['ceqq_l4'] = data_rawq.groupby(['permno'])['ceqq'].shift(4)
data_rawq['egr'] = (data_rawq['ceqq']-data_rawq['ceqq_l4'])/data_rawq['ceqq_l4']

# chpm
data_rawq['ibq4_l1'] = data_rawq.groupby(['permno'])['ibq4'].shift(1)
data_rawq['saleq4_l1'] = data_rawq.groupby(['permno'])['saleq4'].shift(1)

data_rawq['chpm'] = (data_rawq['ibq4']/data_rawq['saleq4'])-(data_rawq['ibq4_l1']/data_rawq['saleq4_l1'])

# chato
data_rawq['atq_l8'] = data_rawq.groupby(['permno'])['atq'].shift(8)
data_rawq['chato'] = (data_rawq['saleq4']/((data_rawq['atq']+data_rawq['atq_l4'])/2))-(data_rawq['saleq4_l4']/((data_rawq['atq_l4']+data_rawq['atq_l8'])/2))

# noa: net operating assets scaled by lagged assets.
# FIX 1: the original replaced each balance-sheet item with a 0/1 indicator
# (np.where(x.isnull(), 0, 1)), destroying the levels — dlcq is reused later
# in the accrual section.  Missing values should simply be treated as zero.
# FIX 2: as in the annual code, the whole (operating assets - operating
# liabilities) difference is scaled by atq_l4, not just the liability term.
data_rawq['ivaoq'] = np.where(data_rawq['ivaoq'].isnull(), 0, data_rawq['ivaoq'])
data_rawq['dlcq'] = np.where(data_rawq['dlcq'].isnull(), 0, data_rawq['dlcq'])
data_rawq['dlttq'] = np.where(data_rawq['dlttq'].isnull(), 0, data_rawq['dlttq'])
data_rawq['mibq'] = np.where(data_rawq['mibq'].isnull(), 0, data_rawq['mibq'])
data_rawq['pstkq'] = np.where(data_rawq['pstkq'].isnull(), 0, data_rawq['pstkq'])
data_rawq['noa'] = ((data_rawq['atq']-data_rawq['cheq']-data_rawq['ivaoq'])-
                    (data_rawq['atq']-data_rawq['dlcq']-data_rawq['dlttq']-data_rawq['mibq']-data_rawq['pstkq']-data_rawq['ceqq']))/data_rawq['atq_l4']

# rna
data_rawq['noa_l4'] = data_rawq.groupby(['permno'])['noa'].shift(4)
data_rawq['rna'] = data_rawq['oiadpq']/data_rawq['noa_l4']

# pm
data_rawq['pm'] = data_rawq['oiadpq']/data_rawq['saleq']

# ato
data_rawq['ato'] = data_rawq['saleq']/data_rawq['noa_l4']

# roe
data_rawq['ceqq_l1'] = data_rawq.groupby(['permno'])['ceqq'].shift(1)
data_rawq['roe'] = data_rawq['ibq']/data_rawq['ceqq_l1']

################################## New Added ##################################

# grltnoa: growth in long-term net operating assets (quarterly).
# FIX: the lagged-level sum omitted intanq_l4 and aoq_l4 (never even
# computed) and the change term omitted acoq_l4 — the formula now mirrors
# the annual grltnoa above.
data_rawq['rectq_l4'] = data_rawq.groupby(['permno'])['rectq'].shift(4)
data_rawq['acoq_l4'] = data_rawq.groupby(['permno'])['acoq'].shift(4)
data_rawq['intanq_l4'] = data_rawq.groupby(['permno'])['intanq'].shift(4)
data_rawq['aoq_l4'] = data_rawq.groupby(['permno'])['aoq'].shift(4)
data_rawq['apq_l4'] = data_rawq.groupby(['permno'])['apq'].shift(4)
data_rawq['lcoq_l4'] = data_rawq.groupby(['permno'])['lcoq'].shift(4)
data_rawq['loq_l4'] = data_rawq.groupby(['permno'])['loq'].shift(4)
data_rawq['invtq_l4'] = data_rawq.groupby(['permno'])['invtq'].shift(4)
data_rawq['ppentq_l4'] = data_rawq.groupby(['permno'])['ppentq'].shift(4)
data_rawq['atq_l4'] = data_rawq.groupby(['permno'])['atq'].shift(4)

data_rawq['grltnoa'] = ((data_rawq['rectq']+data_rawq['invtq']+data_rawq['ppentq']+data_rawq['acoq']+data_rawq['intanq']+
                         data_rawq['aoq']-data_rawq['apq']-data_rawq['lcoq']-data_rawq['loq'])-
                        (data_rawq['rectq_l4']+data_rawq['invtq_l4']+data_rawq['ppentq_l4']+data_rawq['acoq_l4']
                         +data_rawq['intanq_l4']+data_rawq['aoq_l4']-data_rawq['apq_l4']-data_rawq['lcoq_l4']-data_rawq['loq_l4'])-
                        (data_rawq['rectq']-data_rawq['rectq_l4']+data_rawq['invtq']-data_rawq['invtq_l4']
                         +data_rawq['acoq']-data_rawq['acoq_l4']-
                         (data_rawq['apq']-data_rawq['apq_l4']+data_rawq['lcoq']-data_rawq['lcoq_l4'])-
                         ttm4('dpq', data_rawq)))/((data_rawq['atq']+data_rawq['atq_l4'])/2)

# scal
# condlist = [data_rawq['seqq'].isnull(),
#             data_rawq['seqq'].isnull() & (data_rawq['ceqq'].isnull() | data_rawq['pstk'].isnull())]
# choicelist = [data_rawq['ceqq']+data_rawq['pstk'],
#               data_rawq['atq']-data_rawq['ltq']]
# data_rawq['scal'] = np.select(condlist, choicelist, default=data_rawq['seqq'])

# ala: asset liquidity (cash + 0.75*other current assets + 0.5*tangible
# fixed assets)
data_rawq['ala'] = data_rawq['cheq'] + 0.75*(data_rawq['actq']-data_rawq['cheq'])+\
                   0.5*(data_rawq['atq']-data_rawq['actq']-data_rawq['gdwlq']-data_rawq['intanq'])

# alm
# data_rawq['alm'] = data_rawq['ala']/(data_rawq['atq']+data_rawq['me']-data_rawq['ceqq'])

# rsup
data_rawq['saleq_l4'] = data_rawq.groupby(['permno'])['saleq'].shift(4)
# data_rawq['rsup'] = (data_rawq['saleq'] - data_rawq['saleq_l4'])/data_rawq['me']

# stdsacc: quarterly working-capital accruals scaled by sales (0.01 floor
# when sales are non-positive)
data_rawq['actq_l1'] = data_rawq.groupby(['permno'])['actq'].shift(1)
data_rawq['cheq_l1'] = data_rawq.groupby(['permno'])['cheq'].shift(1)
data_rawq['lctq_l1'] = data_rawq.groupby(['permno'])['lctq'].shift(1)
data_rawq['dlcq_l1'] = data_rawq.groupby(['permno'])['dlcq'].shift(1)

data_rawq['sacc'] = ((data_rawq['actq']-data_rawq['actq_l1'] - (data_rawq['cheq']-data_rawq['cheq_l1']))
                     -((data_rawq['lctq']-data_rawq['lctq_l1'])-(data_rawq['dlcq']-data_rawq['dlcq_l1'])))/data_rawq['saleq']
data_rawq['sacc'] = np.where(data_rawq['saleq']<=0, ((data_rawq['actq']-data_rawq['actq_l1'] - (data_rawq['cheq']-data_rawq['cheq_l1']))
                                                     -((data_rawq['lctq']-data_rawq['lctq_l1'])-(data_rawq['dlcq']-data_rawq['dlcq_l1'])))/0.01, data_rawq['sacc'])


def chars_std(start, end, df, chars):
    """Rolling standard deviation of a characteristic across its own lags.

    :param start: order of the first lag included (0 = current value)
    :param end: one past the order of the last lag included
    :param df: panel DataFrame containing 'permno' and the characteristic
    :param chars: name of the characteristic column to lag
    :return: Series with the per-row std of the start..end-1 firm-level lags
    """
    lag = pd.DataFrame()
    lag_list = []
    for i in range(start, end):
        lag['chars_l%s' % i] = df.groupby(['permno'])['%s' % chars].shift(i)
        lag_list.append('chars_l%s' % i)
    result = lag[lag_list].std(axis=1)
    return result

# stdacc: volatility of accruals over the last 16 quarters
data_rawq['stdacc'] = chars_std(0, 16, data_rawq, 'sacc')

# sgrvol
# data_rawq['sgrvol'] = chars_std(0, 15, data_rawq, 'rsup')

# roavol
data_rawq['roavol'] = chars_std(0, 16, data_rawq, 'roa')

# stdcf: cash flow scaled by sales (0.01 floor when sales non-positive)
data_rawq['scf'] = (data_rawq['ibq']/data_rawq['saleq']) - data_rawq['sacc']
# FIX: the fallback branch previously wrote sacc (the accrual itself) into
# scf instead of keeping the computed scf for positive-sales rows.
data_rawq['scf'] = np.where(data_rawq['saleq']<=0, (data_rawq['ibq']/0.01) - data_rawq['sacc'], data_rawq['scf'])
+data_rawq['stdcf'] = chars_std(0, 16, data_rawq, 'scf') + +# cinvest +data_rawq['ppentq_l1'] = data_rawq.groupby(['permno'])['ppentq'].shift(1) +data_rawq['ppentq_l2'] = data_rawq.groupby(['permno'])['ppentq'].shift(2) +data_rawq['ppentq_l3'] = data_rawq.groupby(['permno'])['ppentq'].shift(3) +data_rawq['ppentq_l4'] = data_rawq.groupby(['permno'])['ppentq'].shift(4) +data_rawq['saleq_l1'] = data_rawq.groupby(['permno'])['saleq'].shift(1) +data_rawq['saleq_l2'] = data_rawq.groupby(['permno'])['saleq'].shift(2) +data_rawq['saleq_l3'] = data_rawq.groupby(['permno'])['saleq'].shift(3) + +data_rawq['c_temp1'] = (data_rawq['ppentq_l1'] - data_rawq['ppentq_l2']) / data_rawq['saleq_l1'] +data_rawq['c_temp2'] = (data_rawq['ppentq_l2'] - data_rawq['ppentq_l3']) / data_rawq['saleq_l2'] +data_rawq['c_temp3'] = (data_rawq['ppentq_l3'] - data_rawq['ppentq_l4']) / data_rawq['saleq_l3'] + +data_rawq['cinvest'] = ((data_rawq['ppentq'] - data_rawq['ppentq_l1']) / data_rawq['saleq'])\ + -(data_rawq[['c_temp1', 'c_temp2', 'c_temp3']].mean(axis=1)) + +data_rawq['c_temp1'] = (data_rawq['ppentq_l1'] - data_rawq['ppentq_l2']) / 0.01 +data_rawq['c_temp2'] = (data_rawq['ppentq_l2'] - data_rawq['ppentq_l3']) / 0.01 +data_rawq['c_temp3'] = (data_rawq['ppentq_l3'] - data_rawq['ppentq_l4']) / 0.01 + +data_rawq['cinvest'] = np.where(data_rawq['saleq']<=0, ((data_rawq['ppentq'] - data_rawq['ppentq_l1']) / 0.01) + -(data_rawq[['c_temp1', 'c_temp2', 'c_temp3']].mean(axis=1)), data_rawq['cinvest']) + +data_rawq = data_rawq.drop(['c_temp1', 'c_temp2', 'c_temp3'], axis=1) + +# nincr +data_rawq['ibq_l1'] = data_rawq.groupby(['permno'])['ibq'].shift(1) +data_rawq['ibq_l2'] = data_rawq.groupby(['permno'])['ibq'].shift(2) +data_rawq['ibq_l3'] = data_rawq.groupby(['permno'])['ibq'].shift(3) +data_rawq['ibq_l4'] = data_rawq.groupby(['permno'])['ibq'].shift(4) +data_rawq['ibq_l5'] = data_rawq.groupby(['permno'])['ibq'].shift(5) +data_rawq['ibq_l6'] = data_rawq.groupby(['permno'])['ibq'].shift(6) 
+data_rawq['ibq_l7'] = data_rawq.groupby(['permno'])['ibq'].shift(7) +data_rawq['ibq_l8'] = data_rawq.groupby(['permno'])['ibq'].shift(8) + +data_rawq['nincr_temp1'] = np.where(data_rawq['ibq'] > data_rawq['ibq_l1'], 1, 0) +data_rawq['nincr_temp2'] = np.where(data_rawq['ibq_l1'] > data_rawq['ibq_l2'], 1, 0) +data_rawq['nincr_temp3'] = np.where(data_rawq['ibq_l2'] > data_rawq['ibq_l3'], 1, 0) +data_rawq['nincr_temp4'] = np.where(data_rawq['ibq_l3'] > data_rawq['ibq_l4'], 1, 0) +data_rawq['nincr_temp5'] = np.where(data_rawq['ibq_l4'] > data_rawq['ibq_l5'], 1, 0) +data_rawq['nincr_temp6'] = np.where(data_rawq['ibq_l5'] > data_rawq['ibq_l6'], 1, 0) +data_rawq['nincr_temp7'] = np.where(data_rawq['ibq_l6'] > data_rawq['ibq_l7'], 1, 0) +data_rawq['nincr_temp8'] = np.where(data_rawq['ibq_l7'] > data_rawq['ibq_l8'], 1, 0) + +data_rawq['nincr'] = (data_rawq['nincr_temp1'] + + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']) + + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']) + + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']) + + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']*data_rawq['nincr_temp5']) + + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']*data_rawq['nincr_temp5']*data_rawq['nincr_temp6']) + + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']*data_rawq['nincr_temp5']*data_rawq['nincr_temp6']*data_rawq['nincr_temp7']) + + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']*data_rawq['nincr_temp5']*data_rawq['nincr_temp6']*data_rawq['nincr_temp7']*data_rawq['nincr_temp8'])) + +data_rawq = data_rawq.drop(['ibq_l1', 'ibq_l2', 'ibq_l3', 'ibq_l4', 'ibq_l5', 'ibq_l6', 'ibq_l7', 'ibq_l8', 'nincr_temp1', + 'nincr_temp2', 'nincr_temp3', 'nincr_temp4', 'nincr_temp5', 'nincr_temp6', 
'nincr_temp7', + 'nincr_temp8'], axis=1) + +# performance score +data_rawq['niq4'] = ttm4(series='niq', df=data_rawq) +data_rawq['niq4_l4'] = data_rawq.groupby(['permno'])['niq4'].shift(4) +data_rawq['dlttq_l4'] = data_rawq.groupby(['permno'])['dlttq'].shift(4) +data_rawq['p_temp1'] = np.where(data_rawq['niq4']>0, 1, 0) +data_rawq['p_temp2'] = np.where(data_rawq['oancfy']>0, 1, 0) +data_rawq['p_temp3'] = np.where(data_rawq['niq4']/data_rawq['atq']>data_rawq['niq4_l4']/data_rawq['atq_l4'], 1, 0) +data_rawq['p_temp4'] = np.where(data_rawq['oancfy']>data_rawq['niq4'], 1, 0) +data_rawq['p_temp5'] = np.where(data_rawq['dlttq']/data_rawq['atq']<data_rawq['dlttq_l4']/data_rawq['atq_l4'], 1, 0) +data_rawq['p_temp6'] = np.where(data_rawq['actq']/data_rawq['lctq']>data_rawq['actq_l4']/data_rawq['lctq_l4'], 1, 0) +data_rawq['cogsq4_l4'] = data_rawq.groupby(['permno'])['cogsq4'].shift(4) +data_rawq['p_temp7'] = np.where((data_rawq['saleq4']-data_rawq['cogsq4']/data_rawq['saleq4'])>(data_rawq['saleq4_l4']-data_rawq['cogsq4_l4']/data_rawq['saleq4_l4']), 1, 0) +data_rawq['p_temp8'] = np.where(data_rawq['saleq4']/data_rawq['atq']>data_rawq['saleq4_l4']/data_rawq['atq_l4'], 1, 0) +data_rawq['p_temp9'] = np.where(data_rawq['scstkcy']==0, 1, 0) + +data_rawq['pscore'] = data_rawq['p_temp1']+data_rawq['p_temp2']+data_rawq['p_temp3']+data_rawq['p_temp4']\ + +data_rawq['p_temp5']+data_rawq['p_temp6']+data_rawq['p_temp7']+data_rawq['p_temp8']\ + +data_rawq['p_temp9'] + +data_rawq = data_rawq.drop(['p_temp1', 'p_temp2', 'p_temp3', 'p_temp4', 'p_temp5', 'p_temp6', 'p_temp7', 'p_temp8', + 'p_temp9'], axis=1) + +################################## Added on 2020.10.29 ################################## +#Iaq +data_rawq['atqlag'] = ttm4('atq',data_rawq) +data_rawq['iaq'] = (data_rawq['atq']/data_rawq['atqlag'])-1 + +#Almq +data_rawq['intanq'] = np.where(data_rawq['intanq'].isnull(), 0, data_rawq['intanq']) +data_rawq['qal'] = data_rawq['cheq'] + 0.75*(data_rawq['actq']-data_rawq['cheq']) + 0.5*(data_rawq['atq'] - data_rawq['actq'] - data_rawq['intanq']) +data_rawq['mveqa'] = data_rawq['atq'] + 
data_rawq['mveq_f'] - data_rawq['ceqq'] +data_rawq['mveqa_1'] = data_rawq.groupby(['permno'])['mveqa'].shift(1) +data_rawq['almq'] = data_rawq['qal']/data_rawq['mveqa_1'] + +#Olq, needs atq +data_rawq['olq'] = (data_rawq['cogsq'] + data_rawq['xsgaq'])/data_rawq['atq'] + +# rds +data_rawq['rds'] = data_rawq['xrdq4']/data_rawq['saleq'] + +print('quarterly variables') +####################################################################################################################### +# Momentum # +####################################################################################################################### +crsp_mom = conn.raw_sql(""" + select permno, date, ret, retx, prc, shrout, vol + from crsp.msf + where date >= '01/01/1959' + """) + +crsp_mom['permno'] = crsp_mom['permno'].astype(int) +crsp_mom['date'] = pd.to_datetime(crsp_mom['date']) +crsp_mom['jdate'] = pd.to_datetime(crsp_mom['date']) + MonthEnd(0) +crsp_mom = crsp_mom.dropna(subset=['ret', 'retx', 'prc']) + +# add delisting return +dlret = conn.raw_sql(""" + select permno, dlret, dlstdt + from crsp.msedelist + """) + +dlret.permno = dlret.permno.astype(int) +dlret['dlstdt'] = pd.to_datetime(dlret['dlstdt']) +dlret['jdate'] = dlret['dlstdt'] + MonthEnd(0) + +# merge delisting return to crsp return +crsp_mom = pd.merge(crsp_mom, dlret, how='left', on=['permno', 'jdate']) +crsp_mom['dlret'] = crsp_mom['dlret'].fillna(0) +crsp_mom['ret'] = crsp_mom['ret'].fillna(0) +crsp_mom['retadj'] = (1 + crsp_mom['ret']) * (1 + crsp_mom['dlret']) - 1 +crsp_mom['me'] = crsp_mom['prc'].abs() * crsp_mom['shrout'] # calculate market equity +crsp_mom['retx'] = np.where(crsp_mom['me'].isnull(), 0, crsp_mom['retx']) +crsp_mom = crsp_mom.drop(['dlret', 'dlstdt'], axis=1)#delete prc,shrout + +#Seasonality + +#Rla +crsp_mom['rla'] = crsp_mom.groupby(['permno'])['ret'].shift(12) + +#Rln +lag = pd.DataFrame() +result = 0 +for i in range(1, 12): + lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) + result = 
result + lag['mom%s' % i] +crsp_mom['rln'] = result/11 + +#R[2,5]a +#R[2,5]n +lag = pd.DataFrame() +result = 0 +for i in range(13,61): + lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) + if i not in [24,36,48,60]: + result = result + lag['mom%s' % i] + +crsp_mom['r25a'] = (lag['mom24']+lag['mom36']+lag['mom48']+lag['mom60'])/4 +crsp_mom['r25n'] = result/44 + +#R[6,10]a +#R[6,10]n +lag = pd.DataFrame() +result = 0 +for i in range(61,121): + lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) + if i not in [72,84,96,108,120]: + result = result + lag['mom%s' % i] + +crsp_mom['r610a'] = (lag['mom72']+lag['mom84']+lag['mom96']+lag['mom108']+lag['mom120'])/5 +crsp_mom['r610n'] = result/55 + +#R[11,15]a +lag = pd.DataFrame() +result = 0 +for i in [132,144,156,168,180]: + lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) + result = result + lag['mom%s' % i] +crsp_mom['r1115a'] = result/5 + +#R[16,20]a +lag = pd.DataFrame() +result = 0 +for i in [192,204,216,228,240]: + lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) + result = result + lag['mom%s' % i] +crsp_mom['r1620a'] = result/5 + + +def mom(start, end, df): + """ + :param start: Order of starting lag + :param end: Order of ending lag + :param df: Dataframe + :return: Momentum factor + """ + lag = pd.DataFrame() + result = 1 + for i in range(start, end): + lag['mom%s' % i] = df.groupby(['permno'])['ret'].shift(i) + result = result * (1+lag['mom%s' % i]) + result = result - 1 + return result + + +crsp_mom['mom60m'] = mom(12, 60, crsp_mom) +crsp_mom['mom12m'] = mom(1, 12, crsp_mom) +crsp_mom['mom1m'] = crsp_mom['ret'] +crsp_mom['mom6m'] = mom(1, 6, crsp_mom) +crsp_mom['mom36m'] = mom(1, 36, crsp_mom) +crsp_mom['seas1a'] = crsp_mom.groupby(['permno'])['ret'].shift(11) + +crsp_mom['vol_l1'] = crsp_mom.groupby(['permno'])['vol'].shift(1) +crsp_mom['vol_l2'] = crsp_mom.groupby(['permno'])['vol'].shift(2) +crsp_mom['vol_l3'] = 
crsp_mom.groupby(['permno'])['vol'].shift(3) +crsp_mom['prc_l2'] = crsp_mom.groupby(['permno'])['prc'].shift(2) +crsp_mom['dolvol'] = np.log(crsp_mom['vol_l2']*crsp_mom['prc_l2']).replace([np.inf, -np.inf], np.nan) +crsp_mom['turn'] = ((crsp_mom['vol_l1']+crsp_mom['vol_l2']+crsp_mom['vol_l3'])/3)/crsp_mom['shrout'] + +# dy +crsp_mom['me_l1'] = crsp_mom.groupby(['permno'])['me'].shift(1) +crsp_mom['retdy'] = crsp_mom['ret'] - crsp_mom['retx'] +crsp_mom['mdivpay'] = crsp_mom['retdy']*crsp_mom['me_l1'] + +crsp_mom['dy'] = ttm12(series='mdivpay', df=crsp_mom)/crsp_mom['me'] + +# def moms(start, end, df): +# """ +# +# :param start: Order of starting lag +# :param end: Order of ending lag +# :param df: Dataframe +# :return: Momentum factor +# """ +# lag = pd.DataFrame() +# result = 1 +# for i in range(start, end): +# lag['moms%s' % i] = df.groupby['permno']['ret'].shift(i) +# result = result + lag['moms%s' % i] +# result = result/11 +# return result +# +# +# crsp_mom['moms12m'] = moms(1, 12, crsp_mom) + +# populate the chars to monthly +print('momentum') +# data_rawa +data_rawa = data_rawa.drop(['date', 'ret', 'retx', 'me'], axis=1) +data_rawa = pd.merge(crsp_mom, data_rawa, how='left', on=['permno', 'jdate']) +data_rawa['datadate'] = data_rawa.groupby(['permno'])['datadate'].fillna(method='ffill') +data_rawa = data_rawa.groupby(['permno', 'datadate'], as_index=False).fillna(method='ffill') +data_rawa = data_rawa[((data_rawa['exchcd'] == 1) | (data_rawa['exchcd'] == 2) | (data_rawa['exchcd'] == 3)) & + ((data_rawa['shrcd'] == 10) | (data_rawa['shrcd'] == 11))] +print('data_rawa') +# data_rawq +data_rawq = data_rawq.drop(['date', 'ret', 'retx', 'me'], axis=1) +data_rawq = pd.merge(crsp_mom, data_rawq, how='left', on=['permno', 'jdate']) +data_rawq['datadate'] = data_rawq.groupby(['permno'])['datadate'].fillna(method='ffill') +data_rawq = data_rawq.groupby(['permno', 'datadate'], as_index=False).fillna(method='ffill') +data_rawq = data_rawq[((data_rawq['exchcd'] == 1) | 
(data_rawq['exchcd'] == 2) | (data_rawq['exchcd'] == 3)) & + ((data_rawq['shrcd'] == 10) | (data_rawq['shrcd'] == 11))] +print('data_rawq') +####################################################################################################################### +# Monthly ME # +####################################################################################################################### + +######################################## +# Annual # +######################################## + +# bm +data_rawa['bm'] = data_rawa['be'] / data_rawa['me'] +#data_rawa['bm_n'] = data_rawa['be'] + +# bm_ia +df_temp = data_rawa.groupby(['datadate', 'ffi49'], as_index=False)['bm'].mean() +df_temp = df_temp.rename(columns={'bm': 'bm_ind'}) +data_rawa = pd.merge(data_rawa, df_temp, how='left', on=['datadate', 'ffi49']) +data_rawa['bm_ia'] = data_rawa['bm']/data_rawa['bm_ind'] + +# me_ia +df_temp = data_rawa.groupby(['datadate', 'ffi49'], as_index=False)['me'].mean() +df_temp = df_temp.rename(columns={'me': 'me_ind'}) +data_rawa = pd.merge(data_rawa, df_temp, how='left', on=['datadate', 'ffi49']) +data_rawa['me_ia'] = data_rawa['me']/data_rawa['me_ind'] + +# cfp +condlist = [data_rawa['dp'].isnull(), + data_rawa['ib'].isnull()] +choicelist = [data_rawa['ib']/data_rawa['me'], + np.nan] +data_rawa['cfp'] = np.select(condlist, choicelist, default=(data_rawa['ib']+data_rawa['dp'])/data_rawa['me']) + +# ep, checked from Hou and change 'ME' from compustat to crsp,checked +data_rawa['ep'] = data_rawa['ib']/data_rawa['me'] +#data_rawa['ep_n'] = data_rawa['ib'] + +# rsup +# data_rawa['sale_l1'] = data_rawa.groupby(['permno'])['sale'].shift(1) +data_rawa['rsup'] = (data_rawa['sale']-data_rawa['sale_l1'])/data_rawa['me'] + +# lev +data_rawa['lev'] = data_rawa['lt']/data_rawa['me'] + +# sp, checked +data_rawa['sp'] = data_rawa['sale']/data_rawa['me'] +#data_rawa['sp_n'] = data_rawa['sale'] + +# rdm +data_rawa['rdm'] = data_rawa['xrd']/data_rawa['me'] + +# adm hxz adm,checked 
+data_rawa['adm'] = data_rawa['xad']/data_rawa['me'] + +# dy +data_rawa['dy'] = data_rawa['dvt']/data_rawa['me'] + +# Cp +#data_rawa['cf'] = data_rawa['ib'] + data_rawa['dp'] +data_rawa['cp'] = data_rawa['cf'] / data_rawa['me'] + +# Ebp +#data_rawa['dvpa'] = np.where(data_rawa['dvpa'].isnull(), 0, data_rawa['dvpa']) +#data_rawa['tstkp'] = np.where(data_rawa['tstkp'].isnull(), 0, data_rawa['tstkp']) +#data_rawa['f_liab'] = data_rawa['dltt'] + data_rawa['dlc'] + data_rawa['pstk'] + data_rawa['dvpa'] - data_rawa['tstkp'] +#data_rawa['f_asse'] = data_rawa['che'] +# net debt : = financial liabilities - financial assets. +#data_rawa['n_debt'] = data_rawa['f_liab'] - data_rawa['f_asse'] +#data_rawa['ber'] = data_rawa['ceq'] + data_rawa['tstkp'] - data_rawa['dvpa'] +data_rawa['ebp'] = (data_rawa['n_debt']+data_rawa['ber']) / (data_rawa['n_debt']+data_rawa['me']) + +# Em +data_rawa['enteprs_v'] = data_rawa['me'] + data_rawa['dlc'] + data_rawa['dltt'] + data_rawa['pstkrv'] - data_rawa['che'] +data_rawa['em'] = data_rawa['enteprs_v'] / data_rawa['oibdp'] + +# Cei +data_rawa['lg_me'] = np.log(data_rawa['me']/data_rawa['me'].shift(6)) +data_rawa['lg_ret'] = np.log(data_rawa['ret']*data_rawa['ret'].shift(1)*data_rawa['ret'].shift(2)*data_rawa['ret'].shift(3)*data_rawa['ret'].shift(5)*data_rawa['ret'].shift(6)) +data_rawa['cei'] = data_rawa['lg_me'] - data_rawa['lg_ret'] + +#nop +data_rawa['net_p'] = data_rawa['dvc'] + data_rawa['prstkc'] + 2*data_rawa['pstkrv'] - data_rawa['sstk'] +data_rawa['nop'] = data_rawa['net_p'] / data_rawa['me'] +data_rawa['nop'] = np.where(data_rawa['nop']<=0, np.nan, data_rawa['nop'] ) + +#ocp +data_rawa['ocy'] = np.where(data_rawa['jdate'] < '1988-06-30', data_rawa['fopt'] - data_rawa['wcap'], data_rawa['fopt'] - data_rawa['oancf']) +data_rawa['ocp'] = data_rawa['ocy'] / data_rawa['me'] +data_rawa['ocp'] = np.where(data_rawa['ocp']<=0, np.nan, data_rawa['ocp'] ) + +#bm_t-5 (bm of year t-5) +data_rawa['bm5'] = 
data_rawa.groupby(['permno'])['bm'].shift(5) + +#rB (five year log book return) +#Reference: jf_06 page8 by KENT DANIEL +data_rawa['rB'] = data_rawa['bm'] - data_rawa['bm5'] + data_rawa['ret5'] + +#Regression and get ir +#First get unique datelist +datelist = data_rawa['jdate'].unique() +for date in datelist: + temp = data_rawa[data_rawa['jdate'] == date] + n_row = temp.shape[0] + index = temp.index + X = pd.DataFrame() + X['bm5'] = temp['bm5'] + X['rB'] = temp['rB'] + X['intercept'] = 1 + X = X[['intercept','rB','bm5']] + X = np.mat(X) + Y = np.mat(temp[['ret5']]) + #These are residuals on one date + res = (np.identity(n_row) - X.dot(X.T.dot(X).I).dot(X.T)).dot(Y) + #put residuals back into data_rawa + data_rawa.loc[index,'ir'] = res + +# Annual Accounting Variables +chars_a = data_rawa[['cusip', 'ncusip', 'gvkey', 'permno', 'exchcd', 'shrcd', 'datadate', 'jdate', + 'sic', 'retadj', 'acc', 'agr', 'bm', 'cfp', 'ep', 'ni', 'op', 'rsup', 'cash', 'chcsho', + 'rd', 'cashdebt', 'pctacc', 'gma', 'lev', 'rdm', 'adm', 'sgr', 'sp', 'invest', 'roe', + 'rd_sale', 'lgr', 'roa', 'depr', 'egr', 'chato', 'chtx', 'noa', 'rna', 'pm', 'ato', 'dy', + 'roic', 'chinv', 'pchsale_pchinvt', 'pchsale_pchrect', 'pchgm_pchsale', 'pchsale_pchxsga', + 'pchdepr', 'chadv', 'pchcapx', 'grcapx', 'grGW', 'currat', 'pchcurrat', 'quick', 'pchquick', + 'salecash', 'salerec', 'saleinv', 'pchsaleinv', 'realestate', 'obklg', 'chobklg', 'grltnoa', + 'conv', 'chdrc', 'rdbias', 'operprof', 'capxint', 'xadint', 'chpm', 'ala', 'alm', + 'mom1m', 'mom6m', 'mom12m', 'mom60m', 'mom36m', 'seas1a', 'me', 'hire', 'herf', 'bm_ia', + 'me_ia', 'bmj','cp', 'ebp', 'em', 'dp', 'aci', 'dpia', 'dBe', 'dfnl', 'dfin', 'dcoa', + 'dlno', 'dnoa', 'cla', 'cop', 'cto', 'dIi', 'dnco', 'dnca', 'ir', 'nop', 'ocp', + 'ia', 'ig','2ig','ivc','ndf','nsi','oa','poa','ta','ol','etr']] + +chars_a.reset_index(drop=True, inplace=True) +print(chars_a) +print('ME annual') +######################################## +# Quarterly # 
+######################################## +# bm +data_rawq['bm'] = data_rawq['beq']/data_rawq['me'] + +# cfp +data_rawq['cfp'] = np.where(data_rawq['dpq'].isnull(), + data_rawq['ibq4']/data_rawq['me'], + (data_rawq['ibq4']+data_rawq['dpq4'])/data_rawq['me']) + +# ep +data_rawq['ep'] = data_rawq['ibq4']/data_rawq['me'] + +# lev +data_rawq['lev'] = data_rawq['ltq']/data_rawq['me'] + +# rdm +data_rawq['rdm'] = data_rawq['xrdq4']/data_rawq['me'] + +# sp +data_rawq['sp'] = data_rawq['saleq4']/data_rawq['me'] + +# alm +data_rawq['alm'] = data_rawq['ala']/(data_rawq['atq']+data_rawq['me']-data_rawq['ceqq']) + +# rsup +# data_rawq['saleq_l4'] = data_rawq.groupby(['permno'])['saleq'].shift(4) +data_rawq['rsup'] = (data_rawq['saleq'] - data_rawq['saleq_l4'])/data_rawq['me'] + +# sgrvol +data_rawq['sgrvol'] = chars_std(0, 15, data_rawq, 'rsup') + +# Quarterly Accounting Variables +chars_q = data_rawq[['gvkey', 'permno', 'datadate', 'jdate', 'sic', 'exchcd', 'shrcd','retadj' ,'acc', 'bm', 'cfp', + 'ep', 'agr', 'ni', 'op', 'cash', 'chcsho', 'rd', 'cashdebt', 'pctacc', 'gma', 'lev', + 'rdm', 'sgr', 'sp', 'invest', 'rd_sale', 'lgr', 'roa', 'depr', 'egr', 'roe', + 'chato', 'chpm', 'chtx', 'noa', 'rna', 'pm', 'ato', 'stdcf', + 'grltnoa', 'ala', 'alm', 'rsup', 'stdacc', 'sgrvol', 'roavol', 'scf', 'cinvest', + 'mom1m', 'mom6m', 'mom12m', 'mom60m', 'mom36m', 'seas1a', 'me', 'pscore', 'nincr', + 'turn', 'dolvol', 'iaq', 'almq', 'olq', 'rds']] + +chars_q.reset_index(drop=True, inplace=True) +print(chars_q) +print('ME quarterly') +with open('chars_a_60.pkl', 'wb') as f: + pkl.dump(chars_a, f) +print('pkl a') +with open('chars_q_60.pkl', 'wb') as f: + pkl.dump(chars_q, f) +print('pkl q') +print('Finished') \ No newline at end of file From 8929cbf7407d483c1bd45649432cce9a0ea28350 Mon Sep 17 00:00:00 2001 From: velonisa Date: Mon, 1 Mar 2021 22:03:28 +0800 Subject: [PATCH 12/15] Create .gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 .gitignore diff --git 
a/.gitignore b/.gitignore new file mode 100644 index 0000000..9bea433 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ + +.DS_Store From 1227d9c46409277e7d8913ce09bb0bcc02c17600 Mon Sep 17 00:00:00 2001 From: velonisa Date: Mon, 1 Mar 2021 22:21:33 +0800 Subject: [PATCH 13/15] delete --- .gitignore | 1 + Chars60_descrption.csv | 507 -------- README.md | 109 -- char60/.DS_Store | Bin 6148 -> 0 bytes char60/abr.py | 236 ---- char60/accounting_100.py | 1643 ------------------------- char60/accounting_60.py | 1215 ------------------ char60/beta.py | 164 --- char60/bid_ask_spread.py | 160 --- char60/functions.py | 452 ------- char60/iclink.py | 241 ---- char60/ill.py | 158 --- char60/impute_rank_output_bchmk_60.py | 164 --- char60/maxret_d.py | 158 --- char60/merge_chars_60.py | 294 ----- char60/pkl_to_csv.py | 29 - char60/re.py | 120 -- char60/rvar_capm.py | 168 --- char60/rvar_ff3.py | 201 --- char60/rvar_mean.py | 150 --- char60/std_dolvol.py | 158 --- char60/std_turn.py | 158 --- char60/sue.py | 106 -- char60/zerotrade.py | 161 --- py-dgtw/.DS_Store | Bin 6148 -> 0 bytes py-dgtw/dgtw.py | 479 ------- py-ff3/ff3.py | 280 ----- py-pead/pead.py | 538 -------- pychars/.DS_Store | Bin 6148 -> 0 bytes pychars/accounting.py | 851 ------------- pychars/beta.py | 70 -- pychars/functions.py | 445 ------- pychars/hxz_abr.py | 236 ---- pychars/hxz_re.py | 120 -- pychars/hxz_sue.py | 106 -- pychars/iclink.py | 241 ---- pychars/impute_rank_output.py | 114 -- pychars/merge_chars.py | 86 -- pychars/rvar_capm.py | 168 --- pychars/rvar_ff3.py | 201 --- pychars/rvar_mean.py | 150 --- qsub/.DS_Store | Bin 6148 -> 0 bytes qsub/check_crsp.sas | 2 - qsub/submit.sh | 11 - setup-wrds.py | 11 - 45 files changed, 1 insertion(+), 10861 deletions(-) delete mode 100644 Chars60_descrption.csv delete mode 100755 README.md delete mode 100755 char60/.DS_Store delete mode 100755 char60/abr.py delete mode 100644 char60/accounting_100.py delete mode 100755 char60/accounting_60.py delete mode 100755 
char60/beta.py delete mode 100755 char60/bid_ask_spread.py delete mode 100755 char60/functions.py delete mode 100755 char60/iclink.py delete mode 100755 char60/ill.py delete mode 100755 char60/impute_rank_output_bchmk_60.py delete mode 100755 char60/maxret_d.py delete mode 100755 char60/merge_chars_60.py delete mode 100755 char60/pkl_to_csv.py delete mode 100755 char60/re.py delete mode 100755 char60/rvar_capm.py delete mode 100755 char60/rvar_ff3.py delete mode 100755 char60/rvar_mean.py delete mode 100755 char60/std_dolvol.py delete mode 100755 char60/std_turn.py delete mode 100755 char60/sue.py delete mode 100755 char60/zerotrade.py delete mode 100755 py-dgtw/.DS_Store delete mode 100755 py-dgtw/dgtw.py delete mode 100755 py-ff3/ff3.py delete mode 100755 py-pead/pead.py delete mode 100755 pychars/.DS_Store delete mode 100755 pychars/accounting.py delete mode 100755 pychars/beta.py delete mode 100755 pychars/functions.py delete mode 100755 pychars/hxz_abr.py delete mode 100755 pychars/hxz_re.py delete mode 100755 pychars/hxz_sue.py delete mode 100755 pychars/iclink.py delete mode 100755 pychars/impute_rank_output.py delete mode 100755 pychars/merge_chars.py delete mode 100755 pychars/rvar_capm.py delete mode 100755 pychars/rvar_ff3.py delete mode 100755 pychars/rvar_mean.py delete mode 100755 qsub/.DS_Store delete mode 100755 qsub/check_crsp.sas delete mode 100755 qsub/submit.sh delete mode 100755 setup-wrds.py diff --git a/.gitignore b/.gitignore index 9bea433..43322df 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ .DS_Store +.DS_Store diff --git a/Chars60_descrption.csv b/Chars60_descrption.csv deleted file mode 100644 index 835b03f..0000000 --- a/Chars60_descrption.csv +++ /dev/null @@ -1,507 +0,0 @@ -Num,Acronym,Description,Author,Pub Year,Category,Main Formula,Other Formula,CRSP,Compustat(annual),Compustat(quarterly),IBES,description -A.1.2,Abr1,"cumulative abnormal returns around earnings announcement dates, 1-month holding period","Chan, 
Jegadeesh, and Lakonishok",1996,Momentum,,,,,,,p63 -A.1.2,Abr12,"cumulative abnormal returns around earnings announcement dates, 6-month holding period","Chan, Jegadeesh, and Lakonishok",1996,Momentum,,,,,,, -A.1.2,Abr6,"cumulative abnormal returns around earnings announcement dates, 12-month holding period","Chan, Jegadeesh, and Lakonishok",1996,Momentum,,,,,,, -A.2.1,Bm,Book-to-market equity,"Rosenberg, Reid, and Lanstein ",1985,Value-versus-growth,data_rawq['bm'] = data_rawq['beq']/data_rawq['me'],"data_rawq['beq'] = np.where(data_rawq['seqq']>0, data_rawq['seqq']+data_rawq['txditcq']-data_rawq['pstkq'], np.nan) -data_rawq['beq'] = np.where(data_rawq['beq']<=0, np.nan, data_rawq['beq']) -* 'me' from rawq",1,0,1,0,"At the end of June of each year t, we split stocks into deciles based on Bm, which is the book equity for the fiscal year ending in calendar year t - 1 divided by the market equity (from CRSP) at the end of December of t - 1. For firms with more than one share class, we merge the market equity for all share classes before computing Bm. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1. Following Davis, Fama, and French (2000), we measure book equity as stockholders' book equity, plus balance sheet deferred taxes and investment tax credit (Compustat annual item TXDITC) if available, minus the book value of preferred stock. Stockholders' equity is the value reported by Compustat (item SEQ), if it is available. If not, we measure stockholders' equity as the book value of common equity (item CEQ) plus the par value of preferred stock (item PSTK), or the book value of assets (item AT) minus total liabilities (item LT). Depending on availability, we use redemption (item PSTKRV), liquidating (item PSTKL), or par value (item PSTK) for the book value of preferred stock." 
-A.2.10,"Ep q 1, Ep q 6, and Ep q 12","Q Quarterly Earnings-to-price(1-month holding period), uarterly Earnings-to-price(6-month holding period), Quarterly Earnings-to-price(12-month holding period), ",Basu,1983,Value-versus-growth,,,,,,,"At the beginning of each month t, we split stocks into deciles based on quarterly earnings-to-price, Epq, which is income before extraordinary items (Compustat quarterly item IBQ) divided by the market equity (from CRSP) at the end of month t - 1. Before 1972, we use quarterly earnings from fiscal quarters ending at least four months prior to the portfolio formation. Starting from 1972, we use quarterly earnings from the most recent quarterly earnings announcement dates (item RDQ). For a firm to enter the portfolio formation, we require the end of the fiscal quarter that corresponds to its most recent quarterly earnings to be within six months prior to the portfolio formation. This restriction is imposed to exclude stale earnings information. To avoid potentially erroneous records, we also require the earnings announcement date to be after the corresponding fiscal quarter end. Firms with non-positive earnings are excluded. For firms with more than one share class, we merge the market equity for all share classes before computing Epq. We calculate decile returns for the current month t (Epq1), from month t to t + 5 (Epq6), and from month t to t + 11 (Epq12), and the deciles are rebalanced at the beginning of month t + 1. The holding period longer than one month as in, for instance, Epq6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Epq6 decile." 
-A.2.12,Cp,Cash flow-to-price,"Lakonishok, Shleifer, and Vishny ",1994,Value-versus-growth,data_rawa['cp'] = data_rawa['cf'] / data_rawa['me'],"data_rawa['cf'] = data_rawa['ib'] + data_rawa['dp'] -* 'me' from rawa -",1,1,0,0,"At the end of June of each year t, we split stocks into deciles based on cash flow-to-price, Cf, which is cash flows for the fiscal year ending in calendar year t - 1 divided by the market equity (from CRSP) at the end of December of t - 1. Cash flows are income before extraordinary items (Com- pustat annual item IB) plus depreciation (item DP)). For firms with more than one share class, we merge the market equity for all share classes before computing Cp. Firms with non-positive cash flows are excluded. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." -A.2.13,"Cpq1, Cpq6, Cpq12","Quarterly Cash Flow-to-price (1-month holding period), Quarterly Cash Flow-to-price (6-month holding period), Quarterly Cash Flow-to-price (12-month holding period)","Lakonishok, Shleifer, and Vishny",1994,Value-versus-growth,,,,,,,"At the beginning of each month t, we split stocks into deciles based on quarterly cash flow-to-price, -Cpq, which is cash flows for the latest fiscal quarter ending at least four months ago divided by the market equity (from CRSP) at the end of month t - 1. Quarterly cash flows are income before extraordinary items (Compustat quarterly item IBQ) plus depreciation (item DPQ). For firms with more than one share class, we merge the market equity for all share classes before computing Cpq. Firms with non-positive cash flows are excluded. We calculate decile returns for the current month t (Epq1), from month t to t + 5 (Epq6), and from month t to t + 11 (Epq12), and the deciles are rebalanced at the beginning of month t + 1. 
The holding period longer than one month as in, for instance, Epq6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Epq6 decile." -A.2.14,Dp(dy),Dividend yield,Litzenberger and Ramaswamy,1979,Value-versus-growth,"crsp_mom['dy'] = ttm12(series='mdivpay', df=crsp_mom)/crsp_mom['me']","crsp_mom['permno'] = crsp_mom['permno'].astype(int) -crsp_mom['ret'] = crsp_mom['ret'].fillna(0) -crsp_mom['me'] = crsp_mom['prc'].abs() * crsp_mom['shrout'] # calculate market equity -crsp_mom['retx'] = np.where(crsp_mom['me'].isnull(), 0, crsp_mom['retx']) -# dy -crsp_mom['me_l1'] = crsp_mom.groupby(['permno'])['me'].shift(1) -crsp_mom['retdy'] = crsp_mom['ret'] - crsp_mom['retx'] -crsp_mom['mdivpay'] = crsp_mom['retdy']*crsp_mom['me_l1'] -",1,0,0,0,"At the end of June of each year t, we sort stocks into deciles based on dividend yield, Dp, which is the total dividends paid out from July of year t - 1 to June of t divided by the market equity (from CRSP) at the end of June of t. We calculate monthly dividends as the begin-of-month market equity times the difference between returns with and without dividends. Monthly dividends are then accumulated from July of t - 1 to June of t. We exclude firms that do not pay dividends. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." 
-A.2.16,Op and Nop, (Net) Payout Yield,"Boudoukh, Michaely, Richardson, and Roberts",2007,Value-versus-growth,
Firms with non-positive total payouts (zero net payouts) are excluded. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1. Because the data on total expenditure and the sale of common and preferred stocks start in 1971, the Op and Nop portfolios start in July 1972." -A.2.2,Bmj,Book-to-June-end market equity, Asness and Frazzini ,2013,Value-versus-growth,data_rawa['bmj'] = data_rawa['be_per'] / data_rawa['prc'] ,"# clean up csho -comp['csho'] = np.where(comp['csho'] == 0, np.nan, comp['csho']) -data_rawa['txditc'] = data_rawa['txditc'].fillna(0) -* 'ps' -data_rawa['be'] = data_rawa['seq'] + data_rawa['txditc'] - data_rawa['ps'] -data_rawa['be'] = np.where(data_rawa['be'] > 0, data_rawa['be'], np.nan) -data_rawa['be_per'] = data_rawa['be'] / data_rawa['csho'] -",0,1,0,0,"Following Asness and Frazzini (2013), at the end of June of each year t, we sort stocks into deciles based on Bmj, which is book equity per share for the fiscal year ending in calendar year t - 1 divided by share price (from CRSP) at the end of June of t. We adjust for any stock splits between the fiscal year end and the end of June. Book equity per share is book equity divided by the num- ber of shares outstanding (Compustat annual item CSHO). Following Davis, Fama, and French (2000), we measure book equity as stockholders' book equity, plus balance sheet deferred taxes and investment tax credit (item TXDITC) if available, minus the book value of preferred stock. Stockholders' equity is the value reported by Compustat (item SEQ), if it is available. If not, we measure stockholders' equity as the book value of common equity (item CEQ) plus the par value of preferred stock (item PSTK), or the book value of assets (item AT) minus total liabilities (item LT). Depending on availability, we use redemption (item PSTKRV), liquidating (item PSTKL), or par value (item PSTK) for the book value of preferred stock. 
Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." -A.2.20,Em,Enterprise multiple,Loughran and Wellman,2011,Value-versus-growth,data_rawa['em'] = data_rawa['enteprs_v'] / data_rawa['oibdp'],"* 'me' from rawa -data_rawa['enteprs_v'] = data_rawa['me'] + data_rawa['dlc'] + data_rawa['dltt'] + data_rawa['pstkrv'] - data_rawa['che']",1,1,0,0,"Enterprise multiple, Em, is enterprise value divided by operating income before depreciation (Com- pustat annual item OIBDP). Enterprise value is the market equity plus the total debt (item DLC plus item DLTT) plus the book value of preferred stocks (item PSTKRV) minus cash and short- term investments (item CHE). At the end of June of each year t, we split stocks into deciles based on Em for the fiscal year ending in calendar year t-1. The Market equity (from CRSP) is measured at the end of December of t - 1. For firms with more than one share class, we merge the market equity for all share classes before computing Em. Firms with negative enterprise value or operating income before depreciation are excluded. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." -A.2.21,"Emq1, Emq6","Quarterly Enterprise multiple (1-month holding period), Quarterly Enterprise multiple (6-month holding period)",Loughran and Wellman,2011,Value-versus-growth,,,,,,,"Emq, is enterprise value scaled by operating income before depreciation (Compustat quarterly item OIBDPQ). Enterprise value is the market equity plus total debt (item DLCQ plus item DLTTQ) plus the book value of preferred stocks (item PSTKQ) minus cash and short-term investments (item CHEQ). At the beginning of each month t, we split stocks into deciles on Emq for the latest fiscal quarter ending at least four months ago. The Market equity (from CRSP) is measured at the end of month t - 1. 
For firms with more than one share class, we merge the market equity for all share classes before computing Emq. Firms with negative enterprise value or operating income before depreciation are excluded. Monthly decile returns are calculated for the current month t (Emq1), from month t to t + 5 (Emq6), and from month t to t + 11 (Emq12), and the deciles are rebalanced at the beginning of t + 1. The holding period longer than one month as in Emq6 means that for a given decile in each month there exist six subdeciles, each initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Emq6 decile. For sufficient data coverage, the EMq portfolios start in January 1975." -A.2.22,Sp,Sales-to-price,"Barbee, Mukherji, and Raines",1996,Value-versus-growth,data_rawq['sp'] = data_rawq['saleq4']/data_rawq['me'],"data_rawq['saleq4'] = ttm4('saleq', data_rawq) -data_rawq['saleq4'] = np.where(data_rawq['saleq4'].isnull(), data_rawq['saley'], data_rawq['saleq4']) -* 'me' from rawq",1,0,1,0,"At the end of June of each year t, we sort stocks into deciles based on sales-to-price, Sp, which is sales (Compustat annual item SALE) for the fiscal year ending in calendar year t - 1 divided by the market equity (from CRSP) at the end of December of t - 1. For firms with more than one share class, we merge the market equity for all share classes before computing Sp. Firms with non-positive sales are excluded. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." -A.2.23,"Sp q 1, Sp q 6, and Sp q 12",Quarterly Sales-to-price,"Barbee, Mukherji, and Raines",1996,Value-versus-growth,,,,,,,"At the beginning of each month t, we sort stocks into deciles based on quarterly sales-to-price, Spq, which is sales (Compustat quarterly item SALEQ) divided by the market equity at the end of month t - 1. 
Before 1972, we use quarterly sales from fiscal quarters ending at least four months prior to the portfolio formation. Starting from 1972, we use quarterly sales from the most recent quarterly earnings announcement dates (item RDQ). Sales are generally announced with earnings during quarterly earnings announcements (Jegadeesh and Livnat 2006). For a firm to enter the portfolio formation, we require the end of the fiscal quarter that corresponds to its most recent quarterly sales to be within six months prior to the portfolio formation. This restriction is imposed to exclude stale earnings information. To avoid potentially erroneous records, we also require the earnings announcement date to be after the corresponding fiscal quarter end. Firms with non- positive sales are excluded. For firms with more than one share class, we merge the market equity for all share classes before computing Spq. Monthly decile returns are calculated for the current month t (Spq1), from month t to t + 5 (Spq6), and from month t to t + 11 (Spq12), and the deciles are rebalanced at the beginning of t + 1. The holding period longer than one month as in Spq6 means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Spq6 decile." 
-A.2.24,Ocp,Operating Cash Flow-to-price,"Desai, Rajgopal, and Venkatachalam",2004,Value-versus-growth,"data_rawa['ocp'] = data_rawa['ocy'] / data_rawa['me'] -data_rawa['ocp'] = np.where(data_rawa['ocp']<=0, np.nan, data_rawa['ocp'] )","* 'me' from rawa -data_rawa['ocy'] = np.where(data_rawa['jdate'] < '1988-06-30', data_rawa['fopt'] - data_rawa['wcap'], data_rawa['fopt'] - data_rawa['oancf'])",1,1,0,0,"At the end of June of each year t, we sort stocks into deciles based on operating cash flows-to-price, Ocp, which is operating cash flows for the fiscal year ending in calendar year t - 1 divided by the market equity (from CRSP) at the end of December of t - 1. Operating cash flows are measured as funds from operation (Compustat annual item FOPT) minus change in working capital (item WCAP) prior to 1988, and then as net cash flows from operating activities (item OANCF) stating from 1988. For firms with more than one share class, we merge the market equity for all share classes before computing Ocp. Firms with non-positive operating cash flows are excluded. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t+1. Because the data on funds from operation start in 1971, the Ocp portfolios start in July 1972. -" -A.2.26,Ir,Intangible Return,Daniel and Titman,2006,Value-versus-growth," -#Regression and get ir -#First get unique datelist -datelist = data_rawa['jdate'].unique() -for date in datelist: - temp = data_rawa[data_rawa['jdate'] == date] - n_row = temp.shape[0] - index = temp.index - X = pd.DataFrame() - X['bm5'] = temp['bm5'] - X['rB'] = temp['rB'] - X['intercept'] = 1 - X = X[['intercept','rB','bm5']] - X = np.mat(X) - Y = np.mat(temp[['ret5']]) - #These are residuals on one date - res = (np.identity(n_row) - X.dot(X.T.dot(X).I).dot(X.T)).dot(Y) - #put residuals back into data_rawa - data_rawa.loc[index,'ir'] = res","* 'bm' from rawa -#ir -''' -#First calculate r(t-5,t). 
Then rb(t-5,t) and use Bm to perform linear regression and get residue -''' -#r(t-5,t):sum ret from t-5 to t (which is calendar year t-6 to t-1) -lag = pd.DataFrame() -for i in range(1,6): - lag['ret%s' % i] = data_rawa.groupby(['permno'])['ret'].shift(i) - -data_rawa['ret5'] = lag['ret1']+lag['ret2']+lag['ret3']+lag['ret4']+lag['ret5'] - -#bm_t-5 (bm of year t-5) -data_rawa['bm5'] = data_rawa.groupby(['permno'])['bm'].shift(5) - -#rB (five year log book return) -#Reference: jf_06 page8 by KENT DANIEL -data_rawa['rB'] = data_rawa['bm'] - data_rawa['bm5'] + data_rawa['ret5'] -",1,1,0,0,p77 -A.2.28,Ebp,Enterprise Book-to-price,"Penman, Richardson, and Tuna",2007,Value-versus-growth,data_rawa['ebp'] = (data_rawa['n_debt']+data_rawa['ber']) / (data_rawa['n_debt']+data_rawa['me']),"* 'me' from rawa -#Ebp -data_rawa['dvpa'] = np.where(data_rawa['dvpa'].isnull(), 0, data_rawa['dvpa']) -data_rawa['tstkp'] = np.where(data_rawa['tstkp'].isnull(), 0, data_rawa['tstkp']) -data_rawa['f_liab'] = data_rawa['dltt'] + data_rawa['dlc'] + data_rawa['pstk'] + data_rawa['dvpa'] - data_rawa['tstkp'] -data_rawa['f_asse'] = data_rawa['che'] -# net debt : = 铿乶ancial liabilities - 铿乶ancial assets. -data_rawa['n_debt'] = data_rawa['f_liab'] - data_rawa['f_asse'] -data_rawa['ber'] = data_rawa['ceq'] + data_rawa['tstkp'] - data_rawa['dvpa']",1,1,0,0,"Following Penman, Richardson, and Tuna (2007), we measure enterprise book-to-price, Ebp, as the ratio of the book value of net operating assets (net debt plus book equity) to the market value of net operating assets (net debt plus market equity). Net Debt-to-price, Ndp, is the ratio of net debt to the market equity. Net debt is financial liabilities minus financial assets. 
We measure financial liabilities as the sum of long-term debt (Compustat annual item DLTT), debt in current liabilities (item DLC), carrying value of preferred stock (item PSTK), and preferred dividends in arrears (item DVPA, zero if missing), less preferred treasury stock (item TSTKP, zero if missing). We measure financial assets as cash and short-term investments (item CHE). Book equity is common equity (item CEQ) plus any preferred treasury stock (item TSTKP, zero if missing) less any pre- ferred dividends in arrears (item DVPA, zero if missing). Market equity is the number of common shares outstanding times share price (from CRSP). -At the end of June of each year t, we sort stocks into deciles based on Ebp, and separately, on Ndp, for the fiscal year ending in calendar year t - 1. Market equity is measured at the end of December of t - 1. For firms with more than one share class, we merge the market equity for all share classes before computing Ebp and Ndp. When forming the Ebp portfolios, we exclude firms with non-positive book or market value of net operating assets. For the Ndp portfolios, we exclude firms with non-positive net debt. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." -A.2.3,Bmq12,Quarterly Book-to-market Equity (12-month holding period),"Rosenberg, Reid, and Lanstein ",1985,Value-versus-growth,,,,,,,p70 -A.2.9,Ep,Earnings-to-price,Basu,1983,Value-versus-growth,data_rawq['ep'] = data_rawq['ibq4']/data_rawq['me'],"data_rawq['ibq4'] = ttm4('ibq', data_rawq) -* 'me' from rawq -",1,0,1,0,"At the end of June of each year t, we split stocks into deciles based on earnings-to-price, Ep, which is income before extraordinary items (Compustat annual item IB) for the fiscal year ending in calendar year t - 1 divided by the market equity (from CRSP) at the end of December of t - 1. 
For firms with more than one share class, we merge the market equity for all share classes before com- puting Ep. Firms with non-positive earnings are excluded. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." -A.3.1,Aci,Abnormal Corporate Investment,"Titman, Wei, and Xie",2004,Investment,data_rawa['aci'] = data_rawa['ce']/ (data_rawa['ce1']+data_rawa['ce2']+data_rawa['ce3'])-1,"data_rawa['ce'] = data_rawa['capx'] / data_rawa['sale'] -data_rawa['ce1'] = data_rawa['ce'].shift(1) -data_rawa['ce2'] = data_rawa['ce'].shift(2) -data_rawa['ce3'] = data_rawa['ce'].shift(3)",0,1,0,0,"At the end of June of year t, we measure abnormal corporate investment, Aci, as Cet-1/[(Cet-2 + Cet-3 + Cet-4)/3] - 1, in which Cet-j is capital expenditure (Compustat annual item CAPX) scaled by sales (item SALE) for the fiscal year ending in calendar year t - j. The last three-year average capital expenditure is designed to project the benchmark investment in the portfolio formation year. We exclude firms with sales less than ten million dollars. At the end of June of each year t, we sort stocks into deciles based on Aci. Monthly decile returns are computed from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." -A.3.10,Nsi,Net Stock Issues,Pontiff and Woodgate,2008,Investment,data_rawa['nsi'] = np.log(data_rawa['sps']/data_rawa['sps_l1']),"data_rawa['sps'] = data_rawa['csho'] * data_rawa['ajex'] -data_rawa['sps_l1'] = data_rawa.groupby('permno')['sps'].shift(1)",0,1,0,0,"At the end of June of year t, we measure net stock issues, Nsi, as the natural log of the ratio of the split-adjusted shares outstanding at the fiscal year ending in calendar year t-1 to the split-adjusted shares outstanding at the fiscal year ending in t-2. The split-adjusted shares outstanding is shares outstanding (Compustat annual item CSHO) times the adjustment factor (item AJEX). 
At the end of June of each year t, we sort stocks with negative Nsi into two portfolios (1 and 2), stocks with zero Nsi into one portfolio (3), and stocks with positive Nsi into seven portfolios (4 to 10). Monthly decile returns are from July of year t to June of t+1, and the deciles are rebalanced in June of t+1." -A.3.11,dIi,% Change in Investment - % Change in Industry Investment,Abarbanell and Bushee,1998,Investment,data_rawa['dIi'] = data_rawa['dinvt'] - data_rawa['dind'],"data_rawa['e_invt'] = (data_rawa['capxv'] + data_rawa['capxv'].shift(1))/2 -data_rawa['dinvt'] = (data_rawa['capxv'] - data_rawa['e_invt']) / data_rawa['e_invt'] - -data_rawa['ind'] = data_rawa['capxv'] -s = data_rawa.groupby(['jdate', 'sic2'])['ind'].sum() -data_rawa = pd.merge(data_rawa, s, on=['jdate', 'sic2']) -# new industry investment will be named as ind_y, cause it's been grouped by ind -data_rawa['e_ind'] = (data_rawa['ind_y'] + data_rawa['ind_y'].shift(1))/2 -data_rawa['dind'] = (data_rawa['ind_y']-data_rawa['e_ind']) / data_rawa['e_ind']",0,1,0,0,"Following Abarbanell and Bushee (1998), we define the %d(.) operator as the percentage change in the variable in the parentheses from its average over the prior two years, e.g., %d(Investment) = [Investment(t) - E[Investment(t)]]/E[Investment(t)], in which E[Investment(t)] = [Investment(t-1) + Investment(t - 2)]/2. dIi is defined as %d(Investment) - %d(Industry investment), in which investment is capital expenditure in property, plant, and equipment (Compustat annual item CAPXV). Industry investment is the aggregate investment across all firms with the same two- digit SIC code. Firms with non-positive E[Investment(t)] are excluded and we require at least two firms in each industry. At the end of June of each year t, we sort stocks into deciles based on dIi for the fiscal year ending in calendar year t - 1. Monthly decile returns are calculated from July of year t to June of t+1, and the deciles are rebalanced in June of t+1." 
-A.3.14,Ivg,Inventory Growth,Belo and Lin,2011,Investment,,,,,,,"At the end of June of each year t, we sort stocks into deciles based on inventory growth, Ivg, which is the annual growth rate in inventory (Compustat annual item INVT) from the fiscal year ending in calendar year t - 2 to the fiscal year ending in t - 1. Monthly decile returns are calculated from July of year t to June of t+1, and the deciles are rebalanced in June of t+1." -A.3.15,Ivc,Inventory Changes,Thomas and Zhang,2002,Investment,data_rawa['ivc'] = data_rawa['invt'] / data_rawa['atAvg'],"data_rawa['at_l1'] = data_rawa.groupby(['permno'])['at'].shift(1) -data_rawa['atAvg'] = (data_rawa['at']+data_rawa['at_l1'])/2",1,1,0,0,"At the end of June of each year t, we sort stocks into deciles based on inventory changes, Ivc, which is the annual change in inventory (Compustat annual item INVT) scaled by the average of total assets (item AT) for the fiscal years ending in t - 2 and t - 1. We exclude firms that carry no inventory for the past two fiscal years. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." 
-A.3.16,Oa(acc),Operating Accruals,Sloan,1996,Investment,"data_rawq['acc'] = np.select(condlist, choicelist, - default=((data_rawq['actq']-data_rawq['lctq']+data_rawq['npq'])- - (data_rawq['actq_l4']-data_rawq['lctq_l4']+data_rawq['npq_l4']))/(10*data_rawq['beq'])) -","#prepare be -data_rawq['beq'] = np.where(data_rawq['seqq']>0, data_rawq['seqq']+data_rawq['txditcq']-data_rawq['pstkq'], np.nan) -data_rawq['beq'] = np.where(data_rawq['beq']<=0, np.nan, data_rawq['beq']) -# acc -data_rawq['actq_l4'] = data_rawq.groupby(['permno'])['actq'].shift(4) -data_rawq['lctq_l4'] = data_rawq.groupby(['permno'])['lctq'].shift(4) -data_rawq['npq_l4'] = data_rawq.groupby(['permno'])['npq'].shift(4) -condlist = [data_rawq['npq'].isnull(), - data_rawq['actq'].isnull() | data_rawq['lctq'].isnull()] -choicelist = [((data_rawq['actq']-data_rawq['lctq'])-(data_rawq['actq_l4']-data_rawq['lctq_l4']))/(10*data_rawq['beq']), - np.nan]",0,0,1,0,"Prior to 1988, we use the balance sheet approach in Sloan (1996) to measure operating accruals, Oa, as changes in noncash working capital minus depreciation, in which the noncash working capital is changes in noncash current assets minus changes in current liabilities less short-term debt and taxes payable. In particular, Oa equals (dCA-dCASH)-(dCL-dSTD-dTP)-DP, in which dCA is the change in current assets (Compustat annual item ACT), dCASH is the change in cash or cash equiv- alents (item CHE), dCL is the change in current liabilities (item LCT), dSTD is the change in debt included in current liabilities (item DLC), dTP is the change in income taxes payable (item TXP), and DP is depreciation and amortization (item DP). Missing changes in income taxes payable are set to zero. Starting from 1988, we follow Hribar and Collins (2002) to measure Oa using the state- ment of cash flows as net income (item NI) minus net cash flow from operations (item OANCF). 
Doing so helps mitigate measurement errors that can arise from nonoperating activities such as acquisitions and divestitures.
Missing changes in debt in current liabilities, long-term investments, long-term debt, short-term investments, and preferred stocks are set to zero. -Starting from 1988, we use the cash flow approach to measure Ta as net income (item NI) minus total operating, investing, and financing cash flows (items OANCF, IVNCF, and FINCF) plus sales of stocks (item SSTK, zero if missing) minus stock repurchases and dividends (items PRSTKC and DV, zero if missing). Data from the statement of cash flows are only available since 1988. At the end of June of each year t, we sort stocks into deciles based on Ta for the fiscal year ending in calendar year t - 1 scaled by total assets for the fiscal year ending in t - 2. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." -A.3.18,dCoa,changes in Current Operating Assets,"Richardson, Sloan, Soliman, and Tuna",2005,Investment,data_rawa['dcoa'] = (data_rawa['coa']-data_rawa['coa'].shift(1)) / data_rawa['at'].shift(1),"# dCoa -data_rawa['coa'] = data_rawa['act'] - data_rawa['che']",0,1,0,0,"Richardson, Sloan, Soliman, and Tuna (2005, Table 10) show that several components of total accruals also forecast returns in the cross section. dWc is the change in net non-cash working capital. Net non-cash working capital is current operating asset (Coa) minus current operating liabilities (Col), with Coa = current assets (Compustat annual item ACT) - cash and short term investments (item CHE) and Col = current liabilities (item LCT) - debt in current liabilities (item DLC). dCoa is the change in current operating asset and dCol is the change in current operating liabilities. Missing changes in debt in current liabilities are set to zero. 
At the end of June of each year t, we sort stocks into deciles based, separately, on dWc, dCoa, and dCol for the fiscal year ending in calendar year t - 1, all scaled by total assets (item AT) for the fiscal year ending in calendar year t - 2. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." -A.3.19,dNca,changes in Non-current Operating Assets,"Richardson, Sloan, Soliman, and Tuna",2005,Investment,data_rawa['dnca'] = data_rawa['nco'] - data_rawa['nco'].shift(1),"# dNca -data_rawa['ivao_0'] = np.where(data_rawa['ivao'].isnull(), 0, data_rawa['ivao']) -data_rawa['dltt_0'] = np.where(data_rawa['dltt'].isnull(), 0, data_rawa['dltt']) -data_rawa['nca'] = data_rawa['at'] - data_rawa['act'] - data_rawa['ivao_0'] -data_rawa['ncl'] = data_rawa['lt'] - data_rawa['lct'] - data_rawa['dltt_0'] -data_rawa['nco'] = data_rawa['nca'] - data_rawa['ncl'] -",0,1,0,0,"dNco is the change in net non-current operating assets. Net non-current operating assets are non- current operating assets (Nca) minus non-current operating liabilities (Ncl), with Nca = total assets (Compustat annual item AT) - current assets (item ACT) - long-term investments (item IVAO), and Ncl = total liabilities (item LT) - current liabilities (item LCT) - long-term debt (item DLTT). dNca is the change in non-current operating assets and dNcl is the change in non-current operating liabilities. Missing changes in long-term investments and long-term debt are set to zero. At the end of June of each year t, we sort stocks into deciles based, separately, on dNco, dNca, and dNcl for the fiscal year ending in calendar year t - 1, all scaled by total assets for the fiscal year ending in calendar year t - 2. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." 
-A.3.19,dNco,Changes in Net Non-current Operating Assets,"Richardson, Sloan, Soliman, and Tuna",2005,Investment,data_rawa['dnco'] = data_rawa['nco'] - data_rawa['nco'].shift(1),"# dNco -data_rawa['nca'] = data_rawa['at'] - data_rawa['act'] - data_rawa['ivao'] -data_rawa['ncl'] = data_rawa['lt'] - data_rawa['lct'] - data_rawa['dltt'] -data_rawa['nco'] = data_rawa['nca'] - data_rawa['ncl']",0,1,0,0,"dNco is the change in net non-current operating assets. Net non-current operating assets are non- current operating assets (Nca) minus non-current operating liabilities (Ncl), with Nca = total assets (Compustat annual item AT) - current assets (item ACT) - long-term investments (item IVAO), and Ncl = total liabilities (item LT) - current liabilities (item LCT) - long-term debt (item DLTT). dNca is the change in non-current operating assets and dNcl is the change in non-current operating liabilities. Missing changes in long-term investments and long-term debt are set to zero. At the end of June of each year t, we sort stocks into deciles based, separately, on dNco, dNca, and dNcl for the fiscal year ending in calendar year t - 1, all scaled by total assets for the fiscal year ending in calendar year t - 2. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." -A.3.2,I/A, Investment-to-assets,"Cooper, Gulen, and Schill",2008,Investment,data_rawa['ia'] = (data_rawa['at']/data_rawa['at_l1'])-1,data_rawa['at_l1'] = data_rawa.groupby(['permno'])['at'].shift(1),1,1,0,0,"At the end of June of each year t, we sort stocks into deciles based on investment-to-assets, I/A, which is measured as total assets (Compustat annual item AT) for the fiscal year ending in calendar year t-1 divided by total assets for the fiscal year ending in t-2 minus one. Monthly decile returns are computed from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." 
-A.3.20,dBe,Changes in Book Equity,"Richardson, Sloan, Soliman, and Tuna",2005,Investment,
-A.3.20,dFin,Changes in Net Financial Assets,"Richardson, Sloan, Soliman, and Tuna",2005,Investment,
When constructing dSti (dLti), we exclude firms that do not have long-term (short-term) investments in the past two fiscal years. At the end of June of each year t, we sort stocks into deciles based, separately, on dFin, dSti, dLti, dFnl, and dBe for the fiscal year ending in calendar year t - 1, all scaled by total assets (item AT) for the fiscal year ending in calendar year t - 2. Monthly decile returns are calculated from July of year t to June of t+1, and the deciles are rebalanced in June of t+1." 
-A.3.20,dFnl,Changes in Financial Liabilities,"Richardson, Sloan, Soliman, and Tuna",2005,Investment,data_rawa['dfnl'] = data_rawa['dfnl'] / data_rawa['at'].shift(1),* dfnl in dFin,0,1,0,0,"dFin is the change in net financial assets. Net financial assets are financial assets (Fna) minus financial liabilities (Fnl), with Fna = short-term investments (Compustat annual item IVST) + long-term investments (item IVAO), and Fnl = long-term debt (item DLTT) + debt in current liabilities (item DLC) + preferred stock (item PSTK). dSti is the change in short-term investments, dLti is the change in long-term investments, and dFnl is the change in financial liabilities. dBe is the change in book equity (item CEQ). Missing changes in debt in current liabilities, long-term investments, long-term debt, short-term investments, and preferred stocks are set to zero (at least one change has to be non-missing when constructing any variable). When constructing dSti (dLti), we exclude firms that do not have long-term (short-term) investments in the past two fiscal years. At the end of June of each year t, we sort stocks into deciles based, separately, on dFin, dSti, dLti, dFnl, and dBe for the fiscal year ending in calendar year t - 1, all scaled by total assets (item AT) for the fiscal year ending in calendar year t - 2. Monthly decile returns are calculated from July of year t to June of t+1, and the deciles are rebalanced in June of t+1." 
-A.3.22,Poa,Percent operating accruals,"Hafzalla, Lundholm, and Van Winkle",2011,Investment,data_rawa['poa'] = data_rawa['oa']/data_rawa['ni'],* oa(acc),0,1,0,0,"Accruals are traditionally scaled by total assets. Hafzalla, Lundholm, and Van Winkle (2011) show that scaling accruals by the absolute value of earnings (percent accruals) is more effective in se- lecting firms for which the differences between sophisticated and naive forecasts of earnings are the most extreme. To construct the percent operating accruals (Poa) deciles, at the end of June of each year t, we sort stocks into deciles based on operating accruals scaled by the absolute value of net income (Compustat annual item NI) for the fiscal year ending in calendar year t - 1. See Appendix A.3.16 for the measurement of operating accruals. Monthly decile returns are calculated from July of year t to June of t+1, and the deciles are rebalanced in June of t+1." -A.3.23,Pta,Percent total accruals,"Hafzalla, Lundholm, and Van Winkle",2011,Investment,,,,,,,"At the end of June of each year t, we sort stocks into deciles on percent total accruals, Pta, cal- culated as total accruals scaled by the absolute value of net income (Compustat annual item NI) for the fiscal year ending in calendar year t - 1. See Appendix A.3.17 for the measurement of total accruals. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of year t + 1." -A.3.24,Pda,Percent discretionary accruals,,,Investment,,,,,,,"At the end of June of each year t, we split stocks into deciles based on percent discretionary accruals, Pda, calculated as the discretionary accruals, Dac, for the fiscal year ending in calendar year t - 1 multiplied with total assets (Compustat annual item AT) for the fiscal year ending in t - 2 scaled by the absolute value of net income (item NI) for the fiscal year ending in t - 1. See Appendix A.3.21 for the measurement of discretionary accruals. 
Monthly decile returns are calculated from July of year t to June of t+1, and the deciles are rebalanced in June of t+1." -A.3.25,Ndf,Net debt finance,"Bradshaw, Richardson, and Sloan",2006,Investment,data_rawa['ndf'] = data_rawa['dltis'] - data_rawa['dltr'] + data_rawa['dlcch'] ,,0,1,0,0,"Net external financing, Nxf, is the sum of net equity financing, Nef, and net debt financing, Ndf (Bradshaw, Richardson, and Sloan 2006). Nef is the proceeds from the sale of common and pre- ferred stocks (Compustat annual item SSTK) less cash payments for the repurchases of common and preferred stocks (item PRSTKC) less cash payments for dividends (item DV). Ndf is the cash proceeds from the issuance of long-term debt (item DLTIS) less cash payments for long-term debt reductions (item DLTR) plus the net changes in current debt (item DLCCH, zero if missing). At the end of June of each year t, we sort stocks into deciles based on Nxf, and, separately, on Nef and Ndf, for the fiscal year ending in calendar year t - 1 scaled by the average of total assets for fiscal years ending in t - 2 and t - 1. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1. Because the data on financing activities start in 1971, the portfolios start in July 1972." -A.3.3,"Ia q 6, and Ia q 12",Quarterly Investment-to-assets,"Cooper, Gulen, and Schill",2008,Investment,data_rawq['iaq'] = (data_rawq['atq']/data_rawq['atqlag'])-1,"data_rawq['atqlag'] = ttm4('atq',data_rawq)",0,0,1,0,"Quarterly investment-to-assets, Iaq, is defined as quarterly total assets (Compustat quarterly item ATQ) divided by four-quarter-lagged total assets minus one. At the beginning of each month t, we sort stocks into deciles based on Iaq for the latest fiscal quarter ending at least four months ago. 
Monthly decile returns are calculated for the current month t (Iaq1), from month t to t + 5 (Iaq6), and from month t to t + 11 (Iaq12), and the deciles are rebalanced at the beginning of month t + 1. The holding period longer than one month as in, for instance, Iaq6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Iaq6 decile." -A.3.4,dPia,Changes in PPE and Inventory-to-assets,"Lyandres, Sun, and Zhang",2008,Investment,data_rawa['dpia'] = (data_rawa['c_propty'] + data_rawa['c_invt']) / data_rawa['at'].shift(1),"data_rawa['c_propty'] = data_rawa['ppegt'] - data_rawa['ppegt'].shift(1) -data_rawa['c_invt'] = data_rawa['invt'] - data_rawa['invt'].shift(1)",0,1,0,0,"Changes in PPE and Inventory-to-assets, dPia, is defined as the annual change in gross property, plant, and equipment (Compustat annual item PPEGT) plus the annual change in inventory (item INVT) scaled by one-year-lagged total assets (item AT). At the end of June of each year t, we sort stocks into deciles based on dPia for the fiscal year ending in calendar year t-1. Monthly decile re- turns are computed from July of year t to June of t+1, and the deciles are rebalanced in June of t+1." 
-A.3.5,Noa and dNoa,(Changes in) Net Operating Assets,"Hirshleifer, Hou, Teoh, and Zhang",2004,Investment,"data_rawq['noa'] = (data_rawq['atq']-data_rawq['cheq']-data_rawq['ivaoq'])-\ - (data_rawq['atq']-data_rawq['dlcq']-data_rawq['dlttq']-data_rawq['mibq']-data_rawq['pstkq']-data_rawq['ceqq'])/data_rawq['atq_l4'] -data_rawa['dnoa'] = (data_rawa['net_op']-data_rawa['net_op'].shift(1))/ data_rawa['at'].shift(1) -","#noa -data_rawq['atq_l4'] = data_rawq.groupby(['permno'])['atq'].shift(4) -data_rawq['ivaoq'] = np.where(data_rawq['ivaoq'].isnull(), 0, 1) -data_rawq['dlcq'] = np.where(data_rawq['dlcq'].isnull(), 0, 1) -data_rawq['dlttq'] = np.where(data_rawq['dlttq'].isnull(), 0, 1) -data_rawq['mibq'] = np.where(data_rawq['mibq'].isnull(), 0, 1) -data_rawq['pstkq'] = np.where(data_rawq['pstkq'].isnull(), 0, 1) -# dNoa -data_rawa['dlc_0'] = np.where(data_rawa['dlc'].isnull(), 0, data_rawa['dlc']) -data_rawa['dltt_0'] = np.where(data_rawa['dltt'].isnull(), 0, data_rawa['dltt']) -data_rawa['mib_0'] = np.where(data_rawa['mib'].isnull(), 0, data_rawa['mib']) -data_rawa['pstk_0'] = np.where(data_rawa['pstk'].isnull(), 0, data_rawa['pstk']) - -data_rawa['op_at'] = data_rawa['at'] - data_rawa['che'] -data_rawa['op_lia'] = data_rawa['at'] - data_rawa['dlc_0'] - data_rawa['dltt_0'] - data_rawa['mib_0'] - data_rawa['pstk_0'] - data_rawa['ceq'] -data_rawa['net_op'] = data_rawa['op_at'] - data_rawa['op_lia']",1,1,1,0,"Following Hirshleifer, Hou, Teoh, and Zhang (2004), we measure net operating assets as operating assets minus operating liabilities. Operating assets are total assets (Compustat annual item AT) minus cash and short-term investment (item CHE). Operating liabilities are total assets minus debt included in current liabilities (item DLC, zero if missing), minus long-term debt (item DLTT, zero if missing), minus minority interests (item MIB, zero if missing), minus preferred stocks (item PSTK, zero if missing), and minus common equity (item CEQ). 
Noa is net operating assets scaled by one-year-lagged total assets. Changes in net operating assets, dNoa, is the annual change in net operating assets scaled by one-year-lagged total assets. At the end of June of each year t, we sort stocks into deciles based on Noa, and separately, on dNOA, for the fiscal year ending in calendar year t - 1. Monthly decile returns are computed from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." 
-A.3.6,dLno,Changes in Long-term Net Operating Assets,"Fairfield, Whisenant, and Yohn",2003,Investment,"data_rawa['dlno'] = (data_rawa['ppent']-data_rawa['ppent'].shift(1)) + (data_rawa['intan']-data_rawa['intan'].shift(1)) + (data_rawa['ao']-data_rawa['ao'].shift(1)) - (data_rawa['lo']-data_rawa['lo'].shift(1)) + data_rawa['dp']
-*
-data_rawa['dlno'] = data_rawa['dlno'] / data_rawa['avg_at']","*
-avg_at = []
-for i in range(data_rawa.shape[0]):
- avg_at.append(data_rawa.loc[0:i, 'at'].mean())
-data_rawa['avg_at'] = pd.DataFrame(avg_at)",0,1,0,0,"Following Fairfield, Whisenant, and Yohn (2003), we measure changes in long-term net operating assets as the annual change in net property, plant, and equipment (Compustat item PPENT) plus the change in intangibles (item INTAN) plus the change in other long-term assets (item AO) minus the change in other long-term liabilities (item LO) and plus depreciation and amortization expense (item DP). dLno is the change in long-term net operating assets scaled by the average of total assets (item AT) from the current and prior years. At the end of June of each year t, we sort stocks into deciles based on dLno for the fiscal year ending in calendar year t - 1. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." 
-A.3.7,Ig,Investment Growth,Xing,2008,Investment,data_rawa['ig'] = data_rawa['capx']/data_rawa['capx_l1'],data_rawa['capx_l1'] = data_rawa.groupby('permno')['capx'].shift(1),1,1,0,0,"At the end of June of each year t, we sort stocks into deciles based on investment growth, Ig, which is the growth rate in capital expenditure (Compustat annual item CAPX) from the fiscal year ending in calendar year t - 2 to the fiscal year ending in t - 1. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." -A.3.8,2Ig,2-year Investment Growth,Anderson and Garcia-Feijoo,2006,Investment,data_rawa['2ig'] = data_rawa['capx']/data_rawa['capx_l2'],data_rawa['capx_l2'] = data_rawa.groupby('permno')['capx'].shift(2),0,1,0,0,"At the end of June of each year t, we sort stocks into deciles based on two-year investment growth, 2Ig, which is the growth rate in capital expenditure (Compustat annual item CAPX) from the fiscal year ending in calendar year t - 3 to the fiscal year ending in t - 1. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." -A.4.1,"Roe1, Roe6",Return on Equity,"Hou, Xue, and Zhang",2015,Profitability,data_rawq['roe'] = data_rawq['ibq']/data_rawq['ceqq_l1'],data_rawq['ceqq_l1'] = data_rawq.groupby(['permno'])['ceqq'].shift(1),1,0,1,0,"Return on equity, Roe, is income before extraordinary items (Compustat quarterly item IBQ) di- vided by one-quarter-lagged book equity (Hou, Xue, and Zhang 2015). Book equity is shareholders' equity, plus balance sheet deferred taxes and investment tax credit (item TXDITCQ) if available, minus the book value of preferred stock (item PSTKQ). Depending on availability, we use stockhold- ers' equity (item SEQQ), or common equity (item CEQQ) plus the book value of preferred stock, or total assets (item ATQ) minus total liabilities (item LTQ) in that order as shareholders' equity." 
-A.4.11,"Gla q 1, Gla q 6, and Gla q 12",Quarterly Gross Profits-to-lagged Assets,,,Profitability,,,,,,,"Glaq, is quarterly total revenue (Compustat quarterly item REVTQ) minus cost of goods sold (item COGSQ) divided by one-quarter-lagged total assets (item ATQ). At the beginning of each month t, we sort stocks into deciles based on Glaq for the fiscal quarter ending at least four months ago. Monthly decile returns are calculated for month t (Glaq1), from month t to t+5 (Glaq6), and from month t to t + 11 (Glaq12). The deciles are rebalanced at the beginning of t + 1. The holding period that is longer than one month as in, for instance, Glaq6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six-month period. We take the simple average of the subdecile returns as the monthly return of the Glaq6 decile. For sufficient data coverage, the Glaq portfolios start in January 1976." -A.4.12,Ope(operprof),Operating Profits to Equity,Fama and French,2015,Profitability,data_rawa['operprof'] = (data_rawa['revt']-data_rawa['cogs']-data_rawa['xsga0']-data_rawa['xint0'])/data_rawa['ceq_l1'],"data_rawa['cogs0'] = np.where(data_rawa['cogs'].isnull(), 0, data_rawa['cogs']) -data_rawa['xint0'] = np.where(data_rawa['xint'].isnull(), 0, data_rawa['xint']) -data_rawa['xsga0'] = np.where(data_rawa['xsga'].isnull(), 0, data_rawa['xsga'])",0,1,0,0,"Following Fama and French (2015), we measure operating profitability to equity, Ope, as total rev- enue (Compustat annual item REVT) minus cost of goods sold (item COGS, zero if missing), minus selling, general, and administrative expenses (item XSGA, zero if missing), and minus interest ex- pense (item XINT, zero if missing), scaled by book equity (the denominator is current, not lagged, book equity). We require at least one of the three expense items (COGS, XSGA, and XINT) to be non-missing. 
Book equity is stockholders' book equity, plus balance sheet deferred taxes and investment tax credit (item TXDITC) if available, minus the book value of preferred stock. Stock- holders' equity is the value reported by Compustat (item SEQ), if it is available. If not, we measure stockholders' equity as the book value of common equity (item CEQ) plus the par value of preferred stock (item PSTK), or the book value of assets (item AT) minus total liabilities (item LT). Depend- ing on availability, we use redemption (item PSTKRV), liquidating (item PSTKL), or par value (item PSTK) for the book value of preferred stock. At the end of June of each year t, we sort stocks into deciles based on Ope for the fiscal year ending in calendar year t - 1. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." -A.4.14,"Ole q 1, Ole q 6 ",Quarterly Operating Profits-to-lagged Equity,,,Profitability,,,,,,,"Quarterly operating profits-to-lagged equity, Oleq, is quarterly total revenue (Compustat quarterly item REVTQ) minus cost of goods sold (item COGSQ, zero if missing), minus selling, general, and administrative expenses (item XSGAQ, zero if missing), and minus interest expense (item XINTQ, zero if missing), scaled by one-quarter-lagged book equity. We require at least one of the three expense items (COGSQ, XSGAQ, and XINTQ) to be non-missing. Book equity is shareholders' equity, plus balance sheet deferred taxes and investment tax credit (item TXDITCQ) if available, minus the book value of preferred stock (item PSTKQ). Depending on availability, we use stockhold- ers' equity (item SEQQ), or common equity (item CEQQ) plus the book value of preferred stock, or total assets (item ATQ) minus total liabilities (item LTQ) in that order as shareholders' equity. -At the beginning of each month t, we split stocks on Oleq for the fiscal quarter ending at least four months ago. 
Monthly decile returns are calculated for month t (Oleq 1), from month t to t + 5 (Oleq6), and from month t to t + 11 (Oleq12). The deciles are rebalanced at the beginning of t + 1. The holding period longer than one month as in Oleq6 means that for a given decile in each month there exist six subdeciles, each initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Oleq6 decile. For sufficient data coverage, the Oleq portfolios start in January 1972." 
-A.4.15,Opa,Operating Profits-to-assets,"Ball, Gerakos, Linnainmaa, and Nikolaev",2015,Profitability,,,,,,,"Following Ball, Gerakos, Linnainmaa, and Nikolaev (2015), we measure operating profits-to-assets, Opa, as total revenue (Compustat annual item REVT) minus cost of goods sold (item COGS), minus selling, general, and administrative expenses (item XSGA), and plus research and develop- ment expenditures (item XRD, zero if missing), scaled by book assets (item AT, the denominator is current, not lagged, total assets). At the end of June of each year t, we sort stocks into deciles based on Opa for the fiscal year ending in calendar year t-1. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." 
-A.4.17,"Ola q 1, Ola q 6, and Ola q 12",Quarterly Operating Profits-to-lagged Assets,,,Profitability,,,,,,,"Quarterly operating profits-to-lagged assets, Olaq, is quarterly total revenue (Compustat quarterly item REVTQ) minus cost of goods sold (item COGSQ), minus selling, general, and administra- tive expenses (item XSGAQ), plus research and development expenditures (item XRDQ, zero if missing), scaled by one-quarter-lagged book assets (item ATQ). At the beginning of each month t, we sort stocks into deciles based on Olaq for the fiscal quarter ending at least four months ago. 
Monthly decile returns are calculated for month t (Olaq1), from month t to t+5 (Olaq6), and from month t to t + 11 (Olaq12). The deciles are rebalanced at the beginning of t + 1. The holding period longer than one month as in Olaq6 means that for a given decile in each month there exist six subdeciles, each initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Olaq6 decile. For sufficient data coverage, the Olaq portfolios start in January 1976." -A.4.18,Cop,Cash-based Operating Profitability,"Gerakos, Linnainmaa, and Nikolaev",2016,Profitability,"data_rawa['cop'] = data_rawa['revt'] - data_rawa['cogs'] - data_rawa['xsga'] + data_rawa['xrd_0']\ - - data_rawa['d_rect_0'] - data_rawa['d_invt_0'] - data_rawa['d_xpp_0']\ - + data_rawa['d_dr_0'] + data_rawa['d_ap_0'] + data_rawa['d_xacc_0'] -data_rawa['cop'] = data_rawa['cop'] / data_rawa['at'] ",* Cla,0,1,0,0,"Following Ball, Gerakos, Linnainmaa, and Nikolaev (2016), we measure cash-based operating prof- itability, Cop, as total revenue (Compustat annual item REVT) minus cost of goods sold (item COGS), minus selling, general, and administrative expenses (item XSGA), plus research and de- velopment expenditures (item XRD, zero if missing), minus change in accounts receivable (item RECT), minus change in inventory (item INVT), minus change in prepaid expenses (item XPP), plus change in deferred revenue (item DRC plus item DRLT), plus change in trade accounts payable (item AP), and plus change in accrued expenses (item XACC), all scaled by book assets (item AT, the denominator is current, not lagged, total assets). All changes are annual changes in balance sheet items and we set missing changes to zero. At the end of June of each year t, we sort stocks into deciles based on Cop for the fiscal year ending in calendar year t - 1. 
Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." -A.4.19,Cla,Cash-based Operating Profits-to-lagged Assets,,,Profitability,"data_rawa['cla'] = data_rawa['revt'] - data_rawa['cogs'] - data_rawa['xsga'] + data_rawa['xrd_0']\ - - data_rawa['d_rect_0'] - data_rawa['d_invt_0'] - data_rawa['d_xpp_0']\ - + data_rawa['d_dr_0'] + data_rawa['d_ap_0'] + data_rawa['d_xacc_0'] -data_rawa['cla'] = data_rawa['cla'] / data_rawa['at'].shift(1)","data_rawa['d_rect'] = data_rawa['rect'] - data_rawa['rect'].shift(1) -data_rawa['d_invt'] = data_rawa['invt'] - data_rawa['invt'].shift(1) -data_rawa['d_xpp'] = data_rawa['xpp'] - data_rawa['xpp'].shift(1) -data_rawa['d_dr'] = (data_rawa['drc']-data_rawa['drc'].shift(1)) + (data_rawa['drlt']-data_rawa['drlt'].shift(1)) -data_rawa['d_ap'] = data_rawa['ap'] - data_rawa['ap'].shift(1) -data_rawa['d_xacc'] = data_rawa['xacc'] - data_rawa['xacc'].shift(1) - -data_rawa['xrd_0'] = np.where(data_rawa['xrd'].isnull(), 0, data_rawa['xrd']) -data_rawa['d_rect_0'] = np.where(data_rawa['d_rect'].isnull(), 0, data_rawa['d_rect']) -data_rawa['d_invt_0'] = np.where(data_rawa['d_invt'].isnull(), 0, data_rawa['d_invt']) -data_rawa['d_xpp_0'] = np.where(data_rawa['d_xpp'].isnull(), 0, data_rawa['d_xpp']) -data_rawa['d_dr_0'] = np.where(data_rawa['d_dr'].isnull(), 0, data_rawa['d_dr']) -data_rawa['d_ap_0'] = np.where(data_rawa['d_ap'].isnull(), 0, data_rawa['d_ap']) -data_rawa['d_xacc_0'] = np.where(data_rawa['d_xacc'].isnull(), 0, data_rawa['d_xacc'])",0,1,0,0,"Cash-based operating profits-to-lagged assets, Cla, is total revenue (Compustat annual item REVT) minus cost of goods sold (item COGS), minus selling, general, and administrative expenses (item XSGA), plus research and development expenditures (item XRD, zero if missing), minus change in accounts receivable (item RECT), minus change in inventory (item INVT), minus change in prepaid expenses (item XPP), plus change in deferred 
revenue (item DRC plus item DRLT), plus change in trade accounts payable (item AP), and plus change in accrued expenses (item XACC), all scaled by one-year-lagged book assets (item AT). All changes are annual changes in balance sheet items and we set missing changes to zero. At the end of June of each year t, we sort stocks into deciles based on Cla for the fiscal year ending in calendar year t - 1. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." -A.4.2,"dRoe1, dRoe6, and dRoe12",Changes in Return on Equity,"Hou, Xue, and Zhang",2015,Profitability,,,,,,,"Change in return on equity, dRoe, is return on equity minus its value from four quarters ago. See Appendix A.4.1 for the measurement of return on equity. At the beginning of each month t, we sort all stocks into deciles on their most recent past dRoe. Before 1972, we use the most recent dRoe with quarterly earnings from fiscal quarters ending at least four months ago. Starting from 1972, we use dRoe computed with quarterly earnings from the most recent quarterly earnings announcement dates (Compustat quarterly item RDQ). For a firm to enter the portfolio formation, we require the end of the fiscal quarter that corresponds to its most recent dRoe to be within six months prior to the portfolio formation. This restriction is imposed to exclude stale earnings information. To avoid potentially erroneous records, we also require the earnings announcement date to be after the corresponding fiscal quarter end. Monthly decile returns are calculated for the current month t (dRoe1), from month t to t + 5 (dRoe6), and from month t to t + 11 (dRoe12). The deciles are rebalanced monthly. The holding period that is longer than one month as in, for instance, dRoe6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six-month period. 
We take the simple average of the subdeciles returns as the monthly return of the dRoe6 decile." -A.4.20,Claq,Quarterly Cash-based Operating Profits-to-lagged Assets,,,Profitability,,,,,,,"Quarterly cash-based operating profits-to-lagged assets, Cla, is quarterly total revenue (Compustat quarterly item REVTQ) minus cost of goods sold (item COGSQ), minus selling, general, and ad- ministrative expenses (item XSGAQ), plus research and development expenditures (item XRDQ, zero if missing), minus change in accounts receivable (item RECTQ), minus change in inventory (item INVTQ), plus change in deferred revenue (item DRCQ plus item DRLTQ), and plus change in trade accounts payable (item APQ), all scaled by one-quarter-lagged book assets (item ATQ). All changes are quarterly changes in balance sheet items and we set missing changes to zero. At the beginning of each month t, we split stocks on Claq for the fiscal quarter ending at least four months ago. Monthly decile returns are calculated for month t (Claq1), from month t to t + 5 (Claq6), and from month t to t + 11 (Claq12). The deciles are rebalanced at the beginning of t + 1. The holding period longer than one month as in Claq6 means that for a given decile in each month there exist six subdeciles, each initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Claq6 decile. For sufficient data coverage, the Claq portfolios start in January 1976." -A.4.29,Tbi q 12,Quarterly Taxable Income-to-book Income,"Green, Hand, and Zhang",2013,Profitability,,,,,,,"Quarterly taxable income-to-book income, Tbiq, is quarterly pretax income (Compustat quarterly item PIQ) divided by net income (NIQ). At the beginning of each month t, we split stocks into deciles based on Tbiq calculated with accounting data from the fiscal quarter ending at least four months ago. We exclude firms with non-positive pretax income or net income. 
We calculate monthly decile returns for the current month t (Tbiq1), from month t to t + 5 (Tbiq6), and from month t to t+11 (Tbiq12). The deciles are rebalanced at the beginning of month t+1. The holding period that is longer than one month as in, for instance, Tbiq6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six-month period. We take the simple average of the subdecile returns as the monthly return of the Tbiq6 decile." -A.4.3,Roa1,Return on Assets,"Balakrishnan, Bartov, and Faurel",2010,Profitability,data_rawq['roa'] = data_rawq['ibq']/data_rawq['atq_l1'],data_rawq['atq_l1'] = data_rawq.groupby(['permno'])['atq'].shift(1),1,0,1,0,"Return on assets, Roa, is income before extraordinary items (Compustat quarterly item IBQ) di- vided by one-quarter-lagged total assets (item ATQ). At the beginning of each month t, we sort all stocks into deciles based on Roa computed with quarterly earnings from the most recent earnings announcement dates (item RDQ). For a firm to enter the portfolio formation, we require the end of the fiscal quarter that corresponds to its most recent Roa to be within six months prior to the portfolio formation. This restriction is imposed to exclude stale earnings information. To avoid potentially erroneous records, we also require the earnings announcement date to be after the corre- sponding fiscal quarter end. Monthly decile returns are calculated for month t (Roa1), from month t to t+5 (Roe6), and from month t to t+11 (Roe12). The deciles are rebalanced at the beginning of t + 1. The holding period that is longer than one month as in, for instance, Roa6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six-month period. We take the simple average of the subdeciles returns as the monthly return of the Roa6 decile. 
For sufficient data coverage, the Roa portfolios start in January 1972." -A.4.4,"dRoa1, dRoa6",Changes in Return on Assets,"Balakrishnan, Bartov, and Faurel",2010,Profitability,,,,,,,"Change in return on assets, dRoa, is return on assets minus its value from four quarters ago. See Appendix A.4.3 for the measurement of return on assets. At the beginning of each month t, we sort all stocks into deciles based on dRoa computed with quarterly earnings from the most recent earnings announcement dates (Compustat quarterly item RDQ). For a firm to enter the portfo- lio formation, we require the end of the fiscal quarter that corresponds to its most recent dRoa to be within six months prior to the portfolio formation. This restriction is imposed to exclude stale earnings information. To avoid potentially erroneous records, we also require the earnings announcement date to be after the corresponding fiscal quarter end. Monthly decile returns are calculated for month t (dRoa1), from month t to t + 5 (dRoa6), and from month t to t + 11 (dRoa12). The deciles are rebalanced at the beginning of t + 1. The holding period that is longer than one month as in, for instance, dRoa6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six-month period. We take the simple average of the subdecile returns as the monthly return of the dRoa6 decile. For sufficient data coverage, the dRoa portfolios start in January 1973." -A.4.5,Ato,Asset Turnover,Soliman,2008,Profitability,data_rawq['ato'] = data_rawq['saleq']/data_rawq['noa_l4'],"* noa -* noa_l4 from rna",1,0,1,0,"Soliman (2008) use DuPont analysis to decompose Roe as Rna + FLEV * SPREAD, in which Roe is return on equity, Rna is return on net operating assets, FLEV is financial leverage, and SPREAD is the difference between return on net operating assets and borrowing costs. 
We can further decompose Rna as Pm * Ato, in which Pm is profit margin and Ato is asset turnover. -Following Soliman (2008), we use annual sorts to form Rna, Pm, and Ato deciles. At the end of June of year t, we measure Rna as operating income after depreciation (Compustat annual item OIADP) for the fiscal year ending in calendar year t - 1 divided by net operating assets (Noa) for the fiscal year ending in t - 2. Noa is operating assets minus operating liabilities. Operating assets are total assets (item AT) minus cash and short-term investment (item CHE), and minus other investment and advances (item IVAO, zero if missing). Operating liabilities are total assets minus debt in current liabilities (item DLC, zero if missing), minus long-term debt (item DLTT, zero if missing), minus minority interests (item MIB, zero if missing), minus preferred stocks (item PSTK, zero if missing), and minus common equity (item CEQ). Pm is operating income after depreciation divided by sales (item SALE) for the fiscal year ending in calendar year t - 1. Ato is sales for the fiscal year ending in calendar year t - 1 divided by Noa for the fiscal year ending in t - 2. At the end of June of each year t, we sort stocks into three sets of deciles based on Rna, Pm, and Ato. We exclude firms with non-positive Noa for the fiscal year ending in calendar year t - 2 when forming the Rna and the Ato portfolios. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." -A.4.5,pm,profit margin,Soliman,2008,Profitability,,,,,,,"Soliman (2008) use DuPont analysis to decompose Roe as Rna + FLEV * SPREAD, in which Roe is return on equity, Rna is return on net operating assets, FLEV is financial leverage, and SPREAD is the difference between return on net operating assets and borrowing costs. We can further decompose Rna as Pm * Ato, in which Pm is profit margin and Ato is asset turnover. 
-Following Soliman (2008), we use annual sorts to form Rna, Pm, and Ato deciles. At the end of June of year t, we measure Rna as operating income after depreciation (Compustat annual item OIADP) for the fiscal year ending in calendar year t - 1 divided by net operating assets (Noa) for the fiscal year ending in t - 2. Noa is operating assets minus operating liabilities. Operating assets are total assets (item AT) minus cash and short-term investment (item CHE), and minus other investment and advances (item IVAO, zero if missing). Operating liabilities are total assets minus debt in current liabilities (item DLC, zero if missing), minus long-term debt (item DLTT, zero if missing), minus minority interests (item MIB, zero if missing), minus preferred stocks (item PSTK, zero if missing), and minus common equity (item CEQ). Pm is operating income after depreciation divided by sales (item SALE) for the fiscal year ending in calendar year t - 1. Ato is sales for the fiscal year ending in calendar year t - 1 divided by Noa for the fiscal year ending in t - 2. At the end of June of each year t, we sort stocks into three sets of deciles based on Rna, Pm, and Ato. We exclude firms with non-positive Noa for the fiscal year ending in calendar year t - 2 when forming the Rna and the Ato portfolios. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." -A.4.6,Cto,Capital Turnover,Haugen and Baker,1996,Profitability,data_rawa['cto'] = data_rawa['sale'] / data_rawa['at'].shift(1),,0,1,0,0,"At the end of June of each year t, we split stocks into deciles based on capital turnover, Cto, measured as sales (Compustat annual item SALE) for the fiscal year ending in calendar year t - 1 divided by total assets (item AT) for the fiscal year ending in t - 2. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." 
-A.4.7,"Rna q 1, Rna q 6, Atoq1","Quarterly Return on Net Operating Assets, Quarterly Asset Turnover",Soliman,2008,Profitability,data_rawq['rna'] = data_rawq['oiadpq']/data_rawq['noa_l4'],"* noa -data_rawq['noa_l4'] = data_rawq.groupby(['permno'])['noa'].shift(4)",1,0,1,0,"Quarterly return on net operating assets, Rnaq, is quarterly operating income after depreciation (Compustat quarterly item OIADPQ) divided by one-quarter-lagged net operating assets (Noa). Noa is operating assets minus operating liabilities. Operating assets are total assets (item ATQ) minus cash and short-term investments (item CHEQ), and minus other investment and advances (item IVAOQ, zero if missing). Operating liabilities are total assets minus debt in current liabilities (item DLCQ, zero if missing), minus long-term debt (item DLTTQ, zero if missing), minus minority interests (item MIBQ, zero if missing), minus preferred stocks (item PSTKQ, zero if missing), and minus common equity (item CEQQ). Quarterly profit margin, Pmq, is quarterly operating income after depreciation divided by quarterly sales (item SALEQ). Quarterly asset turnover, Atoq, is quarterly sales divided by one-quarter-lagged Noa. -At the beginning of each month t, we sort stocks into deciles based on Rnaq or Pmq for the latest fiscal quarter ending at least four months ago. Separately, we sort stocks into deciles based on Atoq computed with quarterly sales from the most recent earnings announcement dates (item RDQ). Sales are generally announced with earnings during quarterly earnings announcements (Je- gadeesh and Livnat 2006). For a firm to enter the portfolio formation, we require the end of the fiscal quarter that corresponds to its most recent Atoq to be within six months prior to the portfolio formation. This restriction is imposed to exclude stale information. To avoid potentially erroneous records, we also require the earnings announcement date to be after the corresponding fiscal quarter end. 
Monthly decile returns are calculated for month t (Rnaq1, Pmq1, and Atoq1), from month t to t+5 (Rnaq6, Pmq6, and Atoq6), and from month t to t+11 (Rnaq12, Pmq12, and Atoq12). The deciles are rebalanced at the beginning of t + 1. The holding period that is longer than one month as in, for instance, Atoq6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six-month period. We take the simple average of the subdecile returns as the monthly return of the Atoq6 decile. For sufficient data coverage, the Rnaq portfolios start in January 1976 and the Atoq portfolios start in January 1972. -" -A.4.8,"Cto q1, Cto q6,",Quarterly Capital Turnover,Haugen and Baker,1996,Profitability,,,,,,,"Quarterly capital turnover, Ctoq, is quarterly sales (Compustat quarterly item SALEQ) scaled by one-quarter-lagged total assets (item ATQ). At the beginning of each month t, we sort stocks into deciles based on Ctoq computed with quarterly sales from the most recent earnings announcement dates (item RDQ). Sales are generally announced with earnings during quarterly earnings announcements (Jegadeesh and Livnat 2006). For a firm to enter the portfolio formation, we require the end of the fiscal quarter that corresponds to its most recent Ctoq to be within six months prior to the portfolio formation. This restriction is imposed to exclude stale information. To avoid potentially erroneous records, we also require the earnings announcement date to be after the corresponding fiscal quarter end. Monthly decile returns are calculated for month t (Ctoq1), from month t to t+5 (Ctoq6), and from month t to t + 11 (Ctoq12). The deciles are rebalanced at the beginning of t + 1. The holding period that is longer than one month as in, for instance, Ctoq6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six-month period. 
We take the simple average of the subdecile returns as the monthly return of the Ctoq6 decile. For sufficient data coverage, the Ctoq portfolios start in January 1972. -" -A.4.9,Gpa,Gross Profits-to-assets,Novy-Marx,2013,Profitability,,,,,,,"Following Novy-Marx (2013), we measure gross profits-to-assets, Gpa, as total revenue (Compustat annual item REVT) minus cost of goods sold (item COGS) divided by total assets (item AT, the denominator is current, not lagged, total assets). At the end of June of each year t, we sort stocks into deciles based on Gpa for the fiscal year ending in calendar year t - 1. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." -A.5.1,Oca and Ioca,(Industry-adjusted) Organizational Capital-to-assets,Eisfeldt and Papanikolaou,2013,Intangibles,,,,,,,p101 -A.5.11,Rca,R&D Capital-to-assets,Li,2011,Intangibles,,,,,,,"Following Li (2011), we measure R&D capital, Rc, by accumulating annual R&D expenses over the past five years with a linear depreciation rate of 20%: -Rcit = XRDit + 0.8 XRDit-1 + 0.6 XRDit-2 + 0.4 XRDit-3 + 0.2 XRDit-4, (A18) -in which XRDit-j is firm i's R&D expenses (Compustat annual item XRD) in year t - j. R&D capital-to-assets, Rca, is Rc scaled by total assets (item AT). At the end of June of each year t, we sort stocks into deciles based on Rca for the fiscal year ending in calendar year t - 1. We keep only firms with positive Rc. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1. For portfolio formation at the end of June of year t, we require R&D expenses to be non-missing for the fiscal year ending in calendar year t - 1, because this value of R&D expenses receives the highest weight in Rc. Because Rc requires past five years of R&D expenses data and the accounting treatment of R&D expenses was standardized in 1975, the Rca portfolios start in July 1980." 
-A.5.2,Adm,Advertising Expense-to-market,"Chan, Lakonishok, and Sougiannis",2001,Intangibles,data_rawa['adm'] = data_rawa['xad']/data_rawa['me'],* me from rawa,1,1,0,0,"At the end of June of each year t, we sort stocks into deciles based on advertising expenses-to- market, Adm, which is advertising expenses (Compustat annual item XAD) for the fiscal year ending in calendar year t - 1 divided by the market equity (from CRSP) at the end of December of t - 1. For firms with more than one share class, we merge the market equity for all share classes before computing Adm. We keep only firms with positive advertising expenses. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1. Because sufficient XAD data start in 1972, the Adm portfolios start in July 1973." -A.5.24,Etr,Effective Tax Rate,Abarbanell and Bushee,1998,Intangibles,data_rawa['etr'] = (data_rawa['txtpi'] - (data_rawa['txtpi_l1'] + data_rawa['txtpi_l2'] + data_rawa['txtpi_l3'])/3) * data_rawa['deps'],"data_rawa['txtpi'] = data_rawa['txt'] / data_rawa['pi'] -data_rawa['txtpi_l1'] = data_rawa.groupby('permno')['txtpi'].shift(1) -data_rawa['txtpi_l2'] = data_rawa.groupby('permno')['txtpi'].shift(2) -data_rawa['txtpi_l3'] = data_rawa.groupby('permno')['txtpi'].shift(3) -data_rawa['deps'] = data_rawa['epspx']/(data_rawa['ajex'] * data_rawa['prcc_f'])",0,1,0,0,p108 -A.5.4,Rdm,R&D Expense-to-market,"Chan, Lakonishok, and Sougiannis",2001,Intangibles,data_rawq['rdm'] = data_rawq['xrdq4']/data_rawq['me'],"* me from rawq -# rd -data_rawq['xrdq4'] = ttm4('xrdq', data_rawq) -data_rawq['xrdq4'] = np.where(data_rawq['xrdq4'].isnull(), data_rawq['xrdy'], data_rawq['xrdq4'])",1,0,1,0,"At the end of June of each year t, we sort stocks into deciles based on R&D-to-market, Rdm, which is R&D expenses (Compustat annual item XRD) for the fiscal year ending in calendar year t - 1 divided by the market equity (from CRSP) at the end of December of t - 1. 
For firms with more than one share class, we merge the market equity for all share classes before computing Rdm. We keep only firms with positive R&D expenses. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1. Because the accounting treatment of R&D expenses was standardized in 1975, the Rdm portfolios start in July 1976." -A.5.46,"Alm q 1, Alm q 6, and Alm q 12",Quarterly Asset Liquidity,Ortiz-Molina and Phillips,2014,Intangibles,data_rawq['alm'] = data_rawq['ala']/(data_rawq['atq']+data_rawq['me']-data_rawq['ceqq']),"data_rawq['ala'] = data_rawq['cheq'] + 0.75*(data_rawq['actq']-data_rawq['cheq'])+\ - 0.5*(data_rawq['atq']-data_rawq['actq']-data_rawq['gdwlq']-data_rawq['intanq']) -* me from rawq",1,0,1,0,"We measure quarterly asset liquidity as cash + 0.75 * noncash current assets + 0.50 * tangible fixed assets. Cash is cash and short-term investments (Compustat quarterly item CHEQ). Noncash current assets is current assets (item ACTQ) minus cash. Tangible fixed assets is total assets (item ATQ) minus current assets (item ACTQ), minus goodwill (item GDWLQ, zero if missing), and minus intangibles (item INTANQ, zero if missing). Alaq is quarterly asset liquidity scaled by one- quarter-lagged total assets. Almq is quarterly asset liquidity scaled by one-quarter-lagged market value of assets. Market value of assets is total assets plus market equity (item PRCCQ times item CSHOQ) minus book equity (item CEQQ). -At the beginning of each month t, we sort stocks into deciles based on Alaq, and separately, on Almq for the fiscal quarter ending at least four months ago. Monthly decile returns are calculated for the current month t (Alaq1 and Almq1), from month t to t + 5 (Alaq6 and Almq6), and from month t to t+11 (Alaq12 and Almq12). The deciles are rebalanced at the beginning of month t+1. 
The holding period longer than one month as in Alaq6 means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Alaq6 decile. For sufficient data coverage, the quarterly asset liquidity portfolios start in January 1976. -" -A.5.5,"Rdm q 1, Rdm q 6, and Rdm q 12",Quarterly R&D Expense-to-market,"Chan, Lakonishok, and Sougiannis",2001,Intangibles,,,,,,,"At the beginning of each month t, we split stocks into deciles based on quarterly R&D-to-market, Rdmq, which is quarterly R&D expense (Compustat quarterly item XRDQ) for the fiscal quarter ending at least four months ago scaled by the market equity (from CRSP) at the end of t - 1. For firms with more than one share class, we merge the market equity for all share classes before computing Rdmq. We keep only firms with positive R&D expenses. We calculate decile returns for the current month t (Rdmq1), from month t to t + 5 (Rdmq6), and from month t to t + 11 (Rdmq12), and the deciles are rebalanced at the beginning of month t + 1. The holding period longer than one month as in, for instance, Rdmq6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Rdmq6 decile. Because the quarterly R&D data start in late 1989, the Rdmq portfolios start in January 1990." 
-A.5.50,"R a 1 , R n 1 , R a [2,5] , R n[2,5] , R a[6,10] , R n[6,10] , R a[11,15] , and R a[16,20]",Seasonality,Heston and Sadka,2008,Intangibles,,"* crsp_mom -#Rla -crsp_mom['rla'] = crsp_mom.groupby(['permno'])['ret'].shift(12) - -#Rln -lag = pd.DataFrame() -result = 0 -for i in range(1, 12): - lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) - result = result + lag['mom%s' % i] -crsp_mom['rln'] = result/11 - -#R[2,5]a -#R[2,5]n -lag = pd.DataFrame() -result = 0 -for i in range(13,61): - lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) - if i not in [24,36,48,60]: - result = result + lag['mom%s' % i] - -crsp_mom['r25a'] = (lag['mom24']+lag['mom36']+lag['mom48']+lag['mom60'])/4 -crsp_mom['r25n'] = result/44 - -#R[6,10]a -#R[6,10]n -lag = pd.DataFrame() -result = 0 -for i in range(61,121): - lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) - if i not in [72,84,96,108,120]: - result = result + lag['mom%s' % i] - -crsp_mom['r610a'] = (lag['mom72']+lag['mom84']+lag['mom96']+lag['mom108']+lag['mom120'])/5 -crsp_mom['r610n'] = result/55 - -#R[11,15]a -lag = pd.DataFrame() -result = 0 -for i in [132,144,156,168,180]: - lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) - result = result + lag['mom%s' % i] -crsp_mom['r1115a'] = result/5 - -#R[16,20]a -lag = pd.DataFrame() -result = 0 -for i in [192,204,216,228,240]: - lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) - result = result + lag['mom%s' % i] -crsp_mom['r1620a'] = result/5",1,0,0,0,"Following Heston and Sadka (2008), at the beginning of each month t, we sort stocks into deciles -based on various measures of past performance, including returns in month t - 12 (Ra1), average -returns from month t - 11 to t - 1 (Rn1), average returns across months t - 24,t - 36,t - 48, and -t - 60 (R[2,5]), average returns from month t - 60 to t - 13 except for lags 24, 36, 48, and 60 (R[2,5]), an -average returns across months t - 72, t - 84, t - 96, t - 
108, and t - 120 (R[6,10]a), average returns -from month t - 120 to t - 61 except for lags 72, 84, 96, 108, and 120 (R[6,10]n), average returns across -months t - 132, t - 144, t - 156, t - 168, and t - 180 (R[11,15]a), average returns from month t - 180 -to t - 121 except for lags 132, 144, 156, 168, and 180 (R[11,15]n), average returns across months -t-192,t-204,t-216,t-228, and t-240 (R[16,20]a), average returns from month t-240 to t-181 -except for lags 192, 204, 216, 228, and 240 (R[16,20]n). Monthly decile returns are calculated for the -current month t, and the deciles are rebalanced at the beginning of month t + 1." -A.5.6,Rds q 6 and Rds q 12,Quarterly R&D Expense-to-sales,"Chan, Lakonishok, and Sougiannis",2001,Intangibles,data_rawq['rds'] = data_rawq['xrdq4']/data_rawq['saleq'],* xrdq4 from rdm,0,0,1,0,"At the end of June of each year t, we sort stocks into deciles based on R&D-to-sales, Rds, which is R&D expenses (Compustat annual item XRD) divided by sales (item SALE) for the fiscal year ending in calendar year t - 1. We keep only firms with positive R&D expenses. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1. Because the accounting treatment of R&D expenses was standardized in 1975, the Rds portfolios start in July 1976." -A.5.8,Ol,Operating Leverage,Novy-Marx,2011,Intangibles,data_rawa['ol'] = (data_rawa['cogs'] + data_rawa['xsga'])/data_rawa['at'],,0,1,0,0,"Following Novy-Marx (2011), operating leverage, Ol, is operating costs scaled by total assets (Compustat annual item AT, the denominator is current, not lagged, total assets). Operating costs are cost of goods sold (item COGS) plus selling, general, and administrative expenses (item XSGA). At the end of June of year t, we sort stocks into deciles based on Ol for the fiscal year ending in calendar year t - 1. 
Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." -A.5.9,"Ol q 1, Ol q 6, and Ol q 12",Quarterly Operating Leverage,Novy-Marx,2011,Intangibles,data_rawq['olq'] = (data_rawq['cogsq'] + data_rawq['xsgaq'])/data_rawq['atq'],,0,0,1,0,"At the beginning of each month t, we split stocks into deciles based on quarterly operating leverage, Olq, which is quarterly operating costs divided by assets (Compustat quarterly item ATQ) for the fiscal quarter ending at least four months ago. Operating costs are the cost of goods sold (item COGSQ) plus selling, general, and administrative expenses (item XSGAQ). We calculate decile returns for the current month t (Olq1), from month t to t + 5 (Olq6), and from month t to t + 11 (Olq12), and the deciles are rebalanced at the beginning of month t + 1. The holding period longer than one month as in, for instance, Olq6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Olq6 decile. For sufficient data coverage, the Olq portfolios start in January 1972." 
-A.6.1,Me,the market equity,Banz,1981,Frictions,"# rawq['me'] -crsp['me'] = crsp['prc'].abs() * crsp['shrout'] # calculate market equity -crsp['me'] = np.where(crsp['permno'] == crsp['permno'].shift(1), crsp['me'].fillna(method='ffill'), crsp['me']) -data_rawq['me'] = data_rawq['me']/1000 # CRSP ME -data_rawq['me'] = np.where(data_rawq['me'] == 0, np.nan, data_rawq['me']) -data_rawq = data_rawq.dropna(subset=['me'])","#rawa['me'] -crsp['me'] = crsp['prc'].abs() * crsp['shrout'] # calculate market equity -crsp['me'] = np.where(crsp['permno'] == crsp['permno'].shift(1), crsp['me'].fillna(method='ffill'), crsp['me']) -data_rawa['me'] = data_rawa['me']/1000 # CRSP ME -# there are some ME equal to zero since this company do not have price or shares data, we drop these observations -data_rawa['me'] = np.where(data_rawa['me'] == 0, np.nan, data_rawa['me']) -data_rawa = data_rawa.dropna(subset=['me']) -# rawq['me'] -crsp['me'] = crsp['prc'].abs() * crsp['shrout'] # calculate market equity -crsp['me'] = np.where(crsp['permno'] == crsp['permno'].shift(1), crsp['me'].fillna(method='ffill'), crsp['me']) -data_rawq['me'] = data_rawq['me']/1000 # CRSP ME -# there are some ME equal to zero since this company do not have price or shares data, we drop these observations -data_rawq['me'] = np.where(data_rawq['me'] == 0, np.nan, data_rawq['me']) -data_rawq = data_rawq.dropna(subset=['me'])",1,1,1,0,"Market equity, Me, is price times shares outstanding from CRSP. At the end of June of each year t, we sort stocks into deciles based on the June-end Me. Monthly decile returns are calculated from July of year t to June of t+1, and the deciles are rebalanced in June of t+1." -A.6.13,Dtv12,"dollar trading volume, 12-month holding period","Brennan, Chordia, and Subrahmanyam",1998,Frictions,,,,,,,"At the beginning of each month t, we sort stocks into deciles based on their average daily dollar trading volume, Dtv, over the prior six months from t-6 to t-1. 
We require a minimum of 50 daily observations. Dollar trading volume is share price times the number of shares traded. We adjust the trading volume of NASDAQ stocks per Gao and Ritter (2010) (see footnote 7). Monthly decile returns are calculated for the current month t (Dtv1), from month t to t+5 (Dtv6), and from month t to t + 11 (Dtv12), and the deciles are rebalanced at the beginning of month t + 1. The holding period longer than one month as in, for instance, Dtv6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Dtv6 decile." -A.6.21,Isff1,"idiosyncratic skewness estimated from the Fama-French 3-factor model, 1-month holding period",Harvey and Siddique,2000,Frictions,,,,,,,"At the beginning of each month t, we sort stocks into deciles based on idiosyncratic skewness, Isff, calculated as the skewness of the residuals from regressing a stock's excess return on the Fama- French three factors using daily observations from month t - 1. We require a minimum of 15 daily returns. Monthly decile returns are calculated for the current month t (Isff1), from month t to t + 5 (Isff6), and from month t to t + 11 (Isff12), and the deciles are rebalanced at the beginning of month t + 1. The holding period longer than one month as in Isff6 means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Isff6 decile." 
-A.6.22,Isq1,"idiosyncratic skewness estimated from the q-factor model, 1-month holding period",Harvey and Siddique,2000,Frictions,,,,,,,"At the beginning of each month t, we sort stocks into deciles based on idiosyncratic skewness, Isq, calculated as the skewness of the residuals from regressing a stock's excess return on the q-factors using daily observations from month t - 1. We require a minimum of 15 daily returns. Monthly decile returns are calculated for the current month t (Isq1), from month t to t + 5 (Isq6), and from month t to t + 11 (Isq12), and the deciles are rebalanced at the beginning of month t + 1. The holding period longer than one month as in Isq6 means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Isq6 decile. Because the q-factors start in January 1967, the Ivq portfolios start in February 1967." -A.6.24,Srev,short-term reversal,Jegadeesh,1990,Frictions,,,,,,,"At the beginning of each month t, we sort stocks into short-term reversal (Srev) deciles based on the return in month t - 1. To be included in a decile in month t, a stock must have a valid price at the end of month t - 2 and a valid return for month t - 1. Monthly decile returns are calculated for the current month t, and the deciles are rebalanced at the beginning of month t + 1." -A.6.3,Ivff1,"idiosyncratic volatility estimated from the Fama-French 3-factor model, 1-month holding period","Ang, Hodrick, Xing, and Zhang",2006,Frictions,,,,,,,"Following Ang, Hodrick, Xing, and Zhang (2006), we calculate idiosyncratic volatility relative to the Fama-French three-factor model, Ivff, as the residual volatility from regressing a stock's excess returns on the Fama-French three factors. At the beginning of each month t, we sort stocks into deciles based on the Ivff estimated with daily returns from month t - 1. 
We require a minimum of 15 daily returns. Monthly decile returns are calculated for the current month t (Ivff1), from month t to t+5 (Ivff6), and from month t to t+11 (Ivff12), and the deciles are rebalanced at the beginning of month t + 1. The holding period that is longer than one month as in, for instance, Ivff6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six-month period. We take the simple average of the subdecile returns as the monthly return of the Ivff6 decile." -A.6.5,Ivq1,"idiosyncratic volatility estimated from the q-factor model, 1-month holding period","Ang, Hodrick, Xing, and Zhang",2006,Frictions,,,,,,,"We calculate idiosyncratic volatility per the q-factor model, Ivq, as the residual volatility from regressing a stock's excess returns on the q-factors. At the beginning of each month t, we sort stocks into deciles based on the Ivq estimated with daily returns from month t - 1. We require a minimum of 15 daily returns. Monthly decile returns are calculated for the current month t (Ivq1), from month t to t + 5 (Ivq6), and from month t to t + 11 (Ivq12), and the deciles are rebalanced at the beginning of month t + 1. The holding period that is longer than one month as in, for instance, Ivq6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six-month period. We take the simple average of the subdecile returns as the monthly return of the Ivq6 decile. Because the q-factors start in January 1967, the Ivq portfolios start in February 1967." -A.6.6,Tv1,"total volatility, 1-month holding period","Ang, Hodrick, Xing, and Zhang",2006,Frictions,,,,,,,"Following Ang, Hodrick, Xing, and Zhang (2006), at the beginning of each month t, we sort stocks into deciles based on total volatility, Tv, estimated as the volatility of a stock's daily returns from month t - 1. 
We require a minimum of 15 daily returns. Monthly decile returns are calculated for the current month t, (Tv1), from month t to t + 5 (Tv6), and from month t to t + 11 (Tv12), and the deciles are rebalanced at the beginning of month t + 1. The holding period that is longer than one month as in, for instance, Tv6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six-month period. We take the simple average of the subdeciles returns as the monthly return of the Tv6 decile." -A.6.7,Sv1,"systematic volatility, 1-month holding period","Ang, Hodrick, Xing, and Zhang",2006,Frictions,,,,,,,p119 -A.6.8,Beta1,Market Beta,Fama and MacBeth,1973,Frictions,,,,,,,p119 -,agr,Asset growth,"Cooper, Gulen & Schill",2008,,data_rawq['agr'] = (data_rawq['atq']-data_rawq['atq_l4'])/data_rawq['atq_l4'],data_rawq['atq_l4'] = data_rawq.groupby(['permno'])['atq'].shift(4),1,0,1,0,Annual percent change in total assets (at). -,baspread,Bid-ask spread rolling 3m,Amihud & Mendelson,1989,,,,,,,,Monthly average of daily bid-ask spread divided by average of daily spread. -,beta,Beta rolling 3m,Fama & MacBeth,1973,,,,,,,,Estimated market beta from weekly returns and equal weighted market returns for 3 years ending month t-1 with at least 52 weeks of returns. -,bm_ia,Industry-adjusted book to market,"Asness, Porter & Stevens",2000,,data_rawa['bm_ia'] = data_rawa['bm']/data_rawa['bm_ind'],"df_temp = data_rawa.groupby(['datadate', 'ffi49'], as_index=False)['bm'].mean() -df_temp = df_temp.rename(columns={'bm': 'bm_ind'}) -data_rawa = pd.merge(data_rawa, df_temp, how='left', on=['datadate', 'ffi49'])",0,1,0,0,Industry adjusted book-to-market ratio. -,cash,Cash holdings,Palazzo,2012,,data_rawq['cash'] = data_rawq['cheq']/data_rawq['atq'],,0,0,1,0,Cash and cash equivalents divided by average total assets. 
-,cashdebt,Cash flow to debt,Ou & Penman,1989,,"data_rawq['cashdebt'] = (ttm4('ibq', data_rawq) + ttm4('dpq', data_rawq))/((data_rawq['ltq']+data_rawq['ltq_l4'])/2)",data_rawq['ltq_l4'] = data_rawq.groupby(['permno'])['ltq'].shift(4),1,0,1,0,Earnings before depreciation and extraordinary items (ib+dp) divided by avg. total liabilities (lt). -,chcsho,Change in shares outstanding,Pontiff & Woodgate,2008,,data_rawq['chcsho'] = (data_rawq['cshoq']/data_rawq['cshoq_l4'])-1,data_rawq['cshoq_l4'] = data_rawq.groupby(['permno'])['cshoq'].shift(4),1,0,1,0,Annual percent change in shares outstanding (csho). -,chpm(chpmia),Industry-adjusted change in profit margin,Soliman,2008,,data_rawq['chpm'] = (data_rawq['ibq4']/data_rawq['saleq4'])-(data_rawq['ibq4_l1']/data_rawq['saleq4_l1']),"data_rawq['ibq4'] = ttm4('ibq', data_rawq) -data_rawq['saleq4'] = ttm4('saleq', data_rawq) -data_rawq['saleq4'] = np.where(data_rawq['saleq4'].isnull(), data_rawq['saley'], data_rawq['saleq4']) -data_rawq['ibq4_l1'] = data_rawq.groupby(['permno'])['ibq4'].shift(1) -data_rawq['saleq4_l1'] = data_rawq.groupby(['permno'])['saleq4'].shift(1)",1,0,1,0,2-digit SIC - fiscal-year mean adjusted change in income before extraordinary items (ib) divided by sales (sale). -,chtx,Change in tax expense,Thomas & Zhang,2011,,data_rawq['chtx'] = (data_rawq['txtq']-data_rawq['txtq_l4'])/data_rawq['atq_l4'],"data_rawq['txtq_l4'] = data_rawq.groupby(['permno'])['txtq'].shift(4) -data_rawq['atq_l4'] = data_rawq.groupby(['permno'])['atq'].shift(4)",1,0,1,0,Percent change in total taxes (txtq) from quarter t-4 to t. 
-,cinvest,Corporate investment,"Titman, Wei & Xie",2004,,"* data_rawq['cinvest'] = ((data_rawq['ppentq'] - data_rawq['ppentq_l1']) / data_rawq['saleq'])\ - -(data_rawq[['c_temp1', 'c_temp2', 'c_temp3']].mean(axis=1))","data_rawq['ppentq_l1'] = data_rawq.groupby(['permno'])['ppentq'].shift(1) -data_rawq['ppentq_l2'] = data_rawq.groupby(['permno'])['ppentq'].shift(2) -data_rawq['ppentq_l3'] = data_rawq.groupby(['permno'])['ppentq'].shift(3) -data_rawq['ppentq_l4'] = data_rawq.groupby(['permno'])['ppentq'].shift(4) -data_rawq['saleq_l1'] = data_rawq.groupby(['permno'])['saleq'].shift(1) -data_rawq['saleq_l2'] = data_rawq.groupby(['permno'])['saleq'].shift(2) -data_rawq['saleq_l3'] = data_rawq.groupby(['permno'])['saleq'].shift(3) - -data_rawq['c_temp1'] = (data_rawq['ppentq_l1'] - data_rawq['ppentq_l2']) / data_rawq['saleq_l1'] -data_rawq['c_temp2'] = (data_rawq['ppentq_l2'] - data_rawq['ppentq_l3']) / data_rawq['saleq_l2'] -data_rawq['c_temp3'] = (data_rawq['ppentq_l3'] - data_rawq['ppentq_l4']) / data_rawq['saleq_l3'] - -* main formula - -data_rawq['c_temp1'] = (data_rawq['ppentq_l1'] - data_rawq['ppentq_l2']) / 0.01 -data_rawq['c_temp2'] = (data_rawq['ppentq_l2'] - data_rawq['ppentq_l3']) / 0.01 -data_rawq['c_temp3'] = (data_rawq['ppentq_l3'] - data_rawq['ppentq_l4']) / 0.01 - -data_rawq['cinvest'] = np.where(data_rawq['saleq']<=0, ((data_rawq['ppentq'] - data_rawq['ppentq_l1']) / 0.01) - -(data_rawq[['c_temp1', 'c_temp2', 'c_temp3']].mean(axis=1)), data_rawq['cinvest']) - -data_rawq = data_rawq.drop(['c_temp1', 'c_temp2', 'c_temp3'], axis=1)",1,0,1,0,"Change over one quarter in net PP&E (ppentq) divided by sales (saleq) - average of this variable for prior 3 quarters; if saleq = 0, then scale by 0.01." -,depr,Depreciation / PP&E,Holthausen & Larcker,1992,,"data_rawq['depr'] = ttm4('dpq', data_rawq)/data_rawq['ppentq']",,0,0,1,0,Depreciation divided by PP&E. 
-,dolvol,Dollar trading volume,"Chordia, Subrahmanyam & Anshuman",2001,,"crsp_mom['dolvol'] = np.log(crsp_mom['vol_l2']*crsp_mom['prc_l2']).replace([np.inf, -np.inf], np.nan)",,1,0,0,0,Natural log of trading volume times price per share from month t-2. -,gma,Gross profitability,Novy-Marx,2013,,data_rawq['gma'] = (data_rawq['revtq4']-data_rawq['cogsq4'])/data_rawq['atq_l4'],"data_rawq['revtq4'] = ttm4('revtq', data_rawq) -data_rawq['cogsq4'] = ttm4('cogsq', data_rawq) -data_rawq['atq_l4'] = data_rawq.groupby(['permno'])['atq'].shift(4)",1,0,1,0,Revenues (revt) minus cost of goods sold (cogs) divided by lagged total assets (at). -,grltnoa,Growth in long-term net operating assets,"Fairfield, Whisenant & Yohn",2003,,"data_rawq['grltnoa'] = ((data_rawq['rectq']+data_rawq['invtq']+data_rawq['ppentq']+data_rawq['acoq']+data_rawq['intanq']+ - data_rawq['aoq']-data_rawq['apq']-data_rawq['lcoq']-data_rawq['loq'])- - (data_rawq['rectq_l4']+data_rawq['invtq_l4']+data_rawq['ppentq_l4']+data_rawq['acoq_l4']-data_rawq['apq_l4']-data_rawq['lcoq_l4']-data_rawq['loq_l4'])-\ - (data_rawq['rectq']-data_rawq['rectq_l4']+data_rawq['invtq']-data_rawq['invtq_l4']+data_rawq['acoq']- - (data_rawq['apq']-data_rawq['apq_l4']+data_rawq['lcoq']-data_rawq['lcoq_l4'])- - ttm4('dpq', data_rawq)))/((data_rawq['atq']+data_rawq['atq_l4'])/2)","data_rawq['rectq_l4'] = data_rawq.groupby(['permno'])['rectq'].shift(4) -data_rawq['acoq_l4'] = data_rawq.groupby(['permno'])['acoq'].shift(4) -data_rawq['apq_l4'] = data_rawq.groupby(['permno'])['apq'].shift(4) -data_rawq['lcoq_l4'] = data_rawq.groupby(['permno'])['lcoq'].shift(4) -data_rawq['loq_l4'] = data_rawq.groupby(['permno'])['loq'].shift(4) -data_rawq['invtq_l4'] = data_rawq.groupby(['permno'])['invtq'].shift(4) -data_rawq['ppentq_l4'] = data_rawq.groupby(['permno'])['ppentq'].shift(4) -data_rawq['atq_l4'] = data_rawq.groupby(['permno'])['atq'].shift(4)",1,0,1,0,Growth in long term net operating assets. 
-,herf,Industry sales concentration,Hou & Robinson,2006,,data_rawa['herf'] = (data_rawa['sale']/data_rawa['indsale'])*(data_rawa['sale']/data_rawa['indsale']),"data_rawa['sic'] = data_rawa['sic'].astype(int) -data_rawa['ffi49'] = ffi49(data_rawa) -data_rawa['ffi49'] = data_rawa['ffi49'].fillna(49) -data_rawa['ffi49'] = data_rawa['ffi49'].astype(int) -df_temp = data_rawa.groupby(['datadate', 'ffi49'], as_index=False)['sale'].sum() -df_temp = df_temp.rename(columns={'sale': 'indsale'}) -data_rawa = pd.merge(data_rawa, df_temp, how='left', on=['datadate', 'ffi49']) -* main formula -df_temp = data_rawa.groupby(['datadate', 'ffi49'], as_index=False)['herf'].sum() -data_rawa = data_rawa.drop(['herf'], axis=1) -data_rawa = pd.merge(data_rawa, df_temp, how='left', on=['datadate', 'ffi49'])",0,1,0,0,2-digit SIC - fiscal-year sales concentration (sum of squared percent of sales in industry for each company). -,hire,Employee growth rate,"Bazdresch, Belo & Lin",2014,,"data_rawa['hire'] = (data_rawa['emp'] - data_rawa['emp_l1'])/data_rawa['emp_l1'] -data_rawa['hire'] = np.where((data_rawa['emp'].isnull()) | (data_rawa['emp_l1'].isnull()), 0, data_rawa['hire'])",data_rawa['emp_l1'] = data_rawa.groupby(['permno'])['emp'].shift(1),1,1,0,0,Percent change in number of employees (emp). -,ill,Illiquidity rolling 3m,Amihud,2002,,,,,,,,Average of daily (absolute return / dollar volume). -,lev,Leverage,Bhandari,1988,,data_rawq['lev'] = data_rawq['ltq']/data_rawq['me'],* me from rawq,0,0,1,0,Total liabilities (lt) divided by fiscal year end market capitalization. -,lgr,Growth in long-term debt,"Richardson, Sloan, Soliman & Tuna",2005,,data_rawq['lgr'] = (data_rawq['ltq']/data_rawq['ltq_l4'])-1,data_rawq['ltq_l4'] = data_rawq.groupby(['permno'])['ltq'].shift(4),1,0,1,0,Annual percent change in total liabilities (lt). -,maxret,Maximum daily returns rolling 3m,"Bali, Cakici & Whitelaw",2011,,,,,,,,Maximum daily return from returns during calendar month t-1. 
-,me_ia(mve_ia),Industry-adjusted size,"Asness, Porter & Stevens",2000,,data_rawa['me_ia'] = data_rawa['me']/data_rawa['me_ind'],"* me from rawa -df_temp = data_rawa.groupby(['datadate', 'ffi49'], as_index=False)['me'].mean() -df_temp = df_temp.rename(columns={'me': 'me_ind'}) -data_rawa = pd.merge(data_rawa, df_temp, how='left', on=['datadate', 'ffi49'])",1,1,0,0,2-digit SIC industry-adjusted fiscal year-end market capitalization. -,mom12m,Momentum rolling 12m,Jegadeesh,1990,,"crsp_mom['mom12m'] = mom(1, 12, crsp_mom)",* crsp_mom,1,0,0,0,11-month cumulative returns ending one month before month end. -,mom1m,Momentum ,Jegadeesh & Titman,1993,,crsp_mom['mom1m'] = crsp_mom['ret'],* crsp_mom,1,0,0,0,1-month cumulative return. -,mom36m,Momentum rolling 36m,Jegadeesh & Titman,1993,,"crsp_mom['mom36m'] = mom(1, 36, crsp_mom)",* crsp_mom,1,0,0,0,Cumulative returns from months t-36 to t-13. -,mom60m,Momentum rolling 60m,Jegadeesh & Titman,1993,,"crsp_mom['mom60m'] = mom(12, 60, crsp_mom)",* crsp_mom,1,0,0,0, -,mom6m,Momentum rolling 6m,Jegadeesh & Titman,1993,,"crsp_mom['mom6m'] = mom(1, 6, crsp_mom)",* crsp_mom,1,0,0,0,5-month cumulative returns ending one month before month end. 
-,nincr,Number of earnings increases,"Barth, Elliott & Finn",1999,,"data_rawq['nincr'] = (data_rawq['nincr_temp1'] - + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']) - + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']) - + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']) - + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']*data_rawq['nincr_temp5']) - + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']*data_rawq['nincr_temp5']*data_rawq['nincr_temp6']) - + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']*data_rawq['nincr_temp5']*data_rawq['nincr_temp6']*data_rawq['nincr_temp7']) - + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']*data_rawq['nincr_temp5']*data_rawq['nincr_temp6']*data_rawq['nincr_temp7']*data_rawq['nincr_temp8']))","data_rawq['ibq_l1'] = data_rawq.groupby(['permno'])['ibq'].shift(1) -data_rawq['ibq_l2'] = data_rawq.groupby(['permno'])['ibq'].shift(2) -data_rawq['ibq_l3'] = data_rawq.groupby(['permno'])['ibq'].shift(3) -data_rawq['ibq_l4'] = data_rawq.groupby(['permno'])['ibq'].shift(4) -data_rawq['ibq_l5'] = data_rawq.groupby(['permno'])['ibq'].shift(5) -data_rawq['ibq_l6'] = data_rawq.groupby(['permno'])['ibq'].shift(6) -data_rawq['ibq_l7'] = data_rawq.groupby(['permno'])['ibq'].shift(7) -data_rawq['ibq_l8'] = data_rawq.groupby(['permno'])['ibq'].shift(8) - -data_rawq['nincr_temp1'] = np.where(data_rawq['ibq'] > data_rawq['ibq_l1'], 1, 0) -data_rawq['nincr_temp2'] = np.where(data_rawq['ibq_l1'] > data_rawq['ibq_l2'], 1, 0) -data_rawq['nincr_temp3'] = np.where(data_rawq['ibq_l2'] > data_rawq['ibq_l3'], 1, 0) -data_rawq['nincr_temp4'] = np.where(data_rawq['ibq_l3'] > data_rawq['ibq_l4'], 1, 0) -data_rawq['nincr_temp5'] = np.where(data_rawq['ibq_l4'] > 
data_rawq['ibq_l5'], 1, 0) -data_rawq['nincr_temp6'] = np.where(data_rawq['ibq_l5'] > data_rawq['ibq_l6'], 1, 0) -data_rawq['nincr_temp7'] = np.where(data_rawq['ibq_l6'] > data_rawq['ibq_l7'], 1, 0) -data_rawq['nincr_temp8'] = np.where(data_rawq['ibq_l7'] > data_rawq['ibq_l8'], 1, 0) - -*main formula - -data_rawq = data_rawq.drop(['ibq_l1', 'ibq_l2', 'ibq_l3', 'ibq_l4', 'ibq_l5', 'ibq_l6', 'ibq_l7', 'ibq_l8', 'nincr_temp1', - 'nincr_temp2', 'nincr_temp3', 'nincr_temp4', 'nincr_temp5', 'nincr_temp6', 'nincr_temp7', - 'nincr_temp8'], axis=1)",1,0,1,0,Number of consecutive quarters (up to eight quarters) with an increase in earnings (ibq) over same quarter in the prior year. -,op(operprof),Operating profitability,Fama and French,2015,,,,,,,, -,pscore(ps),Performance Score,Piotroski,2000,,"data_rawa['ps'] = np.where(data_rawa['pstkrv'].isnull(), data_rawa['pstkl'], data_rawa['pstkrv']) -data_rawa['ps'] = np.where(data_rawa['ps'].isnull(), data_rawa['pstk'], data_rawa['ps']) -data_rawa['ps'] = np.where(data_rawa['ps'].isnull(), 0, data_rawa['ps'])",#(pstkrv prior to pstkl prior to pstk),0,1,0,0,Sum of 9 indicator variables to form fundamental health score. -,rd_sale,R&D to sales,"Guo, Lev & Shi",2006,,data_rawq['rd_sale'] = data_rawq['xrdq4']/data_rawq['saleq4'],"data_rawq['xrdq4'] = ttm4('xrdq', data_rawq) -data_rawq['xrdq4'] = np.where(data_rawq['xrdq4'].isnull(), data_rawq['xrdy'], data_rawq['xrdq4']) -data_rawq['saleq4'] = ttm4('saleq', data_rawq) -data_rawq['saleq4'] = np.where(data_rawq['saleq4'].isnull(), data_rawq['saley'], data_rawq['saleq4'])",0,0,1,0,R&D expense divided by sales (xrd/sale). 
-,re,Revisions in analysts’ earnings forecasts,"Chan, Jegadeesh, and Lakonishok",1996,,,,,,,, -,rsup,Revenue surprise,Kama,2009,,data_rawq['rsup'] = (data_rawq['saleq'] - data_rawq['saleq_l4'])/data_rawq['me'],data_rawq['saleq_l4'] = data_rawq.groupby(['permno'])['saleq'].shift(4),1,0,1,0,Sales from quarter t minus sales from quarter t-4 (saleq) divided by fiscal-quarter- end market capitalization (cshoq * prccq). -,rvar_capm,Residual variance - CAPM rolling 3m,Daily Stock residual variance of CAPM,,,,,,,,, -,rvar_ff3,Residual variance - ff3 rolling 3m,Daily Stock residual variance of Fama French 3 factors,,,,,,,,, -,rvar_mean,return variance rolling 3m,Daily Stock return variance,,,,,,,,, -,sgr,Sales growth,"Lakonishok, Shleifer & Vishny",1994,,data_rawq['sgr'] = (data_rawq['saleq4']/data_rawq['saleq4_l4'])-1,"data_rawq['saleq4'] = ttm4('saleq', data_rawq) -data_rawq['saleq4'] = np.where(data_rawq['saleq4'].isnull(), data_rawq['saley'], data_rawq['saleq4']) - -data_rawq['saleq4_l4'] = data_rawq.groupby(['permno'])['saleq4'].shift(4)",1,0,1,0,Annual percent change in sales (sale). -,std_dolvol,Std of dollar trading volume rolling 3m,"Chordia, Subrahmanyam & Anshuman",2001,,,,,,,,Monthly standard deviation of daily dollar trading volume. -,std_turn,Std. of Share turnover rolling 3m,"Chordia, Subrahmanyam, &Anshuman",2001,,,,,,,,Monthly standard deviation of daily share turnover. -,sue,Unexpected quarterly earnings,"Rendelman, Jones & Latane",1982,,,,,,,,"Unexpected quarterly earnings divided by fiscal-quarter-end market cap. Unexpected earnings is I/B/E/S actual earnings minus median forecasted earnings if available, else it is the seasonally differenced quarterly earnings before extraordinary items from Compustat quarterly file." -,turn,Shares turnover,"Datar, Naik & Radcliffe",1998,,,,,,,,Average monthly trading volume for most recent 3 months scaled by number of shares outstanding in current month. 
-,zerotrade,Number of zero-trading days rolling 3m,Liu,2006,,,,,,,,Turnover weighted number of zero trading days for most recent 1 month. diff --git a/README.md b/README.md deleted file mode 100755 index d1cb73f..0000000 --- a/README.md +++ /dev/null @@ -1,109 +0,0 @@ -- All in Python -- The SAS version is here [EquityCharacteristicsSAS](https://feng-cityuhk.github.io/EquityCharacteristicsSAS/) - -## Academic Background - -For financial researches, we need equity characteristics. This repository is a toolkit to calculate asset characteristics in individual equity level and portfolio level. - -## Prerequisite - -- Read the listed papers -- [WRDS](https://wrds-web.wharton.upenn.edu) account with subscription to CRSP, Compustat and IBES. -- Python - -## Files - -- [Characteristics list](https://github.com/ericma4/EquityCharacteristics/blob/master/Chars60_description.csv) - -### Main Files -- accounting.py -- most annual, quarterly and monthly frequency characteristics -- functions.py -- impute and rank functions -- merge_chars.py -- merge all the characteristics from different pickle file into one pickle file -- impute_rank_output_bchmk.py -- impute the missing values and standardize raw data -- iclink.py -- preparation for IBES -- pkl_to_csv.py -- converge the pickle file to csv - -### Single Characteristic Files -- beta.py -- 3 months rolling CAPM beta -- rvar_capm.py, rvar_ff3.py -- residual variance of CAPM and fama french 3 factors model, rolling window is 3 months -- rvar_mean.py -- variance of return, rolling window is 3 months -- abr.py -- cumulative abnormal returns around earnings announcement dates -- re.py -- revisions in analysts’ earnings forecasts -- sue.py -- unexpected quarterly earnings -- ill.py -- illiquidity, rolling window is 3 months -- maxret_d.py -- maximum daily returns, rolling window is 3 months -- std_dolvol.py -- std of dollar trading volume, rolling window is 3 months -- std_turn.py -- std of share turnover, rolling window is 3 months -- 
bid_ask_spread.py -- bid-ask spread, rolling window is 3 months -- zerotrade.py -- number of zero-trading days, rolling window is 3 months - -## How to use - -1. run accounting.py -2. run all the single characteristic files -3. run merge_chars.py -4. run impute_rank_output_bckmk.py (you may want to commen the part of sp1500 in this file if you just need the all stocks version) - -## Outputs - -### Data - -The date range is 1972 to 2019. The stock universe is top 3 exchanges (NYSE/AMEX/NASDAQ) in US. - -The currant time of data is $ret_t = chars_{t-1}$ - -1. chars_raw_no_impute.pkl (all data with original missing value) -2. chars_raw_imputed.pkl (impute missing value with industry median/mean value) -3. chars_rank_no_imputed.pkl (standardize chars_raw_no_impute.pkl) -4. chars_rank_imputed.pkl (standardize chars_raw_imputed.pkl) - -### Information Variables: - -- stock indicator: gvkey, permno -- time: datadate, date, year ('datadate' is the available time for data and 'date' is the date of return) -- industry: sic, ffi49 -- exchange info: exchcd, shrcd -- return: ret (we also provide original return and return without dividend, you can keep them by modifing impute_rank_output_bchmk.py) -- market equity: me/rank_me - -## Method - -### Equity Characteristics - -This topic is summaried by **Green Hand Zhang** and **Hou Xue Zhang**. - -### Portfolio Characteristics - -Portfolio charactaristics is the equal-weighted / value-weighted averge of the characteristics for all equities in the portfolio. - -The portfolios includes and not limited to: - -- Characteristics-sorted Portfolio, see the listed papers and also [Deep Learning in Characteristics-Sorted Factor Models](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3243683) -- DGTW Benchmark, see [DGTW 1997 JF](https://doi.org/10.1111/j.1540-6261.1997.tb02724.x) -- Industry portfolio - -## Reference - -### Papers - -Many papers contribute a lot to this repository. I am very sorry for only listing the following papers. 
-- **Measuring Mutual Fund Performance with Characteristic‐Based Benchmarks** by [DANIEL, GRINBLATT, TITMAN, WERMERS 1997 JF](https://doi.org/10.1111/j.1540-6261.1997.tb02724.x) - - [Benchmarks on Wermer's website](http://terpconnect.umd.edu/~wermers/ftpsite/Dgtw/coverpage.htm) - -- **Dissecting Anomalies with a Five-Factor Model** by [Fama and French 2015 RFS](https://doi.org/10.1093/rfs/hhv043) - - Define the characteristics of a portfolio as the value-weight averages (market-cap weights) of the variables for the firms in the portfolio - - [French's Data Library](http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/data_library.html) - -- **The Characteristics that Provide Independent Information about Average U.S. Monthly Stock Returns** by [Green Hand Zhang 2017 RFS](https://doi.org/10.1093/rfs/hhx019) - - [sas code from Green's website](https://drive.google.com/file/d/0BwwEXkCgXEdRQWZreUpKOHBXOUU/view) -- **Replicating Anormalies** by [Hou Xue Zhang 2018 RFS](https://doi.org/10.1093/rfs/hhy131) - - [Anormaly Portfolios by Zhang's website](http://global-q.org/index.html) - -### Codes - -- Calculate equity characteristics with SAS code, mainly refering to [SAS code by Green Hand Zhang](https://drive.google.com/file/d/0BwwEXkCgXEdRQWZreUpKOHBXOUU/view). 
-- Portfolio characteristics, mainly refering to [WRDS Financial Ratios Suite](https://wrds-www.wharton.upenn.edu/pages/support/research-wrds/sample-programs/wrds-sample-programs/wrds-financial-ratios-suite/) and [Variable Definition](https://wrds-www.wharton.upenn.edu/documents/793/WRDS_Industry_Financial_Ratio_Manual.pdf) -- DGTW code refers to [this python code](https://wrds-www.wharton.upenn.edu/pages/support/applications/python-replications/characteristic-based-benchmarks-daniel-grinblatt-titman-and-wermers-1997-python-version/) or [this SAS code](https://wrds-www.wharton.upenn.edu/pages/support/applications/portfolio-construction-and-market-anomalies/characteristic-based-benchmarks-daniel-grinblatt-titman-and-wermers-1997/) - -**All comments are welcome.** - diff --git a/char60/.DS_Store b/char60/.DS_Store deleted file mode 100755 index 07e0263348c876a177854d50dcae966bb406376e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK%}xR_5S}U`x@h3$MYB&pG$x)`f(Oq&z`FdYadFK8Mm?B4>pS=gKA$K2X4+sE z%GD6lA?@t8^UcrKLMaic_OyRO)F7f78tcj~I*0H)wk^m@*mD zGcl^@(Jd@_=8TS=zL_c8cL4i@o~7CeaL#(^931ak**Ae20COYxaMT2E4+*0yRZ`t_Zxp?KQ5`tE^Ur40-SL-8kOW z%8mgyYalqYS!#jSzV{4x20R0E2Ke_OL}MKpU1%*o9q8l=0A0bb2%h;b1!F>^Bcltg zF#}N=3az2aTrre}V>~o=k?>z&afieS|`nAF5 z|HbP2e;MT8JOiGAf5m{RbgnvWlw{7= '01/01/1959' - """) - -comp['datadate'] = pd.to_datetime(comp['datadate']) - -print('='*10, 'comp data is ready', '='*10) -################### -# CCM Block # -################### -ccm = conn.raw_sql(""" - select gvkey, lpermno as permno, linktype, linkprim, - linkdt, linkenddt - from crsp.ccmxpf_linktable - where linktype in ('LU', 'LC') - """) - -ccm['linkdt'] = pd.to_datetime(ccm['linkdt']) -ccm['linkenddt'] = pd.to_datetime(ccm['linkenddt']) - -# if linkenddt is missing then set to today date -ccm['linkenddt'] = ccm['linkenddt'].fillna(pd.to_datetime('today')) - -ccm1 = pd.merge(comp, ccm, how='left', on=['gvkey']) -# extract month and year of rdq 
-ccm1['rdq'] = pd.to_datetime(ccm1['rdq']) - -# set link date bounds -ccm2 = ccm1[(ccm1['datadate']>=ccm1['linkdt']) & (ccm1['datadate']<=ccm1['linkenddt'])] -ccm2 = ccm2[['gvkey', 'datadate', 'rdq', 'fyearq', 'fqtr', 'permno']] - -################### -# CRSP Block # -################### - -# Report Date of Quarterly Earnings (rdq) may not be trading day, we need to get the first trading day on or after rdq -crsp_dsi = conn.raw_sql(""" - select distinct date - from crsp.dsi - where date >= '01/01/1959' - """) - -crsp_dsi['date'] = pd.to_datetime(crsp_dsi['date']) - -for i in range(6): # we only consider the condition that the day after rdq is not a trading day, which is up to 5 days - ccm2['trad_%s' % i] = ccm2['rdq'] + pd.DateOffset(days=i) # set rdq + i days to match trading day - crsp_dsi['trad_%s' % i] = crsp_dsi['date'] # set the merging key - crsp_dsi = crsp_dsi[['date', 'trad_%s' % i]] # reset trading day columns to avoid repeat merge - comp_temp = pd.merge(ccm2, crsp_dsi, how='left', on='trad_%s' % i) - comp_temp['trad_%s' % i] = comp_temp['date'] # reset rdq + i days to matched trading day - -# fill NA from rdq + 5 days to rdq + 0 days, then get trading day version of rdq -for i in range(5, 0, -1): - count = i-1 - comp_temp['trad_%s' % count] = np.where(comp_temp['trad_%s' % count].isnull(), - comp_temp['trad_%s' % i], comp_temp['trad_%s' % count]) - comp_temp['rdq_trad'] = comp_temp['trad_%s' % count] - -comp_temp = comp_temp[['gvkey', 'permno', 'datadate', 'fyearq', 'fqtr', 'rdq', 'rdq_trad']] - -print('='*10, 'crsp block is ready', '='*10) -############################# -# CRSP abnormal return # -############################# -crsp_d = conn.raw_sql(""" - select a.prc, a.ret, a.shrout, a.vol, a.cfacpr, a.cfacshr, a.permno, a.permco, a.date, - b.siccd, b.ncusip, b.shrcd, b.exchcd - from crsp.dsf as a - left join crsp.dsenames as b - on a.permno=b.permno - and b.namedt<=a.date - and a.date<=b.nameendt - where a.date >= '01/01/1959' - and b.exchcd between 1 
and 3 - and b.shrcd in (10,11) - """) - -# change variable format to int -crsp_d[['permco', 'permno', 'shrcd', 'exchcd']] = crsp_d[['permco', 'permno', 'shrcd', 'exchcd']].astype(int) - -print('='*10, 'crsp abnormal return is ready', '='*10) - -# convert the date format -crsp_d['date'] = pd.to_datetime(crsp_d['date']) - -# add delisting return -dlret = conn.raw_sql(""" - select permno, dlret, dlstdt - from crsp.dsedelist - where dlstdt >= '01/01/1959' - """) - -dlret.permno = dlret.permno.astype(int) -dlret['dlstdt'] = pd.to_datetime(dlret['dlstdt']) - -crsp_d = pd.merge(crsp_d, dlret, how='left', left_on=['permno', 'date'], right_on=['permno', 'dlstdt']) -# return adjusted for delisting -crsp_d['retadj'] = np.where(crsp_d['dlret'].notna(), (crsp_d['ret'] + 1)*(crsp_d['dlret'] + 1) - 1, crsp_d['ret']) -crsp_d['meq'] = crsp_d['prc'].abs()*crsp_d['shrout'] # market value of equity -crsp_d = crsp_d.sort_values(by=['date', 'permno', 'meq']) - -# sprtrn -crspsp500d = conn.raw_sql(""" - select date, sprtrn - from crsp.dsi - where date >= '01/01/1959' - """) - -crspsp500d['date'] = pd.to_datetime(crspsp500d['date']) - -# abnormal return -crsp_d = pd.merge(crsp_d, crspsp500d, how='left', on='date') -crsp_d['abrd'] = crsp_d['retadj'] - crsp_d['sprtrn'] -crsp_d = crsp_d[['date', 'permno', 'ret', 'retadj', 'sprtrn', 'abrd']] - -# date count regarding to rdq -comp_temp['minus10d'] = comp_temp['rdq_trad'] - pd.Timedelta(days=10) -comp_temp['plus5d'] = comp_temp['rdq_trad'] + pd.Timedelta(days=5) - -# df = sqldf("""select a.*, b.date, b.abrd -# from comp_temp a left join crsp_d b -# on a.permno=b.permno -# and a.minus10d<=b.date -# and b.date<=a.plus5d -# order by a.permno, a.rdq_trad, b.date;""", globals()) - -sql = sqlite3.connect(':memory:') -comp_temp.to_sql('comp_temp', sql, index=False) -crsp_d.to_sql('crsp_d', sql, index=False) - -qry = """select a.*, b.date, b.abrd - from comp_temp a left join crsp_d b - on a.permno=b.permno - and a.minus10d<=b.date - and 
b.date<=a.plus5d - order by a.permno, a.rdq_trad, b.date;""" -df = pd.read_sql_query(qry, sql) -df.drop(['plus5d', 'minus10d'], axis=1, inplace=True) - -# delete missing return -df = df[df['abrd'].notna()] - -# count -df.sort_values(by=['permno', 'rdq_trad', 'date'], inplace=True) -condlist = [df['date']==df['rdq_trad'], - df['date']>df['rdq_trad'], - df['date']=0] -df_after['count'] = df_after.groupby(['permno', 'rdq_trad'])['date'].cumcount() - -df = pd.concat([df_before, df_after]) - -# calculate abr as the group sum -df = df[(df['count']>=-2) & (df['count']<=1)] - -df_temp = df.groupby(['permno', 'rdq_trad'])['abrd'].sum() -df_temp = pd.DataFrame(df_temp) -df_temp.reset_index(inplace=True) -df_temp.rename(columns={'abrd': 'abr'}, inplace=True) -df = pd.merge(df, df_temp, how='left', on=['permno', 'rdq_trad'], copy=False) # add abr back to df -df = df[df['count']==1] -df.rename(columns={'date': 'rdq_plus_1d'}, inplace=True) -df = df[['gvkey', 'permno', 'datadate', 'rdq', 'rdq_plus_1d', 'abr']] - -print('='*10, 'start populate', '='*10) - -# populate the quarterly abr to monthly -crsp_msf = conn.raw_sql(""" - select distinct date - from crsp.msf - where date >= '01/01/1959' - """) - -df['datadate'] = pd.to_datetime(df['datadate']) -df['plus12m'] = df['datadate'] + np.timedelta64(12, 'M') -df['plus12m'] = df['plus12m'] + MonthEnd(0) - -# df = sqldf("""select a.*, b.date -# from df a left join crsp_msf b -# on a.rdq_plus_1d < b.date -# and a.plus12m >= b.date -# order by a.permno, b.date, a.datadate desc;""", globals()) - -df.to_sql('df', sql, index=False) -crsp_msf.to_sql('crsp_msf', sql, index=False) - -qry = """select a.*, b.date - from df a left join crsp_msf b - on a.rdq_plus_1d < b.date - and a.plus12m >= b.date - order by a.permno, b.date, a.datadate desc;""" - -df = pd.read_sql_query(qry, sql) - -df = df.drop_duplicates(['permno', 'date']) -df['datadate'] = pd.to_datetime(df['datadate']) -df['rdq'] = pd.to_datetime(df['rdq']) -df['rdq_plus_1d'] = 
pd.to_datetime(df['rdq_plus_1d']) -df = df[['gvkey', 'permno', 'datadate', 'rdq', 'rdq_plus_1d', 'abr', 'date']] - -with open('abr.pkl', 'wb') as f: - pkl.dump(df, f) \ No newline at end of file diff --git a/char60/accounting_100.py b/char60/accounting_100.py deleted file mode 100644 index cf88aa6..0000000 --- a/char60/accounting_100.py +++ /dev/null @@ -1,1643 +0,0 @@ -import pandas as pd -import numpy as np -import datetime as dt -import wrds -from dateutil.relativedelta import * -from pandas.tseries.offsets import * -import pickle as pkl -from functions import * - -################### -# Connect to WRDS # -################### -conn = wrds.Connection() - -####################################################################################################################### -# TTM functions # -####################################################################################################################### - - -def ttm4(series, df): - """ - - :param series: variables' name - :param df: dataframe - :return: ttm4 - """ - lag = pd.DataFrame() - for i in range(1, 4): - lag['%(series)s%(lag)s' % {'series': series, 'lag': i}] = df.groupby('gvkey')['%s' % series].shift(i) - result = df['%s' % series] + lag['%s1' % series] + lag['%s2' % series] + lag['%s3' % series] - return result - - -def ttm12(series, df): - """ - - :param series: variables' name - :param df: dataframe - :return: ttm12 - """ - lag = pd.DataFrame() - for i in range(1, 12): - lag['%(series)s%(lag)s' % {'series': series, 'lag': i}] = df.groupby('permno')['%s' % series].shift(i) - result = df['%s' % series] + lag['%s1' % series] + lag['%s2' % series] + lag['%s3' % series] +\ - lag['%s4' % series] + lag['%s5' % series] + lag['%s6' % series] + lag['%s7' % series] +\ - lag['%s8' % series] + lag['%s9' % series] + lag['%s10' % series] + lag['%s11' % series] - return result - -print('TTM') 
-####################################################################################################################### -# Compustat Block # -####################################################################################################################### -comp = conn.raw_sql(""" - /*header info*/ - select c.gvkey, f.cusip, f.datadate, f.fyear, c.cik, substr(c.sic,1,2) as sic2, c.sic, c.naics, - - /*firm variables*/ - /*income statement*/ - f.sale, f.revt, f.cogs, f.xsga, f.dp, f.xrd, f.xad, f.ib, f.ebitda, - f.ebit, f.nopi, f.spi, f.pi, f.txp, f.ni, f.txfed, f.txfo, f.txt, f.xint, f.xpp, f.xacc, - - /*CF statement and others*/ - f.capx, f.oancf, f.dvt, f.ob, f.gdwlia, f.gdwlip, f.gwo, f.mib, f.oiadp, f.ivao, f.ivst, - - /*assets*/ - f.rect, f.act, f.che, f.ppegt, f.invt, f.at, f.aco, f.intan, f.ao, f.ppent, f.gdwl, f.fatb, f.fatl, - - /*liabilities*/ - f.lct, f.dlc, f.dltt, f.lt, f.dm, f.dcvt, f.cshrc, - f.dcpstk, f.pstk, f.ap, f.lco, f.lo, f.drc, f.drlt, f.txdi, f.dltis, f.dltr, f.dlcch, - - /*equity and other*/ - f.ceq, f.scstkc, f.emp, f.csho, f.seq, f.txditc, f.pstkrv, f.pstkl, f.np, f.txdc, - f.dpc, f.ajex, f.tstkp, f.oibdp, f.capxv, f.dvpa, f.epspx, - - /*market*/ - abs(f.prcc_f) as prcc_f, abs(f.prcc_c) as prcc_c, f.dvc, f.prstkc, f.sstk, f.fopt, f.wcap - - from comp.funda as f - left join comp.company as c - on f.gvkey = c.gvkey - - /*get consolidated, standardized, industrial format statements*/ - where f.indfmt = 'INDL' - and f.datafmt = 'STD' - and f.popsrc = 'D' - and f.consol = 'C' - and f.datadate >= '01/01/1959' - """) - -# convert datadate to date fmt -comp['datadate'] = pd.to_datetime(comp['datadate']) - -# sort and clean up -comp = comp.sort_values(by=['gvkey', 'datadate']).drop_duplicates() - -# clean up csho -comp['csho'] = np.where(comp['csho'] == 0, np.nan, comp['csho']) - -# calculate Compustat market equity -comp['mve_f'] = comp['csho'] * comp['prcc_f'] - -# do some clean up. 
several variables have lots of missing values -condlist = [comp['drc'].notna() & comp['drlt'].notna(), - comp['drc'].notna() & comp['drlt'].isnull(), - comp['drlt'].notna() & comp['drc'].isnull()] -choicelist = [comp['drc']+comp['drlt'], - comp['drc'], - comp['drlt']] -comp['dr'] = np.select(condlist, choicelist, default=np.nan) - -condlist = [comp['dcvt'].isnull() & comp['dcpstk'].notna() & comp['pstk'].notna() & comp['dcpstk'] > comp['pstk'], - comp['dcvt'].isnull() & comp['dcpstk'].notna() & comp['pstk'].isnull()] -choicelist = [comp['dcpstk']-comp['pstk'], - comp['dcpstk']] -comp['dc'] = np.select(condlist, choicelist, default=np.nan) -comp['dc'] = np.where(comp['dc'].isnull(), comp['dcvt'], comp['dc']) - -comp['xint0'] = np.where(comp['xint'].isnull(), 0, comp['xint']) -comp['xsga0'] = np.where(comp['xsga'].isnull, 0, 0) - -comp['ceq'] = np.where(comp['ceq'] == 0, np.nan, comp['ceq']) -comp['at'] = np.where(comp['at'] == 0, np.nan, comp['at']) -comp = comp.dropna(subset=['at']) -print('compustat') -####################################################################################################################### -# CRSP Block # -####################################################################################################################### -# Create a CRSP Subsample with Monthly Stock and Event Variables -# Restrictions will be applied later -# Select variables from the CRSP monthly stock and event datasets -crsp = conn.raw_sql(""" - select a.prc, a.ret, a.retx, a.shrout, a.vol, a.cfacpr, a.cfacshr, a.date, a.permno, a.permco, - b.ticker, b.ncusip, b.shrcd, b.exchcd - from crsp.msf as a - left join crsp.msenames as b - on a.permno=b.permno - and b.namedt<=a.date - and a.date<=b.nameendt - where a.date >= '01/01/1959' - and b.exchcd between 1 and 3 - """) - -# change variable format to int -crsp[['permco', 'permno', 'shrcd', 'exchcd']] = crsp[['permco', 'permno', 'shrcd', 'exchcd']].astype(int) - -# Line up date to be end of month -crsp['date'] = 
pd.to_datetime(crsp['date']) -crsp['monthend'] = crsp['date'] + MonthEnd(0) # set all the date to the standard end date of month - -crsp = crsp.dropna(subset=['prc']) -crsp['me'] = crsp['prc'].abs() * crsp['shrout'] # calculate market equity - -# if Market Equity is Nan then let return equals to 0 -crsp['ret'] = np.where(crsp['me'].isnull(), 0, crsp['ret']) -crsp['retx'] = np.where(crsp['me'].isnull(), 0, crsp['retx']) - -# impute me -crsp = crsp.sort_values(by=['permno', 'date']).drop_duplicates() -crsp['me'] = np.where(crsp['permno'] == crsp['permno'].shift(1), crsp['me'].fillna(method='ffill'), crsp['me']) - -# Aggregate Market Cap -''' -There are cases when the same firm (permco) has two or more securities (permno) at same date. -For the purpose of ME for the firm, we aggregated all ME for a given permco, date. -This aggregated ME will be assigned to the permno with the largest ME. -''' -# sum of me across different permno belonging to same permco a given date -crsp_summe = crsp.groupby(['monthend', 'permco'])['me'].sum().reset_index() -# largest mktcap within a permco/date -crsp_maxme = crsp.groupby(['monthend', 'permco'])['me'].max().reset_index() -# join by monthend/maxme to find the permno -crsp1 = pd.merge(crsp, crsp_maxme, how='inner', on=['monthend', 'permco', 'me']) -# drop me column and replace with the sum me -crsp1 = crsp1.drop(['me'], axis=1) -# join with sum of me to get the correct market cap info -crsp2 = pd.merge(crsp1, crsp_summe, how='inner', on=['monthend', 'permco']) -# sort by permno and date and also drop duplicates -crsp2 = crsp2.sort_values(by=['permno', 'monthend']).drop_duplicates() -print('crsp') -####################################################################################################################### -# CCM Block # -####################################################################################################################### -# merge CRSP and Compustat -# reference: 
https://wrds-www.wharton.upenn.edu/pages/support/applications/linking-databases/linking-crsp-and-compustat/ -ccm = conn.raw_sql(""" - select gvkey, lpermno as permno, linktype, linkprim, - linkdt, linkenddt - from crsp.ccmxpf_linktable - where substr(linktype,1,1)='L' - and (linkprim ='C' or linkprim='P') - """) - -ccm['linkdt'] = pd.to_datetime(ccm['linkdt']) -ccm['linkenddt'] = pd.to_datetime(ccm['linkenddt']) - -# if linkenddt is missing then set to today date -ccm['linkenddt'] = ccm['linkenddt'].fillna(pd.to_datetime('today')) - -# merge ccm and comp -ccm1 = pd.merge(comp, ccm, how='left', on=['gvkey']) - -# we can only get the accounting data after the firm public their report -# for annual data, we use 5 or 6 months lagged data -ccm1['yearend'] = ccm1['datadate'] + YearEnd(0) -ccm1['jdate'] = ccm1['datadate'] + MonthEnd(4) - -# set link date bounds -ccm2 = ccm1[(ccm1['jdate'] >= ccm1['linkdt']) & (ccm1['jdate'] <= ccm1['linkenddt'])] - -# link comp and crsp -crsp2 = crsp2.rename(columns={'monthend': 'jdate'}) -data_rawa = pd.merge(crsp2, ccm2, how='inner', on=['permno', 'jdate']) - -# filter exchcd & shrcd -data_rawa = data_rawa[((data_rawa['exchcd'] == 1) | (data_rawa['exchcd'] == 2) | (data_rawa['exchcd'] == 3)) & - ((data_rawa['shrcd'] == 10) | (data_rawa['shrcd'] == 11))] - -# process Market Equity -''' -Note: me is CRSP market equity, mve_f is Compustat market equity. Please choose the me below. 
-''' -data_rawa['me'] = data_rawa['me']/1000 # CRSP ME -# data_rawa['me'] = data_rawa['mve_f'] # Compustat ME - -# there are some ME equal to zero since this company do not have price or shares data, we drop these observations -data_rawa['me'] = np.where(data_rawa['me'] == 0, np.nan, data_rawa['me']) -data_rawa = data_rawa.dropna(subset=['me']) - -# count single stock years -# data_rawa['count'] = data_rawa.groupby(['gvkey']).cumcount() - -# deal with the duplicates -data_rawa.loc[data_rawa.groupby(['datadate', 'permno', 'linkprim'], as_index=False).nth([0]).index, 'temp'] = 1 -data_rawa = data_rawa[data_rawa['temp'].notna()] -data_rawa.loc[data_rawa.groupby(['permno', 'yearend', 'datadate'], as_index=False).nth([-1]).index, 'temp'] = 1 -data_rawa = data_rawa[data_rawa['temp'].notna()] - -data_rawa = data_rawa.sort_values(by=['permno', 'jdate']) -print('ccm') -####################################################################################################################### -# Annual Variables # -####################################################################################################################### -# stockholders' equity -data_rawa['se'] = np.where(data_rawa['seq'].isnull(), data_rawa['ceq']+data_rawa['pstk'], data_rawa['seq']) -data_rawa['se'] = np.where(data_rawa['se'].isnull(), data_rawa['at']-data_rawa['lt'], data_rawa['se']) - -data_rawa['txditc'] = data_rawa['txditc'].fillna(0) - -# preferrerd stock -data_rawa['ps'] = np.where(data_rawa['pstkrv'].isnull(), data_rawa['pstkl'], data_rawa['pstkrv']) -data_rawa['ps'] = np.where(data_rawa['ps'].isnull(), data_rawa['pstk'], data_rawa['ps']) -data_rawa['ps'] = np.where(data_rawa['ps'].isnull(), 0, data_rawa['ps']) - -# book equity -data_rawa['be'] = data_rawa['seq'] + data_rawa['txditc'] - data_rawa['ps'] -data_rawa['be'] = np.where(data_rawa['be'] > 0, data_rawa['be'], np.nan) - -# acc -data_rawa['act_l1'] = data_rawa.groupby(['permno'])['act'].shift(1) -data_rawa['lct_l1'] = 
data_rawa.groupby(['permno'])['lct'].shift(1) - -condlist = [data_rawa['np'].isnull(), - data_rawa['act'].isnull() | data_rawa['lct'].isnull()] -choicelist = [((data_rawa['act']-data_rawa['lct'])-(data_rawa['act_l1']-data_rawa['lct_l1'])/(10*data_rawa['be'])), - (data_rawa['ib']-data_rawa['oancf'])/(10*data_rawa['be'])] -data_rawa['acc'] = np.select(condlist, - choicelist, - default=((data_rawa['act']-data_rawa['lct']+data_rawa['np'])- - (data_rawa['act_l1']-data_rawa['lct_l1']+data_rawa['np'].shift(1)))/(10*data_rawa['be'])) - -# agr -data_rawa['at_l1'] = data_rawa.groupby(['permno'])['at'].shift(1) -data_rawa['agr'] = (data_rawa['at']-data_rawa['at_l1'])/data_rawa['at_l1'] - -# bm -# data_rawa['bm'] = data_rawa['be'] / data_rawa['me'] - -# cfp -# condlist = [data_rawa['dp'].isnull(), -# data_rawa['ib'].isnull()] -# choicelist = [data_rawa['ib']/data_rawa['me'], -# np.nan] -# data_rawa['cfp'] = np.select(condlist, choicelist, default=(data_rawa['ib']+data_rawa['dp'])/data_rawa['me']) - -# ep, checked from Hou and change 'ME' from compustat to crsp -#data_rawa['ep'] = data_rawa['ib']/data_rawa['me'] -#data_rawa['ep_n'] = data_rawa['ib'] - -# ni -data_rawa['csho_l1'] = data_rawa.groupby(['permno'])['csho'].shift(1) -data_rawa['ajex_l1'] = data_rawa.groupby(['permno'])['ajex'].shift(1) -data_rawa['ni'] = np.where(data_rawa['gvkey'] != data_rawa['gvkey'].shift(1), - np.nan, - np.log(data_rawa['csho']*data_rawa['ajex']).replace(-np.inf, 0)- - np.log(data_rawa['csho_l1']*data_rawa['ajex_l1']).replace(-np.inf, 0)) - -# op: the formula seems different from Hou Page 74? 
-data_rawa['cogs0'] = np.where(data_rawa['cogs'].isnull(), 0, data_rawa['cogs']) -data_rawa['xint0'] = np.where(data_rawa['xint'].isnull(), 0, data_rawa['xint']) -data_rawa['xsga0'] = np.where(data_rawa['xsga'].isnull(), 0, data_rawa['xsga']) - -condlist = [data_rawa['revt'].isnull(), data_rawa['be'].isnull()] -choicelist = [np.nan, np.nan] -data_rawa['op'] = np.select(condlist, choicelist, - default=(data_rawa['revt'] - data_rawa['cogs0'] - data_rawa['xsga0'] - data_rawa['xint0'])/data_rawa['be']) - - - -# rsup -data_rawa['sale_l1'] = data_rawa.groupby(['permno'])['sale'].shift(1) -# data_rawa['rsup'] = (data_rawa['sale']-data_rawa['sale_l1'])/data_rawa['me'] - -# cash -data_rawa['cash'] = data_rawa['che']/data_rawa['at'] - -# lev -# data_rawa['lev'] = data_rawa['lt']/data_rawa['me'] - -# sp, checked -# data_rawa['sp'] = data_rawa['sale']/data_rawa['me'] -#data_rawa['sp_n'] = data_rawa['sale'] - -# rd_sale -data_rawa['rd_sale'] = data_rawa['xrd']/data_rawa['sale'] - -# rdm -# data_rawa['rdm'] = data_rawa['xrd']/data_rawa['me'] - -# adm hxz adm, checked -# data_rawa['adm'] = data_rawa['xad']/data_rawa['me'] - -# gma -data_rawa['gma'] = (data_rawa['revt']-data_rawa['cogs'])/data_rawa['at_l1'] - -# chcsho -data_rawa['chcsho'] = (data_rawa['csho']/data_rawa['csho_l1'])-1 - -# lgr -data_rawa['lt_l1'] = data_rawa.groupby(['permno'])['lt'].shift(1) -data_rawa['lgr'] = (data_rawa['lt']/data_rawa['lt_l1'])-1 - -# pctacc -data_rawa['che_l1'] = data_rawa.groupby(['permno'])['che'].shift(1) -data_rawa['dlc_l1'] = data_rawa.groupby(['permno'])['dlc'].shift(1) -data_rawa['txp_l1'] = data_rawa.groupby(['permno'])['txp'].shift(1) - -condlist = [data_rawa['ib']==0, - data_rawa['oancf'].isnull(), - data_rawa['oancf'].isnull() & data_rawa['ib']==0] -choicelist = [(data_rawa['ib']-data_rawa['oancf'])/0.01, - ((data_rawa['act'] - data_rawa['act_l1']) - (data_rawa['che'] - data_rawa['che_l1']))- - ((data_rawa['lct'] - data_rawa['lct_l1']) - (data_rawa['dlc']) - data_rawa['dlc_l1']- - 
((data_rawa['txp'] - data_rawa['txp_l1']) - data_rawa['dp']))/data_rawa['ib'].abs(), - ((data_rawa['act'] - data_rawa['act_l1']) - (data_rawa['che'] - data_rawa['che_l1'])) - - ((data_rawa['lct'] - data_rawa['lct_l1']) - (data_rawa['dlc']) - data_rawa['dlc_l1'] - - ((data_rawa['txp'] - data_rawa['txp_l1']) - data_rawa['dp']))] -data_rawa['pctacc'] = np.select(condlist, choicelist, default=(data_rawa['ib']-data_rawa['oancf'])/data_rawa['ib'].abs()) - -# sgr -data_rawa['sgr'] = (data_rawa['sale']/data_rawa['sale_l1'])-1 - -# chato -data_rawa['at_l2'] = data_rawa.groupby(['permno'])['at'].shift(2) -data_rawa['chato'] = (data_rawa['sale']/((data_rawa['at']+data_rawa['at_l1'])/2))-\ - (data_rawa['sale_l1']/((data_rawa['at']+data_rawa['at_l2'])/2)) - -# chtx -data_rawa['txt_l1'] = data_rawa.groupby(['permno'])['txt'].shift(1) -data_rawa['chtx'] = (data_rawa['txt']-data_rawa['txt_l1'])/data_rawa['at_l1'] - -# noa,checked -data_rawa['noa'] = ((data_rawa['at']-data_rawa['che']-data_rawa['ivao'].fillna(0))- - (data_rawa['at']-data_rawa['dlc'].fillna(0)-data_rawa['dltt'].fillna(0)-data_rawa['mib'].fillna(0) - -data_rawa['pstk'].fillna(0)-data_rawa['ceq'])/data_rawa['at_l1']) - -# rna -data_rawa['noa_l1'] = data_rawa.groupby(['permno'])['noa'].shift(1) -data_rawa['rna'] = data_rawa['oiadp']/data_rawa['noa_l1'] - -# pm -data_rawa['pm'] = data_rawa['oiadp']/data_rawa['sale'] - -# ato -data_rawa['ato'] = data_rawa['sale']/data_rawa['noa_l1'] - -# depr -data_rawa['depr'] = data_rawa['dp']/data_rawa['ppent'] - -# invest -data_rawa['ppent_l1'] = data_rawa.groupby(['permno'])['ppent'].shift(1) -data_rawa['invt_l1'] = data_rawa.groupby(['permno'])['invt'].shift(1) - -data_rawa['invest'] = np.where(data_rawa['ppegt'].isnull(), ((data_rawa['ppent']-data_rawa['ppent_l1'])+ - (data_rawa['invt']-data_rawa['invt_l1']))/data_rawa['at_l1'], - ((data_rawa['ppegt']-data_rawa['ppent_l1'])+(data_rawa['invt']-data_rawa['invt_l1']))/data_rawa['at_l1']) - -# egr -data_rawa['ceq_l1'] = 
data_rawa.groupby(['permno'])['ceq'].shift(1) -data_rawa['egr'] = ((data_rawa['ceq']-data_rawa['ceq_l1'])/data_rawa['ceq_l1']) - -# cashdebt -data_rawa['cashdebt'] = (data_rawa['ib']+data_rawa['dp'])/((data_rawa['lt']+data_rawa['lt_l1'])/2) - -# rd -# if ((xrd/at)-(lag(xrd/lag(at))))/(lag(xrd/lag(at))) >.05 then rd=1 else rd=0 -data_rawa['xrd/at_l1'] = data_rawa['xrd']/data_rawa['at_l1'] -data_rawa['xrd/at_l1_l1'] = data_rawa.groupby(['permno'])['xrd/at_l1'].shift(1) -data_rawa['rd'] = np.where(((data_rawa['xrd']/data_rawa['at'])- - (data_rawa['xrd/at_l1_l1']))/data_rawa['xrd/at_l1_l1']>0.05, 1, 0) - -# roa -data_rawa['roa'] = data_rawa['ni']/((data_rawa['at']+data_rawa['at_l1'])/2) - -# roe -data_rawa['roe'] = data_rawa['ib']/data_rawa['ceq_l1'] - -# dy -# data_rawa['dy'] = data_rawa['dvt']/data_rawa['me'] - -################## Added on 2020.07.28 ################## - -# roic -data_rawa['roic'] = (data_rawa['ebit'] - data_rawa['nopi'])/(data_rawa['ceq'] + data_rawa['lt'] - data_rawa['che']) - -# chinv -data_rawa['chinv'] = (data_rawa['invt'] - data_rawa['invt_l1'])/((data_rawa['at'] + data_rawa['at_l2'])/2) - -# pchsale_pchinvt -data_rawa['pchsale_pchinvt'] = ((data_rawa['sale'] - data_rawa['sale_l1'])/data_rawa['sale_l1'])\ - - ((data_rawa['invt']-data_rawa['invt_l1'])/data_rawa['invt_l1']) - -# pchsale_pchrect -data_rawa['rect_l1'] = data_rawa.groupby(['permno'])['rect'].shift(1) -data_rawa['pchsale_pchrect'] = ((data_rawa['sale']-data_rawa['sale_l1'])/data_rawa['sale_l1'])\ - - ((data_rawa['rect']-data_rawa['rect_l1'])/data_rawa['rect_l1']) - -# pchgm_pchsale -data_rawa['cogs_l1'] = data_rawa.groupby(['permno'])['cogs'].shift(1) -data_rawa['pchgm_pchsale'] = (((data_rawa['sale']-data_rawa['cogs']) - - (data_rawa['sale_l1']-data_rawa['cogs_l1']))/(data_rawa['sale_l1']-data_rawa['cogs_l1']))\ - - ((data_rawa['sale']-data_rawa['sale_l1'])/data_rawa['sale']) - -# pchsale_pchxsga -data_rawa['xsga_l1'] = data_rawa.groupby(['permno'])['xsga'].shift(1) 
-data_rawa['pchsale_pchxsga'] = ((data_rawa['sale']-data_rawa['sale_l1'])/data_rawa['sale_l1'])\ - - ((data_rawa['xsga']-data_rawa['xsga_l1'])/data_rawa['xsga_l1']) - -# pchdepr -data_rawa['dp_l1'] = data_rawa.groupby(['permno'])['dp'].shift(1) -data_rawa['pchdepr'] = ((data_rawa['dp']/data_rawa['ppent'])-(data_rawa['dp_l1'] - /data_rawa['ppent_l1']))\ - / (data_rawa['dp_l1']/data_rawa['ppent']) - -# chadv -data_rawa['xad_l1'] = data_rawa.groupby(['permno'])['xad'].shift(1) -data_rawa['chadv'] = np.log(data_rawa['xad'] + 1) - np.log(data_rawa['xad_l1'] + 1) - -# pchcapx -data_rawa['capx_l1'] = data_rawa.groupby(['permno'])['capx'].shift(1) -data_rawa['pchcapx'] = (data_rawa['capx']-data_rawa['capx_l1'])/data_rawa['capx_l1'] - -# grcapx -data_rawa['capx_l2'] = data_rawa.groupby(['permno'])['capx'].shift(2) -data_rawa['grcapx'] = (data_rawa['capx']-data_rawa['capx_l2'])/data_rawa['capx_l2'] - -# grGW -data_rawa['gdwl_l1'] = data_rawa.groupby(['permno'])['gdwl'].shift(1) -data_rawa['grGW'] = (data_rawa['gdwl']-data_rawa['gdwl_l1'])/data_rawa['gdwl'] -condlist = [(data_rawa['gdwl']==0) | (data_rawa['gdwl'].isnull()), - (data_rawa['gdwl'].notna()) & (data_rawa['gdwl'] != 0) & (data_rawa['grGW'].isnull())] -choicelist = [0, 1] -data_rawa['grGW'] = np.select(condlist, choicelist, default=data_rawa['grGW']) - -# currat -data_rawa['currat'] = data_rawa['act']/data_rawa['lct'] - -# pchcurrat -data_rawa['pchcurrat'] = ((data_rawa['act']/data_rawa['lct'])-(data_rawa['act_l1']/data_rawa['lct_l1']))\ - /(data_rawa['act_l1']/data_rawa['lct_l1']) - -# quick -data_rawa['quick'] = (data_rawa['act']-data_rawa['invt'])/data_rawa['lct'] - -# pchquick -data_rawa['pchquick'] = ((data_rawa['act']-data_rawa['invt'])/data_rawa['lct'] - -(data_rawa['act_l1']-data_rawa['invt_l1'])/data_rawa['lct_l1'])\ - /((data_rawa['act_l1']-data_rawa['invt_l1'])/data_rawa['lct_l1']) - -# salecash -data_rawa['salecash'] = data_rawa['sale']/data_rawa['che'] - -# salerec -data_rawa['salerec']= 
data_rawa['sale']/data_rawa['rect'] - -# saleinv -data_rawa['saleinv'] = data_rawa['sale']/data_rawa['invt'] - -# pchsaleinv -data_rawa['pchsaleinv'] = ((data_rawa['sale']/data_rawa['invt'])-(data_rawa['sale_l1']/data_rawa['invt_l1']))\ - /(data_rawa['sale_l1']/data_rawa['invt_l1']) - -# realestate -data_rawa['realestate'] = (data_rawa['fatb']+data_rawa['fatl'])/data_rawa['ppegt'] -data_rawa['realestate'] = np.where(data_rawa['ppegt'].isnull(), - (data_rawa['fatb']+data_rawa['fatl'])/data_rawa['ppent'], data_rawa['realestate']) - -# obklg -data_rawa['obklg'] = data_rawa['ob']/((data_rawa['at']+data_rawa['at_l1'])/2) - -# chobklg -data_rawa['ob_l1'] = data_rawa.groupby(['permno'])['ob'].shift(1) -data_rawa['chobklg'] = (data_rawa['ob'] - data_rawa['ob_l1'])/((data_rawa['at']+data_rawa['at_l1'])/2) - -# grltnoa -data_rawa['aco_l1'] = data_rawa.groupby(['permno'])['aco'].shift(1) -data_rawa['intan_l1'] = data_rawa.groupby(['permno'])['intan'].shift(1) -data_rawa['ao_l1'] = data_rawa.groupby(['permno'])['ao'].shift(1) -data_rawa['ap_l1'] = data_rawa.groupby(['permno'])['ap'].shift(1) -data_rawa['lco_l1'] = data_rawa.groupby(['permno'])['lco'].shift(1) -data_rawa['lo_l1'] = data_rawa.groupby(['permno'])['lo'].shift(1) -data_rawa['rect_l1'] = data_rawa.groupby(['permno'])['rect'].shift(1) - -data_rawa['grltnoa'] = ((data_rawa['rect']+data_rawa['invt']+data_rawa['ppent']+data_rawa['aco']+data_rawa['intan']+ - data_rawa['ao']-data_rawa['ap']-data_rawa['lco']-data_rawa['lo']) - -(data_rawa['rect_l1']+data_rawa['invt_l1']+data_rawa['ppent_l1']+data_rawa['aco_l1'] - +data_rawa['intan_l1']+data_rawa['ao_l1']-data_rawa['ap_l1']-data_rawa['lco_l1'] - -data_rawa['lo_l1']) - -(data_rawa['rect']-data_rawa['rect_l1']+data_rawa['invt']-data_rawa['invt_l1'] - +data_rawa['aco']-data_rawa['aco_l1'] - -(data_rawa['ap']-data_rawa['ap_l1']+data_rawa['lco']-data_rawa['lco_l1'])-data_rawa['dp']))\ - /((data_rawa['at']+data_rawa['at_l1'])/2) - -# conv -data_rawa['conv'] = 
data_rawa['dc']/data_rawa['dltt'] - -# chdrc -data_rawa['dr_l1'] = data_rawa.groupby(['permno'])['dr'].shift(1) -data_rawa['chdrc'] = (data_rawa['dr']-data_rawa['dr_l1'])/((data_rawa['at']+data_rawa['at_l1'])/2) - -# rdbias -data_rawa['xrd_l1'] = data_rawa.groupby(['permno'])['xrd'].shift(1) -data_rawa['rdbias'] = (data_rawa['xrd']/data_rawa['xrd_l1'])-1-data_rawa['ib']/data_rawa['ceq_l1'] - -# operprof -data_rawa['operprof'] = (data_rawa['revt']-data_rawa['cogs']-data_rawa['xsga0']-data_rawa['xint0'])/data_rawa['ceq_l1'] - -# cfroa -data_rawa['cfroa'] = data_rawa['oancf']/((data_rawa['at']+data_rawa['at_l1'])/2) -data_rawa['cfroa'] = np.where(data_rawa['oancf'].isnull(), - (data_rawa['ib'] + data_rawa['dp'])/((data_rawa['at']+data_rawa['at_l1'])/2), - data_rawa['cfroa']) - -# xrdint -data_rawa['xrdint'] = data_rawa['xrd']/((data_rawa['at']+data_rawa['at_l1'])/2) - -# capxint -data_rawa['capxint'] = data_rawa['capx']/((data_rawa['at']+data_rawa['at_l1'])/2) - -# xadint -data_rawa['xadint'] = data_rawa['xad']/((data_rawa['at']+data_rawa['at_l1'])/2) - -# chpm -data_rawa['ib_l1'] = data_rawa.groupby(['permno'])['ib'].shift(1) -data_rawa['chpm'] = (data_rawa['ib']/data_rawa['sale'])-(data_rawa['ib_l1']/data_rawa['sale_l1']) - -# ala -data_rawa['ala'] = data_rawa['che']+0.75*(data_rawa['act']-data_rawa['che'])-\ - 0.5*(data_rawa['at']-data_rawa['act']-data_rawa['gdwl']-data_rawa['intan']) - -# alm -data_rawa['alm'] = data_rawa['ala']/(data_rawa['at']+data_rawa['prcc_f']*data_rawa['csho']-data_rawa['ceq']) - -# hire -data_rawa['emp_l1'] = data_rawa.groupby(['permno'])['emp'].shift(1) -data_rawa['hire'] = (data_rawa['emp'] - data_rawa['emp_l1'])/data_rawa['emp_l1'] -data_rawa['hire'] = np.where((data_rawa['emp'].isnull()) | (data_rawa['emp_l1'].isnull()), 0, data_rawa['hire']) - -# herf -data_rawa['sic'] = data_rawa['sic'].astype(int) -data_rawa['ffi49'] = ffi49(data_rawa) -data_rawa['ffi49'] = data_rawa['ffi49'].fillna(49) -data_rawa['ffi49'] = 
data_rawa['ffi49'].astype(int) -df_temp = data_rawa.groupby(['datadate', 'ffi49'], as_index=False)['sale'].sum() -df_temp = df_temp.rename(columns={'sale': 'indsale'}) -data_rawa = pd.merge(data_rawa, df_temp, how='left', on=['datadate', 'ffi49']) -data_rawa['herf'] = (data_rawa['sale']/data_rawa['indsale'])*(data_rawa['sale']/data_rawa['indsale']) -df_temp = data_rawa.groupby(['datadate', 'ffi49'], as_index=False)['herf'].sum() -data_rawa = data_rawa.drop(['herf'], axis=1) -data_rawa = pd.merge(data_rawa, df_temp, how='left', on=['datadate', 'ffi49']) - -################################## Added on 2020.10.29 ################################## -# Bmj -data_rawa['be_per'] = data_rawa['be'] / data_rawa['csho'] -data_rawa['bmj'] = data_rawa['be_per'] / data_rawa['prc'] -############### *Q*: used prc as share price from crsp ########## - -# Cp -data_rawa['cf'] = data_rawa['ib'] + data_rawa['dp'] -#data_rawa['cp'] = data_rawa['cf'] / data_rawa['me'] - -# Dp -###### *Q* difference return with without divident - -# Dur -# me = data_rawa['me_comp'] - - -# Ebp -data_rawa['dvpa'] = np.where(data_rawa['dvpa'].isnull(), 0, data_rawa['dvpa']) -data_rawa['tstkp'] = np.where(data_rawa['tstkp'].isnull(), 0, data_rawa['tstkp']) -data_rawa['f_liab'] = data_rawa['dltt'] + data_rawa['dlc'] + data_rawa['pstk'] + data_rawa['dvpa'] - data_rawa['tstkp'] -data_rawa['f_asse'] = data_rawa['che'] -# net debt : = financial liabilities - financial assets. 
-data_rawa['n_debt'] = data_rawa['f_liab'] - data_rawa['f_asse'] -data_rawa['ber'] = data_rawa['ceq'] + data_rawa['tstkp'] - data_rawa['dvpa'] -#data_rawa['ebp'] = (data_rawa['n_debt']+data_rawa['ber']) / (data_rawa['n_debt']+data_rawa['me']) - - -# Em -#data_rawa['enteprs_v'] = data_rawa['me'] + data_rawa['dlc'] + data_rawa['dltt'] + data_rawa['pstkrv'] - data_rawa['che'] -#data_rawa['em'] = data_rawa['enteprs_v'] / data_rawa['oibdp'] - -############### Investment ############### -# Aci -data_rawa['ce'] = data_rawa['capx'] / data_rawa['sale'] -data_rawa['ce1'] = data_rawa['ce'].shift(1) -data_rawa['ce2'] = data_rawa['ce'].shift(2) -data_rawa['ce3'] = data_rawa['ce'].shift(3) -data_rawa['aci'] = data_rawa['ce']/ (data_rawa['ce1']+data_rawa['ce2']+data_rawa['ce3'])-1 - -# Cei -#data_rawa['lg_me'] = np.log(data_rawa['me']/data_rawa['me'].shift(6)) -#data_rawa['lg_ret'] = np.log(data_rawa['ret']*data_rawa['ret'].shift(1)*data_rawa['ret'].shift(2)*data_rawa['ret'].shift(3)*data_rawa['ret'].shift(5)*data_rawa['ret'].shift(6)) -#data_rawa['cei'] = data_rawa['lg_me'] - data_rawa['lg_ret'] - - -# Dac - - - -# dCoa -data_rawa['coa'] = data_rawa['act'] - data_rawa['che'] -data_rawa['dcoa'] = (data_rawa['coa']-data_rawa['coa'].shift(1)) / data_rawa['at'].shift(1) - - -# dBe -data_rawa['dBe'] = (data_rawa['ceq'] - data_rawa['ceq'].shift(1)) / data_rawa['at'].shift(1) - - -# dFnl & dFin -data_rawa['fna'] = data_rawa['ivst'] + data_rawa['ivao'] -data_rawa['fnl'] = data_rawa['dltt'] + data_rawa['dlc'] + data_rawa['pstk'] - -data_rawa['d_dlc'] = data_rawa['dlc'] - data_rawa['dlc'].shift(1) -data_rawa['d_dlc'] = np.where(data_rawa['d_dlc'].isnull(), 0, data_rawa['d_dlc']) -data_rawa['d_pstk'] = data_rawa['pstk'] - data_rawa['pstk'].shift(1) -data_rawa['d_pstk'] = np.where(data_rawa['d_pstk'].isnull(), 0, data_rawa['d_pstk']) - -data_rawa['dfnl'] = (data_rawa['dltt']-data_rawa['dltt'].shift(1)) + data_rawa['d_dlc'] + data_rawa['d_pstk'] - -data_rawa['d_ivst'] = data_rawa['ivst'] - 
data_rawa['ivst'].shift(1) -data_rawa['d_ivst'] = np.where(data_rawa['d_ivst'].isnull(), 0, data_rawa['d_ivst']) -data_rawa['d_ivao'] = data_rawa['ivao'] - data_rawa['ivao'].shift(1) -data_rawa['d_ivao'] = np.where(data_rawa['d_ivao'].isnull(), 0, data_rawa['d_ivao']) - -data_rawa['dfna'] = data_rawa['d_ivst'] + data_rawa['d_ivao'] -data_rawa['dfin'] = data_rawa['dfna'] - data_rawa['dfnl'] - -data_rawa['dfin'] = data_rawa['dfin'] / data_rawa['at'].shift(1) -data_rawa['dfnl'] = data_rawa['dfnl'] / data_rawa['at'].shift(1) - - - - -# dIi -data_rawa['e_invt'] = (data_rawa['capxv'] + data_rawa['capxv'].shift(1))/2 -data_rawa['dinvt'] = (data_rawa['capxv'] - data_rawa['e_invt']) / data_rawa['e_invt'] - -data_rawa['ind'] = data_rawa['capxv'] -s = data_rawa.groupby(['jdate', 'sic2'])['ind'].sum() -data_rawa = pd.merge(data_rawa, s, on=['jdate', 'sic2']) -# new industry investment will be named as ind_y, cause it's been grouped by ind -data_rawa['e_ind'] = (data_rawa['ind_y'] + data_rawa['ind_y'].shift(1))/2 -data_rawa['dind'] = (data_rawa['ind_y']-data_rawa['e_ind']) / data_rawa['e_ind'] -data_rawa['dIi'] = data_rawa['dinvt'] - data_rawa['dind'] - -# dLno -data_rawa['dlno'] = (data_rawa['ppent']-data_rawa['ppent'].shift(1)) + (data_rawa['intan']-data_rawa['intan'].shift(1)) + (data_rawa['ao']-data_rawa['ao'].shift(1)) - (data_rawa['lo']-data_rawa['lo'].shift(1)) + data_rawa['dp'] -avg_at = [] -for i in range(data_rawa.shape[0]): - avg_at.append(data_rawa.loc[0:i, 'at'].mean()) -data_rawa['avg_at'] = pd.DataFrame(avg_at) -data_rawa['dlno'] = data_rawa['dlno'] / data_rawa['avg_at'] - - -# dNco -data_rawa['nca'] = data_rawa['at'] - data_rawa['act'] - data_rawa['ivao'] -data_rawa['ncl'] = data_rawa['lt'] - data_rawa['lct'] - data_rawa['dltt'] -data_rawa['nco'] = data_rawa['nca'] - data_rawa['ncl'] -data_rawa['dnco'] = data_rawa['nco'] - data_rawa['nco'].shift(1) - - -# dNca -data_rawa['ivao_0'] = np.where(data_rawa['ivao'].isnull(), 0, data_rawa['ivao']) -data_rawa['dltt_0'] 
= np.where(data_rawa['dltt'].isnull(), 0, data_rawa['dltt']) - -data_rawa['nca'] = data_rawa['at'] - data_rawa['act'] - data_rawa['ivao_0'] -data_rawa['ncl'] = data_rawa['lt'] - data_rawa['lct'] - data_rawa['dltt_0'] -data_rawa['nco'] = data_rawa['nca'] - data_rawa['ncl'] -data_rawa['dnca'] = data_rawa['nco'] - data_rawa['nco'].shift(1) - - - -# dNoa -data_rawa['dlc_0'] = np.where(data_rawa['dlc'].isnull(), 0, data_rawa['dlc']) -data_rawa['mib_0'] = np.where(data_rawa['mib'].isnull(), 0, data_rawa['mib']) -data_rawa['pstk_0'] = np.where(data_rawa['pstk'].isnull(), 0, data_rawa['pstk']) - -data_rawa['op_at'] = data_rawa['at'] - data_rawa['che'] -data_rawa['op_lia'] = data_rawa['at'] - data_rawa['dlc_0'] - data_rawa['dltt_0'] - data_rawa['mib_0'] - data_rawa['pstk_0'] - data_rawa['ceq'] -data_rawa['net_op'] = data_rawa['op_at'] - data_rawa['op_lia'] -data_rawa['dnoa'] = (data_rawa['net_op']-data_rawa['net_op'].shift(1))/ data_rawa['at'].shift(1) - - -# dPia -data_rawa['c_propty'] = data_rawa['ppegt'] - data_rawa['ppegt'].shift(1) -data_rawa['c_invt'] = data_rawa['invt'] - data_rawa['invt'].shift(1) -data_rawa['dpia'] = (data_rawa['c_propty'] + data_rawa['c_invt']) / data_rawa['at'].shift(1) - - - - - -######### Profitability ########## -# Ato,repeated -#data_rawa['op_at'] = data_rawa['at'] - data_rawa['che'] - data_rawa['ivao_0'] -#data_rawa['op_lia'] = data_rawa['dlc_0'] - data_rawa['dltt_0'] - data_rawa['mib_0'] - data_rawa['pstk_0'] - data_rawa['ceq'] -#data_rawa['noa'] = data_rawa['op_at'] - data_rawa['op_lia'] -#data_rawa['ato'] = data_rawa['sale'] / data_rawa['noa'].shift(1) - - -# Cla -data_rawa['d_rect'] = data_rawa['rect'] - data_rawa['rect'].shift(1) -data_rawa['d_invt'] = data_rawa['invt'] - data_rawa['invt'].shift(1) -data_rawa['d_xpp'] = data_rawa['xpp'] - data_rawa['xpp'].shift(1) -data_rawa['d_dr'] = (data_rawa['drc']-data_rawa['drc'].shift(1)) + (data_rawa['drlt']-data_rawa['drlt'].shift(1)) -data_rawa['d_ap'] = data_rawa['ap'] - 
data_rawa['ap'].shift(1) -data_rawa['d_xacc'] = data_rawa['xacc'] - data_rawa['xacc'].shift(1) - -data_rawa['xrd_0'] = np.where(data_rawa['xrd'].isnull(), 0, data_rawa['xrd']) -data_rawa['d_rect_0'] = np.where(data_rawa['d_rect'].isnull(), 0, data_rawa['d_rect']) -data_rawa['d_invt_0'] = np.where(data_rawa['d_invt'].isnull(), 0, data_rawa['d_invt']) -data_rawa['d_xpp_0'] = np.where(data_rawa['d_xpp'].isnull(), 0, data_rawa['d_xpp']) -data_rawa['d_dr_0'] = np.where(data_rawa['d_dr'].isnull(), 0, data_rawa['d_dr']) -data_rawa['d_ap_0'] = np.where(data_rawa['d_ap'].isnull(), 0, data_rawa['d_ap']) -data_rawa['d_xacc_0'] = np.where(data_rawa['d_xacc'].isnull(), 0, data_rawa['d_xacc']) - -data_rawa['cla'] = data_rawa['revt'] - data_rawa['cogs'] - data_rawa['xsga'] + data_rawa['xrd_0']\ - - data_rawa['d_rect_0'] - data_rawa['d_invt_0'] - data_rawa['d_xpp_0']\ - + data_rawa['d_dr_0'] + data_rawa['d_ap_0'] + data_rawa['d_xacc_0'] -data_rawa['cla'] = data_rawa['cla'] / data_rawa['at'].shift(1) - - -# Cop -data_rawa['cop'] = data_rawa['revt'] - data_rawa['cogs'] - data_rawa['xsga'] + data_rawa['xrd_0']\ - - data_rawa['d_rect_0'] - data_rawa['d_invt_0'] - data_rawa['d_xpp_0']\ - + data_rawa['d_dr_0'] + data_rawa['d_ap_0'] + data_rawa['d_xacc_0'] -data_rawa['cop'] = data_rawa['cop'] / data_rawa['at'] - - -# Cto -data_rawa['cto'] = data_rawa['sale'] / data_rawa['at'].shift(1) - -#ir -''' -#First calculate r(t-5,t). 
Then rb(t-5,t) and use Bm to perform linear regression and get residue -''' -#r(t-5,t):sum ret from t-5 to t (which is calendar year t-6 to t-1) -lag = pd.DataFrame() -for i in range(1,6): - lag['ret%s' % i] = data_rawa.groupby(['permno'])['ret'].shift(i) - -data_rawa['ret5'] = lag['ret1']+lag['ret2']+lag['ret3']+lag['ret4']+lag['ret5'] - -#bm_t-5 (bm of year t-5) -#data_rawa['bm5'] = data_rawa.groupby(['permno'])['bm'].shift(5) - -#rB (five year log book return) -#Reference: jf_06 page8 by KENT DANIEL -#data_rawa['rB'] = data_rawa['bm'] - data_rawa['bm5'] + data_rawa['ret5'] - -#Regression and get ir -#First get unique datelist -#datelist = data_rawa['jdate'].unique() -#for date in datelist: -# temp = data_rawa['jdate' == date] -# n_row = temp.shape[0] -# index = temp.index -# X = pd.DataFrame() -# X['bm5'] = temp['bm5'] -# X['rB'] = temp['rB'] -# X['intercept'] = 1 -# X = X[['intercept','rB','bm5']] -# X = np.mat(X) -# Y = np.mat(temp[['ret5']]) - #These are residuals on one date -# res = (np.identity(n_row) - X.dot(X.T.dot(X).I).dot(X.T)).dot(Y) -# #put residuals back into data_rawa -# data_rawa.loc[index,'ir'] = res - -#nop -#data_rawa['net_p'] = data_rawa['dvc'] + data_rawa['prstkc'] + 2*data_rawa['pstkrv'] - data_rawa['sstk'] -#data_rawa['nop'] = data_rawa['net_p'] / data_rawa['me'] -#data_rawa['nop'] = np.where(data_rawa['nop']<=0, np.nan, data_rawa['nop'] ) - -#ocp -#data_rawa['ocy'] = np.where(data_rawa['jdate'] < '1988-06-30', data_rawa['fopt'] - data_rawa['wcap'], data_rawa['fopt'] - data_rawa['oancf']) -#data_rawa['ocp'] = data_rawa['ocy'] / data_rawa['me'] -#data_rawa['ocp'] = np.where(data_rawa['ocp']<=0, np.nan, data_rawa['ocp'] ) - -#dwc -data_rawa['dwc'] = (data_rawa['act'] - data_rawa['che']) - (data_rawa['lct'] - data_rawa['dlc']) -#data_rawa['dwc'] = data_rawa['dwc']/data_rawa['at_l1'] - -#I/A -data_rawa['ia'] = (data_rawa['at']/data_rawa['at_l1'])-1 - -#Ig -data_rawa['capx_l1'] = data_rawa.groupby('permno')['capx'].shift(1) -data_rawa['ig'] = 
data_rawa['capx']/data_rawa['capx_l1'] - -#2Ig -data_rawa['capx_l2'] = data_rawa.groupby('permno')['capx'].shift(2) -data_rawa['2ig'] = data_rawa['capx']/data_rawa['capx_l2'] - -#Ivc -data_rawa['atAvg'] = (data_rawa['at']+data_rawa['at_l1'])/2 -data_rawa['ivc'] = data_rawa['invt'] / data_rawa['atAvg'] - -#Ndf -data_rawa['ndf'] = data_rawa['dltis'] - data_rawa['dltr'] + data_rawa['dlcch'] - -#nsi -data_rawa['sps'] = data_rawa['csho'] * data_rawa['ajex'] -data_rawa['sps_l1'] = data_rawa.groupby('permno')['sps'].shift(1) -data_rawa['nsi'] = np.log(data_rawa['sps']/data_rawa['sps_l1']) - -#oa -data_rawa['txp'] = np.where(data_rawa['txp'].isnull(), 0, data_rawa['txp']) -data_rawa['oa'] = (data_rawa['act'] - data_rawa['che']) - (data_rawa['lct'] - data_rawa['dlc'] - data_rawa['txp']) - data_rawa['dp'] - -#Poa -data_rawa['poa'] = data_rawa['oa']/data_rawa['ni'] - -#Ta -data_rawa['ta'] = data_rawa['dwc'] + data_rawa['dnco'] + data_rawa['dfin'] - -#Ol -data_rawa['ol'] = (data_rawa['cogs'] + data_rawa['xsga'])/data_rawa['at'] - -#etr -data_rawa['txtpi'] = data_rawa['txt'] / data_rawa['pi'] -data_rawa['txtpi_l1'] = data_rawa.groupby('permno')['txtpi'].shift(1) -data_rawa['txtpi_l2'] = data_rawa.groupby('permno')['txtpi'].shift(2) -data_rawa['txtpi_l3'] = data_rawa.groupby('permno')['txtpi'].shift(3) -data_rawa['deps'] = data_rawa['epspx']/(data_rawa['ajex'] * data_rawa['prcc_f']) -data_rawa['etr'] = (data_rawa['txtpi'] - (data_rawa['txtpi_l1'] + data_rawa['txtpi_l2'] + data_rawa['txtpi_l3'])/3) * data_rawa['deps'] - -print('annual') -####################################################################################################################### -# Compustat Quarterly Raw Info # -####################################################################################################################### -comp = conn.raw_sql(""" - /*header info*/ - select c.gvkey, f.cusip, f.datadate, f.fyearq, substr(c.sic,1,2) as sic2, c.sic, f.fqtr, f.rdq, - - /*income statement*/ - f.ibq, 
f.saleq, f.txtq, f.revtq, f.cogsq, f.xsgaq, f.revty, f.cogsy, f.saley, - - /*balance sheet items*/ - f.atq, f.actq, f.cheq, f.lctq, f.dlcq, f.ppentq, f.ppegtq, - - /*others*/ - abs(f.prccq) as prccq, abs(f.prccq)*f.cshoq as mveq_f, f.ceqq, f.seqq, f.pstkq, f.ltq, - f.pstkrq, f.gdwlq, f.intanq, f.mibq, f.oiadpq, f.ivaoq, - - /* v3 my formula add*/ - f.ajexq, f.cshoq, f.txditcq, f.npq, f.xrdy, f.xrdq, f.dpq, f.xintq, f.invtq, f.scstkcy, f.niq, - f.oancfy, f.dlttq, f.rectq, f.acoq, f.apq, f.lcoq, f.loq, f.aoq - - from comp.fundq as f - left join comp.company as c - on f.gvkey = c.gvkey - - /*get consolidated, standardized, industrial format statements*/ - where f.indfmt = 'INDL' - and f.datafmt = 'STD' - and f.popsrc = 'D' - and f.consol = 'C' - and f.datadate >= '01/01/1959' - """) - -# comp['cusip6'] = comp['cusip'].str.strip().str[0:6] -comp = comp.dropna(subset=['ibq']) - -# sort and clean up -comp = comp.sort_values(by=['gvkey', 'datadate']).drop_duplicates() -comp['cshoq'] = np.where(comp['cshoq'] == 0, np.nan, comp['cshoq']) -comp['ceqq'] = np.where(comp['ceqq'] == 0, np.nan, comp['ceqq']) -comp['atq'] = np.where(comp['atq'] == 0, np.nan, comp['atq']) -comp = comp.dropna(subset=['atq']) - -# convert datadate to date fmt -comp['datadate'] = pd.to_datetime(comp['datadate']) - -# merge ccm and comp -ccm1 = pd.merge(comp, ccm, how='left', on=['gvkey']) -ccm1['yearend'] = ccm1['datadate'] + YearEnd(0) -ccm1['jdate'] = ccm1['datadate'] + MonthEnd(3) # we change quarterly lag here -# ccm1['jdate'] = ccm1['datadate']+MonthEnd(4) - -# set link date bounds -ccm2 = ccm1[(ccm1['jdate'] >= ccm1['linkdt']) & (ccm1['jdate'] <= ccm1['linkenddt'])] - -# merge ccm2 and crsp2 -# crsp2['jdate'] = crsp2['monthend'] -data_rawq = pd.merge(crsp2, ccm2, how='inner', on=['permno', 'jdate']) - -# filter exchcd & shrcd -data_rawq = data_rawq[((data_rawq['exchcd'] == 1) | (data_rawq['exchcd'] == 2) | (data_rawq['exchcd'] == 3)) & - ((data_rawq['shrcd'] == 10) | (data_rawq['shrcd'] == 11))] 
- -# process Market Equity -''' -Note: me is CRSP market equity, mveq_f is Compustat market equity. Please choose the me below. -''' -data_rawq['me'] = data_rawq['me']/1000 # CRSP ME -# data_rawq['me'] = data_rawq['mveq_f'] # Compustat ME - -# there are some ME equal to zero since this company do not have price or shares data, we drop these observations -data_rawq['me'] = np.where(data_rawq['me'] == 0, np.nan, data_rawq['me']) -data_rawq = data_rawq.dropna(subset=['me']) - -# count single stock years -# data_rawq['count'] = data_rawq.groupby(['gvkey']).cumcount() - -# deal with the duplicates -data_rawq.loc[data_rawq.groupby(['datadate', 'permno', 'linkprim'], as_index=False).nth([0]).index, 'temp'] = 1 -data_rawq = data_rawq[data_rawq['temp'].notna()] -data_rawq.loc[data_rawq.groupby(['permno', 'yearend', 'datadate'], as_index=False).nth([-1]).index, 'temp'] = 1 -data_rawq = data_rawq[data_rawq['temp'].notna()] - -data_rawq = data_rawq.sort_values(by=['permno', 'jdate']) -print('quarterly raw') -####################################################################################################################### -# Quarterly Variables # -####################################################################################################################### -# prepare be -data_rawq['beq'] = np.where(data_rawq['seqq']>0, data_rawq['seqq']+data_rawq['txditcq']-data_rawq['pstkq'], np.nan) -data_rawq['beq'] = np.where(data_rawq['beq']<=0, np.nan, data_rawq['beq']) - -# dy -# data_rawq['me_l1'] = data_rawq.groupby(['permno'])['me'].shift(1) -# data_rawq['retdy'] = data_rawq['ret'] - data_rawq['retx'] -# data_rawq['mdivpay'] = data_rawq['retdy']*data_rawq['me_l1'] -# -# data_rawq['dy'] = ttm12(series='mdivpay', df=data_rawq)/data_rawq['me'] - -# chtx -data_rawq['txtq_l4'] = data_rawq.groupby(['permno'])['txtq'].shift(4) -data_rawq['atq_l4'] = data_rawq.groupby(['permno'])['atq'].shift(4) -data_rawq['chtx'] = (data_rawq['txtq']-data_rawq['txtq_l4'])/data_rawq['atq_l4'] - 
-# roa -data_rawq['atq_l1'] = data_rawq.groupby(['permno'])['atq'].shift(1) -data_rawq['roa'] = data_rawq['ibq']/data_rawq['atq_l1'] - -# cash -data_rawq['cash'] = data_rawq['cheq']/data_rawq['atq'] - -# acc -data_rawq['actq_l4'] = data_rawq.groupby(['permno'])['actq'].shift(4) -data_rawq['lctq_l4'] = data_rawq.groupby(['permno'])['lctq'].shift(4) -data_rawq['npq_l4'] = data_rawq.groupby(['permno'])['npq'].shift(4) -condlist = [data_rawq['npq'].isnull(), - data_rawq['actq'].isnull() | data_rawq['lctq'].isnull()] -choicelist = [((data_rawq['actq']-data_rawq['lctq'])-(data_rawq['actq_l4']-data_rawq['lctq_l4']))/(10*data_rawq['beq']), - np.nan] -data_rawq['acc'] = np.select(condlist, choicelist, - default=((data_rawq['actq']-data_rawq['lctq']+data_rawq['npq'])- - (data_rawq['actq_l4']-data_rawq['lctq_l4']+data_rawq['npq_l4']))/(10*data_rawq['beq'])) - -# bm -# data_rawq['bm'] = data_rawq['beq']/data_rawq['me'] - -# cfp -data_rawq['ibq4'] = ttm4('ibq', data_rawq) -data_rawq['dpq4'] = ttm4('dpq', data_rawq) -# data_rawq['cfp'] = np.where(data_rawq['dpq'].isnull(), -# data_rawq['ibq4']/data_rawq['me'], -# (data_rawq['ibq4']+data_rawq['dpq4'])/data_rawq['me']) - -# ep -# data_rawq['ep'] = data_rawq['ibq4']/data_rawq['me'] - -# agr -data_rawq['agr'] = (data_rawq['atq']-data_rawq['atq_l4'])/data_rawq['atq_l4'] - -# ni -data_rawq['cshoq_l4'] = data_rawq.groupby(['permno'])['cshoq'].shift(4) -data_rawq['ajexq_l4'] = data_rawq.groupby(['permno'])['ajexq'].shift(4) -data_rawq['ni'] = np.where(data_rawq['cshoq'].isnull(), np.nan, - np.log(data_rawq['cshoq']*data_rawq['ajexq']).replace(-np.inf, 0)-np.log(data_rawq['cshoq_l4']*data_rawq['ajexq_l4'])) - -# op -data_rawq['xintq0'] = np.where(data_rawq['xintq'].isnull(), 0, data_rawq['xintq']) -data_rawq['xsgaq0'] = np.where(data_rawq['xsgaq'].isnull(), 0, data_rawq['xsgaq']) -data_rawq['beq_l4'] = data_rawq.groupby(['permno'])['beq'].shift(4) - -data_rawq['op'] = (ttm4('revtq', data_rawq)-ttm4('cogsq', data_rawq)-ttm4('xsgaq0', 
data_rawq)-ttm4('xintq0', data_rawq))/data_rawq['beq_l4'] - -# csho -data_rawq['chcsho'] = (data_rawq['cshoq']/data_rawq['cshoq_l4'])-1 - -# cashdebt -data_rawq['ltq_l4'] = data_rawq.groupby(['permno'])['ltq'].shift(4) -data_rawq['cashdebt'] = (ttm4('ibq', data_rawq) + ttm4('dpq', data_rawq))/((data_rawq['ltq']+data_rawq['ltq_l4'])/2) - -# rd -data_rawq['xrdq4'] = ttm4('xrdq', data_rawq) -data_rawq['xrdq4'] = np.where(data_rawq['xrdq4'].isnull(), data_rawq['xrdy'], data_rawq['xrdq4']) - -data_rawq['xrdq4/atq_l4'] = data_rawq['xrdq4']/data_rawq['atq_l4'] -data_rawq['xrdq4/atq_l4_l4'] = data_rawq.groupby(['permno'])['xrdq4/atq_l4'].shift(4) -data_rawq['rd'] = np.where(((data_rawq['xrdq4']/data_rawq['atq'])-data_rawq['xrdq4/atq_l4_l4'])/data_rawq['xrdq4/atq_l4_l4']>0.05, 1, 0) - -# pctacc -condlist = [data_rawq['npq'].isnull(), - data_rawq['actq'].isnull() | data_rawq['lctq'].isnull()] -choicelist = [((data_rawq['actq']-data_rawq['lctq'])-(data_rawq['actq_l4']-data_rawq['lctq_l4']))/abs(ttm4('ibq', data_rawq)), np.nan] -data_rawq['pctacc'] = np.select(condlist, choicelist, - default=((data_rawq['actq']-data_rawq['lctq']+data_rawq['npq'])-(data_rawq['actq_l4']-data_rawq['lctq_l4']+data_rawq['npq_l4']))/ - abs(ttm4('ibq', data_rawq))) - -# gma -data_rawq['revtq4'] = ttm4('revtq', data_rawq) -data_rawq['cogsq4'] = ttm4('cogsq', data_rawq) -data_rawq['gma'] = (data_rawq['revtq4']-data_rawq['cogsq4'])/data_rawq['atq_l4'] - -# lev -# data_rawq['lev'] = data_rawq['ltq']/data_rawq['me'] - -# rdm -# data_rawq['rdm'] = data_rawq['xrdq4']/data_rawq['me'] - -# sgr -data_rawq['saleq4'] = ttm4('saleq', data_rawq) -data_rawq['saleq4'] = np.where(data_rawq['saleq4'].isnull(), data_rawq['saley'], data_rawq['saleq4']) - -data_rawq['saleq4_l4'] = data_rawq.groupby(['permno'])['saleq4'].shift(4) -data_rawq['sgr'] = (data_rawq['saleq4']/data_rawq['saleq4_l4'])-1 - -# sp -# data_rawq['sp'] = data_rawq['saleq4']/data_rawq['me'] - -# invest -data_rawq['ppentq_l4'] = 
data_rawq.groupby(['permno'])['ppentq'].shift(4) -data_rawq['invtq_l4'] = data_rawq.groupby(['permno'])['invtq'].shift(4) -data_rawq['ppegtq_l4'] = data_rawq.groupby(['permno'])['ppegtq'].shift(4) - -data_rawq['invest'] = np.where(data_rawq['ppegtq'].isnull(), ((data_rawq['ppentq']-data_rawq['ppentq_l4'])+ - (data_rawq['invtq']-data_rawq['invtq_l4']))/data_rawq['atq_l4'], - ((data_rawq['ppegtq']-data_rawq['ppegtq_l4'])+(data_rawq['invtq']-data_rawq['invtq_l4']))/data_rawq['atq_l4']) - -# rd_sale -data_rawq['rd_sale'] = data_rawq['xrdq4']/data_rawq['saleq4'] - -# lgr -data_rawq['lgr'] = (data_rawq['ltq']/data_rawq['ltq_l4'])-1 - -# depr -data_rawq['depr'] = ttm4('dpq', data_rawq)/data_rawq['ppentq'] - -# egr -data_rawq['ceqq_l4'] = data_rawq.groupby(['permno'])['ceqq'].shift(4) -data_rawq['egr'] = (data_rawq['ceqq']-data_rawq['ceqq_l4'])/data_rawq['ceqq_l4'] - -# chpm -data_rawq['ibq4_l1'] = data_rawq.groupby(['permno'])['ibq4'].shift(1) -data_rawq['saleq4_l1'] = data_rawq.groupby(['permno'])['saleq4'].shift(1) - -data_rawq['chpm'] = (data_rawq['ibq4']/data_rawq['saleq4'])-(data_rawq['ibq4_l1']/data_rawq['saleq4_l1']) - -# chato -data_rawq['atq_l8'] = data_rawq.groupby(['permno'])['atq'].shift(8) -data_rawq['chato'] = (data_rawq['saleq4']/((data_rawq['atq']+data_rawq['atq_l4'])/2))-(data_rawq['saleq4_l4']/((data_rawq['atq_l4']+data_rawq['atq_l8'])/2)) - -# noa -data_rawq['ivaoq'] = np.where(data_rawq['ivaoq'].isnull(), 0, 1) -data_rawq['dlcq'] = np.where(data_rawq['dlcq'].isnull(), 0, 1) -data_rawq['dlttq'] = np.where(data_rawq['dlttq'].isnull(), 0, 1) -data_rawq['mibq'] = np.where(data_rawq['mibq'].isnull(), 0, 1) -data_rawq['pstkq'] = np.where(data_rawq['pstkq'].isnull(), 0, 1) -data_rawq['noa'] = (data_rawq['atq']-data_rawq['cheq']-data_rawq['ivaoq'])-\ - (data_rawq['atq']-data_rawq['dlcq']-data_rawq['dlttq']-data_rawq['mibq']-data_rawq['pstkq']-data_rawq['ceqq'])/data_rawq['atq_l4'] - -# rna -data_rawq['noa_l4'] = data_rawq.groupby(['permno'])['noa'].shift(4) 
-data_rawq['rna'] = data_rawq['oiadpq']/data_rawq['noa_l4'] - -# pm -data_rawq['pm'] = data_rawq['oiadpq']/data_rawq['saleq'] - -# ato -data_rawq['ato'] = data_rawq['saleq']/data_rawq['noa_l4'] - -# roe -data_rawq['ceqq_l1'] = data_rawq.groupby(['permno'])['ceqq'].shift(1) -data_rawq['roe'] = data_rawq['ibq']/data_rawq['ceqq_l1'] - -################################## New Added ################################## - -# grltnoa -data_rawq['rectq_l4'] = data_rawq.groupby(['permno'])['rectq'].shift(4) -data_rawq['acoq_l4'] = data_rawq.groupby(['permno'])['acoq'].shift(4) -data_rawq['apq_l4'] = data_rawq.groupby(['permno'])['apq'].shift(4) -data_rawq['lcoq_l4'] = data_rawq.groupby(['permno'])['lcoq'].shift(4) -data_rawq['loq_l4'] = data_rawq.groupby(['permno'])['loq'].shift(4) -data_rawq['invtq_l4'] = data_rawq.groupby(['permno'])['invtq'].shift(4) -data_rawq['ppentq_l4'] = data_rawq.groupby(['permno'])['ppentq'].shift(4) -data_rawq['atq_l4'] = data_rawq.groupby(['permno'])['atq'].shift(4) - -data_rawq['grltnoa'] = ((data_rawq['rectq']+data_rawq['invtq']+data_rawq['ppentq']+data_rawq['acoq']+data_rawq['intanq']+ - data_rawq['aoq']-data_rawq['apq']-data_rawq['lcoq']-data_rawq['loq'])- - (data_rawq['rectq_l4']+data_rawq['invtq_l4']+data_rawq['ppentq_l4']+data_rawq['acoq_l4']-data_rawq['apq_l4']-data_rawq['lcoq_l4']-data_rawq['loq_l4'])-\ - (data_rawq['rectq']-data_rawq['rectq_l4']+data_rawq['invtq']-data_rawq['invtq_l4']+data_rawq['acoq']- - (data_rawq['apq']-data_rawq['apq_l4']+data_rawq['lcoq']-data_rawq['lcoq_l4'])- - ttm4('dpq', data_rawq)))/((data_rawq['atq']+data_rawq['atq_l4'])/2) - -# scal -# condlist = [data_rawq['seqq'].isnull(), -# data_rawq['seqq'].isnull() & (data_rawq['ceqq'].isnull() | data_rawq['pstk'].isnull())] -# choicelist = [data_rawq['ceqq']+data_rawq['pstk'], -# data_rawq['atq']-data_rawq['ltq']] -# data_rawq['scal'] = np.select(condlist, choicelist, default=data_rawq['seqq']) - -# ala -data_rawq['ala'] = data_rawq['cheq'] + 
0.75*(data_rawq['actq']-data_rawq['cheq'])+\ - 0.5*(data_rawq['atq']-data_rawq['actq']-data_rawq['gdwlq']-data_rawq['intanq']) - -# alm -# data_rawq['alm'] = data_rawq['ala']/(data_rawq['atq']+data_rawq['me']-data_rawq['ceqq']) - -# rsup -data_rawq['saleq_l4'] = data_rawq.groupby(['permno'])['saleq'].shift(4) -# data_rawq['rsup'] = (data_rawq['saleq'] - data_rawq['saleq_l4'])/data_rawq['me'] - -# stdsacc -data_rawq['actq_l1'] = data_rawq.groupby(['permno'])['actq'].shift(1) -data_rawq['cheq_l1'] = data_rawq.groupby(['permno'])['cheq'].shift(1) -data_rawq['lctq_l1'] = data_rawq.groupby(['permno'])['lctq'].shift(1) -data_rawq['dlcq_l1'] = data_rawq.groupby(['permno'])['dlcq'].shift(1) - -data_rawq['sacc'] = ((data_rawq['actq']-data_rawq['actq_l1'] - (data_rawq['cheq']-data_rawq['cheq_l1'])) - -((data_rawq['lctq']-data_rawq['lctq_l1'])-(data_rawq['dlcq']-data_rawq['dlcq_l1'])))/data_rawq['saleq'] -data_rawq['sacc'] = np.where(data_rawq['saleq']<=0, ((data_rawq['actq']-data_rawq['actq_l1'] - (data_rawq['cheq']-data_rawq['cheq_l1'])) - -((data_rawq['lctq']-data_rawq['lctq_l1'])-(data_rawq['dlcq']-data_rawq['dlcq_l1'])))/0.01, data_rawq['sacc']) - - -def chars_std(start, end, df, chars): - """ - - :param start: Order of starting lag - :param end: Order of ending lag - :param df: Dataframe - :param chars: lag chars - :return: std of factor - """ - lag = pd.DataFrame() - lag_list = [] - for i in range(start, end): - lag['chars_l%s' % i] = df.groupby(['permno'])['%s' % chars].shift(i) - lag_list.append('chars_l%s' % i) - result = lag[lag_list].std(axis=1) - return result - -data_rawq['stdacc'] = chars_std(0, 16, data_rawq, 'sacc') - -# sgrvol -# data_rawq['sgrvol'] = chars_std(0, 15, data_rawq, 'rsup') - -# roavol -data_rawq['roavol'] = chars_std(0, 16, data_rawq, 'roa') - -# stdcf -data_rawq['scf'] = (data_rawq['ibq']/data_rawq['saleq']) - data_rawq['sacc'] -data_rawq['scf'] = np.where(data_rawq['saleq']<=0, (data_rawq['ibq']/0.01) - data_rawq['sacc'], data_rawq['sacc']) - 
-data_rawq['stdcf'] = chars_std(0, 16, data_rawq, 'scf') - -# cinvest -data_rawq['ppentq_l1'] = data_rawq.groupby(['permno'])['ppentq'].shift(1) -data_rawq['ppentq_l2'] = data_rawq.groupby(['permno'])['ppentq'].shift(2) -data_rawq['ppentq_l3'] = data_rawq.groupby(['permno'])['ppentq'].shift(3) -data_rawq['ppentq_l4'] = data_rawq.groupby(['permno'])['ppentq'].shift(4) -data_rawq['saleq_l1'] = data_rawq.groupby(['permno'])['saleq'].shift(1) -data_rawq['saleq_l2'] = data_rawq.groupby(['permno'])['saleq'].shift(2) -data_rawq['saleq_l3'] = data_rawq.groupby(['permno'])['saleq'].shift(3) - -data_rawq['c_temp1'] = (data_rawq['ppentq_l1'] - data_rawq['ppentq_l2']) / data_rawq['saleq_l1'] -data_rawq['c_temp2'] = (data_rawq['ppentq_l2'] - data_rawq['ppentq_l3']) / data_rawq['saleq_l2'] -data_rawq['c_temp3'] = (data_rawq['ppentq_l3'] - data_rawq['ppentq_l4']) / data_rawq['saleq_l3'] - -data_rawq['cinvest'] = ((data_rawq['ppentq'] - data_rawq['ppentq_l1']) / data_rawq['saleq'])\ - -(data_rawq[['c_temp1', 'c_temp2', 'c_temp3']].mean(axis=1)) - -data_rawq['c_temp1'] = (data_rawq['ppentq_l1'] - data_rawq['ppentq_l2']) / 0.01 -data_rawq['c_temp2'] = (data_rawq['ppentq_l2'] - data_rawq['ppentq_l3']) / 0.01 -data_rawq['c_temp3'] = (data_rawq['ppentq_l3'] - data_rawq['ppentq_l4']) / 0.01 - -data_rawq['cinvest'] = np.where(data_rawq['saleq']<=0, ((data_rawq['ppentq'] - data_rawq['ppentq_l1']) / 0.01) - -(data_rawq[['c_temp1', 'c_temp2', 'c_temp3']].mean(axis=1)), data_rawq['cinvest']) - -data_rawq = data_rawq.drop(['c_temp1', 'c_temp2', 'c_temp3'], axis=1) - -# nincr -data_rawq['ibq_l1'] = data_rawq.groupby(['permno'])['ibq'].shift(1) -data_rawq['ibq_l2'] = data_rawq.groupby(['permno'])['ibq'].shift(2) -data_rawq['ibq_l3'] = data_rawq.groupby(['permno'])['ibq'].shift(3) -data_rawq['ibq_l4'] = data_rawq.groupby(['permno'])['ibq'].shift(4) -data_rawq['ibq_l5'] = data_rawq.groupby(['permno'])['ibq'].shift(5) -data_rawq['ibq_l6'] = data_rawq.groupby(['permno'])['ibq'].shift(6) 
-data_rawq['ibq_l7'] = data_rawq.groupby(['permno'])['ibq'].shift(7) -data_rawq['ibq_l8'] = data_rawq.groupby(['permno'])['ibq'].shift(8) - -data_rawq['nincr_temp1'] = np.where(data_rawq['ibq'] > data_rawq['ibq_l1'], 1, 0) -data_rawq['nincr_temp2'] = np.where(data_rawq['ibq_l1'] > data_rawq['ibq_l2'], 1, 0) -data_rawq['nincr_temp3'] = np.where(data_rawq['ibq_l2'] > data_rawq['ibq_l3'], 1, 0) -data_rawq['nincr_temp4'] = np.where(data_rawq['ibq_l3'] > data_rawq['ibq_l4'], 1, 0) -data_rawq['nincr_temp5'] = np.where(data_rawq['ibq_l4'] > data_rawq['ibq_l5'], 1, 0) -data_rawq['nincr_temp6'] = np.where(data_rawq['ibq_l5'] > data_rawq['ibq_l6'], 1, 0) -data_rawq['nincr_temp7'] = np.where(data_rawq['ibq_l6'] > data_rawq['ibq_l7'], 1, 0) -data_rawq['nincr_temp8'] = np.where(data_rawq['ibq_l7'] > data_rawq['ibq_l8'], 1, 0) - -data_rawq['nincr'] = (data_rawq['nincr_temp1'] - + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']) - + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']) - + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']) - + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']*data_rawq['nincr_temp5']) - + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']*data_rawq['nincr_temp5']*data_rawq['nincr_temp6']) - + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']*data_rawq['nincr_temp5']*data_rawq['nincr_temp6']*data_rawq['nincr_temp7']) - + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']*data_rawq['nincr_temp5']*data_rawq['nincr_temp6']*data_rawq['nincr_temp7']*data_rawq['nincr_temp8'])) - -data_rawq = data_rawq.drop(['ibq_l1', 'ibq_l2', 'ibq_l3', 'ibq_l4', 'ibq_l5', 'ibq_l6', 'ibq_l7', 'ibq_l8', 'nincr_temp1', - 'nincr_temp2', 'nincr_temp3', 'nincr_temp4', 'nincr_temp5', 'nincr_temp6', 
'nincr_temp7',
-                            'nincr_temp8'], axis=1)
-
-# performance score
-data_rawq['niq4'] = ttm4(series='niq', df=data_rawq)
-data_rawq['niq4_l4'] = data_rawq.groupby(['permno'])['niq4'].shift(4)
-data_rawq['dlttq_l4'] = data_rawq.groupby(['permno'])['dlttq'].shift(4)
-data_rawq['p_temp1'] = np.where(data_rawq['niq4']>0, 1, 0)
-data_rawq['p_temp2'] = np.where(data_rawq['oancfy']>0, 1, 0)
-data_rawq['p_temp3'] = np.where(data_rawq['niq4']/data_rawq['atq']>data_rawq['niq4_l4']/data_rawq['atq_l4'], 1, 0)
-data_rawq['p_temp4'] = np.where(data_rawq['oancfy']>data_rawq['niq4'], 1, 0)
-data_rawq['p_temp5'] = np.where(data_rawq['dlttq']/data_rawq['atq'] < data_rawq['dlttq_l4']/data_rawq['atq_l4'], 1, 0)
-data_rawq['p_temp6'] = np.where(data_rawq['actq']/data_rawq['lctq'] > data_rawq['actq_l4']/data_rawq['lctq_l4'], 1, 0)
-data_rawq['cogsq4_l4'] = data_rawq.groupby(['permno'])['cogsq4'].shift(4)
-data_rawq['p_temp7'] = np.where((data_rawq['saleq4']-data_rawq['cogsq4']/data_rawq['saleq4'])>(data_rawq['saleq4_l4']-data_rawq['cogsq4_l4']/data_rawq['saleq4_l4']), 1, 0)
-data_rawq['p_temp8'] = np.where(data_rawq['saleq4']/data_rawq['atq']>data_rawq['saleq4_l4']/data_rawq['atq_l4'], 1, 0)
-data_rawq['p_temp9'] = np.where(data_rawq['scstkcy']==0, 1, 0)
-
-data_rawq['pscore'] = data_rawq['p_temp1']+data_rawq['p_temp2']+data_rawq['p_temp3']+data_rawq['p_temp4']\
-                      +data_rawq['p_temp5']+data_rawq['p_temp6']+data_rawq['p_temp7']+data_rawq['p_temp8']\
-                      +data_rawq['p_temp9']
-
-data_rawq = data_rawq.drop(['p_temp1', 'p_temp2', 'p_temp3', 'p_temp4', 'p_temp5', 'p_temp6', 'p_temp7', 'p_temp8',
-                            'p_temp9'], axis=1)
-
-################################## Added on 2020.10.29 ##################################
-#Iaq
-data_rawq['atqlag'] = ttm4('atq',data_rawq)
-data_rawq['iaq'] = (data_rawq['atq']/data_rawq['atqlag'])-1
-
-#Almq
-data_rawq['intanq'] = np.where(data_rawq['intanq'].isnull(), 0, data_rawq['intanq'])
-data_rawq['qal'] = data_rawq['cheq'] + 0.75*(data_rawq['actq']-data_rawq['cheq']) + 0.5*(data_rawq['atq'] - data_rawq['actq'] - data_rawq['intanq'])
-data_rawq['mveqa'] = data_rawq['atq'] + 
data_rawq['mveq_f'] - data_rawq['ceqq'] -data_rawq['mveqa_1'] = data_rawq.groupby(['permno'])['mveqa'].shift(1) -data_rawq['almq'] = data_rawq['qal']/data_rawq['mveqa_1'] - -#Olq, needs atq -data_rawq['olq'] = (data_rawq['cogsq'] + data_rawq['xsgaq'])/data_rawq['atq'] - -# rds -data_rawq['rds'] = data_rawq['xrdq4']/data_rawq['saleq'] - -print('quarterly variables') -####################################################################################################################### -# Momentum # -####################################################################################################################### -crsp_mom = conn.raw_sql(""" - select permno, date, ret, retx, prc, shrout, vol - from crsp.msf - where date >= '01/01/1959' - """) - -crsp_mom['permno'] = crsp_mom['permno'].astype(int) -crsp_mom['date'] = pd.to_datetime(crsp_mom['date']) -crsp_mom['jdate'] = pd.to_datetime(crsp_mom['date']) + MonthEnd(0) -crsp_mom = crsp_mom.dropna(subset=['ret', 'retx', 'prc']) - -# add delisting return -dlret = conn.raw_sql(""" - select permno, dlret, dlstdt - from crsp.msedelist - """) - -dlret.permno = dlret.permno.astype(int) -dlret['dlstdt'] = pd.to_datetime(dlret['dlstdt']) -dlret['jdate'] = dlret['dlstdt'] + MonthEnd(0) - -# merge delisting return to crsp return -crsp_mom = pd.merge(crsp_mom, dlret, how='left', on=['permno', 'jdate']) -crsp_mom['dlret'] = crsp_mom['dlret'].fillna(0) -crsp_mom['ret'] = crsp_mom['ret'].fillna(0) -crsp_mom['retadj'] = (1 + crsp_mom['ret']) * (1 + crsp_mom['dlret']) - 1 -crsp_mom['me'] = crsp_mom['prc'].abs() * crsp_mom['shrout'] # calculate market equity -crsp_mom['retx'] = np.where(crsp_mom['me'].isnull(), 0, crsp_mom['retx']) -crsp_mom = crsp_mom.drop(['dlret', 'dlstdt'], axis=1)#delete prc,shrout - -#Seasonality - -#Rla -crsp_mom['rla'] = crsp_mom.groupby(['permno'])['ret'].shift(12) - -#Rln -lag = pd.DataFrame() -result = 0 -for i in range(1, 12): - lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) - result = 
result + lag['mom%s' % i] -crsp_mom['rln'] = result/11 - -#R[2,5]a -#R[2,5]n -lag = pd.DataFrame() -result = 0 -for i in range(13,61): - lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) - if i not in [24,36,48,60]: - result = result + lag['mom%s' % i] - -crsp_mom['r25a'] = (lag['mom24']+lag['mom36']+lag['mom48']+lag['mom60'])/4 -crsp_mom['r25n'] = result/44 - -#R[6,10]a -#R[6,10]n -lag = pd.DataFrame() -result = 0 -for i in range(61,121): - lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) - if i not in [72,84,96,108,120]: - result = result + lag['mom%s' % i] - -crsp_mom['r610a'] = (lag['mom72']+lag['mom84']+lag['mom96']+lag['mom108']+lag['mom120'])/5 -crsp_mom['r610n'] = result/55 - -#R[11,15]a -lag = pd.DataFrame() -result = 0 -for i in [132,144,156,168,180]: - lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) - result = result + lag['mom%s' % i] -crsp_mom['r1115a'] = result/5 - -#R[16,20]a -lag = pd.DataFrame() -result = 0 -for i in [192,204,216,228,240]: - lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) - result = result + lag['mom%s' % i] -crsp_mom['r1620a'] = result/5 - - -def mom(start, end, df): - """ - :param start: Order of starting lag - :param end: Order of ending lag - :param df: Dataframe - :return: Momentum factor - """ - lag = pd.DataFrame() - result = 1 - for i in range(start, end): - lag['mom%s' % i] = df.groupby(['permno'])['ret'].shift(i) - result = result * (1+lag['mom%s' % i]) - result = result - 1 - return result - - -crsp_mom['mom60m'] = mom(12, 60, crsp_mom) -crsp_mom['mom12m'] = mom(1, 12, crsp_mom) -crsp_mom['mom1m'] = crsp_mom['ret'] -crsp_mom['mom6m'] = mom(1, 6, crsp_mom) -crsp_mom['mom36m'] = mom(1, 36, crsp_mom) -crsp_mom['seas1a'] = crsp_mom.groupby(['permno'])['ret'].shift(11) - -crsp_mom['vol_l1'] = crsp_mom.groupby(['permno'])['vol'].shift(1) -crsp_mom['vol_l2'] = crsp_mom.groupby(['permno'])['vol'].shift(2) -crsp_mom['vol_l3'] = 
crsp_mom.groupby(['permno'])['vol'].shift(3) -crsp_mom['prc_l2'] = crsp_mom.groupby(['permno'])['prc'].shift(2) -crsp_mom['dolvol'] = np.log(crsp_mom['vol_l2']*crsp_mom['prc_l2']).replace([np.inf, -np.inf], np.nan) -crsp_mom['turn'] = ((crsp_mom['vol_l1']+crsp_mom['vol_l2']+crsp_mom['vol_l3'])/3)/crsp_mom['shrout'] - -# dy -crsp_mom['me_l1'] = crsp_mom.groupby(['permno'])['me'].shift(1) -crsp_mom['retdy'] = crsp_mom['ret'] - crsp_mom['retx'] -crsp_mom['mdivpay'] = crsp_mom['retdy']*crsp_mom['me_l1'] - -crsp_mom['dy'] = ttm12(series='mdivpay', df=crsp_mom)/crsp_mom['me'] - -# def moms(start, end, df): -# """ -# -# :param start: Order of starting lag -# :param end: Order of ending lag -# :param df: Dataframe -# :return: Momentum factor -# """ -# lag = pd.DataFrame() -# result = 1 -# for i in range(start, end): -# lag['moms%s' % i] = df.groupby['permno']['ret'].shift(i) -# result = result + lag['moms%s' % i] -# result = result/11 -# return result -# -# -# crsp_mom['moms12m'] = moms(1, 12, crsp_mom) - -# populate the chars to monthly -print('momentum') -# data_rawa -data_rawa = data_rawa.drop(['date', 'ret', 'retx', 'me'], axis=1) -data_rawa = pd.merge(crsp_mom, data_rawa, how='left', on=['permno', 'jdate']) -data_rawa['datadate'] = data_rawa.groupby(['permno'])['datadate'].fillna(method='ffill') -data_rawa = data_rawa.groupby(['permno', 'datadate'], as_index=False).fillna(method='ffill') -data_rawa = data_rawa[((data_rawa['exchcd'] == 1) | (data_rawa['exchcd'] == 2) | (data_rawa['exchcd'] == 3)) & - ((data_rawa['shrcd'] == 10) | (data_rawa['shrcd'] == 11))] -print('data_rawa') -# data_rawq -data_rawq = data_rawq.drop(['date', 'ret', 'retx', 'me'], axis=1) -data_rawq = pd.merge(crsp_mom, data_rawq, how='left', on=['permno', 'jdate']) -data_rawq['datadate'] = data_rawq.groupby(['permno'])['datadate'].fillna(method='ffill') -data_rawq = data_rawq.groupby(['permno', 'datadate'], as_index=False).fillna(method='ffill') -data_rawq = data_rawq[((data_rawq['exchcd'] == 1) | 
(data_rawq['exchcd'] == 2) | (data_rawq['exchcd'] == 3)) & - ((data_rawq['shrcd'] == 10) | (data_rawq['shrcd'] == 11))] -print('data_rawq') -####################################################################################################################### -# Monthly ME # -####################################################################################################################### - -######################################## -# Annual # -######################################## - -# bm -data_rawa['bm'] = data_rawa['be'] / data_rawa['me'] -#data_rawa['bm_n'] = data_rawa['be'] - -# bm_ia -df_temp = data_rawa.groupby(['datadate', 'ffi49'], as_index=False)['bm'].mean() -df_temp = df_temp.rename(columns={'bm': 'bm_ind'}) -data_rawa = pd.merge(data_rawa, df_temp, how='left', on=['datadate', 'ffi49']) -data_rawa['bm_ia'] = data_rawa['bm']/data_rawa['bm_ind'] - -# me_ia -df_temp = data_rawa.groupby(['datadate', 'ffi49'], as_index=False)['me'].mean() -df_temp = df_temp.rename(columns={'me': 'me_ind'}) -data_rawa = pd.merge(data_rawa, df_temp, how='left', on=['datadate', 'ffi49']) -data_rawa['me_ia'] = data_rawa['me']/data_rawa['me_ind'] - -# cfp -condlist = [data_rawa['dp'].isnull(), - data_rawa['ib'].isnull()] -choicelist = [data_rawa['ib']/data_rawa['me'], - np.nan] -data_rawa['cfp'] = np.select(condlist, choicelist, default=(data_rawa['ib']+data_rawa['dp'])/data_rawa['me']) - -# ep, checked from Hou and change 'ME' from compustat to crsp,checked -data_rawa['ep'] = data_rawa['ib']/data_rawa['me'] -#data_rawa['ep_n'] = data_rawa['ib'] - -# rsup -# data_rawa['sale_l1'] = data_rawa.groupby(['permno'])['sale'].shift(1) -data_rawa['rsup'] = (data_rawa['sale']-data_rawa['sale_l1'])/data_rawa['me'] - -# lev -data_rawa['lev'] = data_rawa['lt']/data_rawa['me'] - -# sp, checked -data_rawa['sp'] = data_rawa['sale']/data_rawa['me'] -#data_rawa['sp_n'] = data_rawa['sale'] - -# rdm -data_rawa['rdm'] = data_rawa['xrd']/data_rawa['me'] - -# adm hxz adm,checked 
-data_rawa['adm'] = data_rawa['xad']/data_rawa['me'] - -# dy -data_rawa['dy'] = data_rawa['dvt']/data_rawa['me'] - -# Cp -#data_rawa['cf'] = data_rawa['ib'] + data_rawa['dp'] -data_rawa['cp'] = data_rawa['cf'] / data_rawa['me'] - -# Ebp -#data_rawa['dvpa'] = np.where(data_rawa['dvpa'].isnull(), 0, data_rawa['dvpa']) -#data_rawa['tstkp'] = np.where(data_rawa['tstkp'].isnull(), 0, data_rawa['tstkp']) -#data_rawa['f_liab'] = data_rawa['dltt'] + data_rawa['dlc'] + data_rawa['pstk'] + data_rawa['dvpa'] - data_rawa['tstkp'] -#data_rawa['f_asse'] = data_rawa['che'] -# net debt : = financial liabilities - financial assets. -#data_rawa['n_debt'] = data_rawa['f_liab'] - data_rawa['f_asse'] -#data_rawa['ber'] = data_rawa['ceq'] + data_rawa['tstkp'] - data_rawa['dvpa'] -data_rawa['ebp'] = (data_rawa['n_debt']+data_rawa['ber']) / (data_rawa['n_debt']+data_rawa['me']) - -# Em -data_rawa['enteprs_v'] = data_rawa['me'] + data_rawa['dlc'] + data_rawa['dltt'] + data_rawa['pstkrv'] - data_rawa['che'] -data_rawa['em'] = data_rawa['enteprs_v'] / data_rawa['oibdp'] - -# Cei -data_rawa['lg_me'] = np.log(data_rawa['me']/data_rawa['me'].shift(6)) -data_rawa['lg_ret'] = np.log(data_rawa['ret']*data_rawa['ret'].shift(1)*data_rawa['ret'].shift(2)*data_rawa['ret'].shift(3)*data_rawa['ret'].shift(5)*data_rawa['ret'].shift(6)) -data_rawa['cei'] = data_rawa['lg_me'] - data_rawa['lg_ret'] - -#nop -data_rawa['net_p'] = data_rawa['dvc'] + data_rawa['prstkc'] + 2*data_rawa['pstkrv'] - data_rawa['sstk'] -data_rawa['nop'] = data_rawa['net_p'] / data_rawa['me'] -data_rawa['nop'] = np.where(data_rawa['nop']<=0, np.nan, data_rawa['nop'] ) - -#ocp -data_rawa['ocy'] = np.where(data_rawa['jdate'] < '1988-06-30', data_rawa['fopt'] - data_rawa['wcap'], data_rawa['fopt'] - data_rawa['oancf']) -data_rawa['ocp'] = data_rawa['ocy'] / data_rawa['me'] -data_rawa['ocp'] = np.where(data_rawa['ocp']<=0, np.nan, data_rawa['ocp'] ) - -#bm_t-5 (bm of year t-5) -data_rawa['bm5'] = 
data_rawa.groupby(['permno'])['bm'].shift(5) - -#rB (five year log book return) -#Reference: jf_06 page8 by KENT DANIEL -data_rawa['rB'] = data_rawa['bm'] - data_rawa['bm5'] + data_rawa['ret5'] - -#Regression and get ir -#First get unique datelist -datelist = data_rawa['jdate'].unique() -for date in datelist: - temp = data_rawa[data_rawa['jdate'] == date] - n_row = temp.shape[0] - index = temp.index - X = pd.DataFrame() - X['bm5'] = temp['bm5'] - X['rB'] = temp['rB'] - X['intercept'] = 1 - X = X[['intercept','rB','bm5']] - X = np.mat(X) - Y = np.mat(temp[['ret5']]) - #These are residuals on one date - res = (np.identity(n_row) - X.dot(X.T.dot(X).I).dot(X.T)).dot(Y) - #put residuals back into data_rawa - data_rawa.loc[index,'ir'] = res - -# Annual Accounting Variables -chars_a = data_rawa[['cusip', 'ncusip', 'gvkey', 'permno', 'exchcd', 'shrcd', 'datadate', 'jdate', - 'sic', 'retadj', 'acc', 'agr', 'bm', 'cfp', 'ep', 'ni', 'op', 'rsup', 'cash', 'chcsho', - 'rd', 'cashdebt', 'pctacc', 'gma', 'lev', 'rdm', 'adm', 'sgr', 'sp', 'invest', 'roe', - 'rd_sale', 'lgr', 'roa', 'depr', 'egr', 'chato', 'chtx', 'noa', 'rna', 'pm', 'ato', 'dy', - 'roic', 'chinv', 'pchsale_pchinvt', 'pchsale_pchrect', 'pchgm_pchsale', 'pchsale_pchxsga', - 'pchdepr', 'chadv', 'pchcapx', 'grcapx', 'grGW', 'currat', 'pchcurrat', 'quick', 'pchquick', - 'salecash', 'salerec', 'saleinv', 'pchsaleinv', 'realestate', 'obklg', 'chobklg', 'grltnoa', - 'conv', 'chdrc', 'rdbias', 'operprof', 'capxint', 'xadint', 'chpm', 'ala', 'alm', - 'mom1m', 'mom6m', 'mom12m', 'mom60m', 'mom36m', 'seas1a', 'me', 'hire', 'herf', 'bm_ia', - 'me_ia', 'bmj','cp', 'ebp', 'em', 'dp', 'aci', 'dpia', 'dBe', 'dfnl', 'dfin', 'dcoa', - 'dlno', 'dnoa', 'cla', 'cop', 'cto', 'dIi', 'dnco', 'dnca', 'ir', 'nop', 'ocp', - 'ia', 'ig','2ig','ivc','ndf','nsi','oa','poa','ta','ol','etr']] - -chars_a.reset_index(drop=True, inplace=True) -print(chars_a) -print('ME annual') -######################################## -# Quarterly # 
-######################################## -# bm -data_rawq['bm'] = data_rawq['beq']/data_rawq['me'] - -# cfp -data_rawq['cfp'] = np.where(data_rawq['dpq'].isnull(), - data_rawq['ibq4']/data_rawq['me'], - (data_rawq['ibq4']+data_rawq['dpq4'])/data_rawq['me']) - -# ep -data_rawq['ep'] = data_rawq['ibq4']/data_rawq['me'] - -# lev -data_rawq['lev'] = data_rawq['ltq']/data_rawq['me'] - -# rdm -data_rawq['rdm'] = data_rawq['xrdq4']/data_rawq['me'] - -# sp -data_rawq['sp'] = data_rawq['saleq4']/data_rawq['me'] - -# alm -data_rawq['alm'] = data_rawq['ala']/(data_rawq['atq']+data_rawq['me']-data_rawq['ceqq']) - -# rsup -# data_rawq['saleq_l4'] = data_rawq.groupby(['permno'])['saleq'].shift(4) -data_rawq['rsup'] = (data_rawq['saleq'] - data_rawq['saleq_l4'])/data_rawq['me'] - -# sgrvol -data_rawq['sgrvol'] = chars_std(0, 15, data_rawq, 'rsup') - -# Quarterly Accounting Variables -chars_q = data_rawq[['gvkey', 'permno', 'datadate', 'jdate', 'sic', 'exchcd', 'shrcd','retadj' ,'acc', 'bm', 'cfp', - 'ep', 'agr', 'ni', 'op', 'cash', 'chcsho', 'rd', 'cashdebt', 'pctacc', 'gma', 'lev', - 'rdm', 'sgr', 'sp', 'invest', 'rd_sale', 'lgr', 'roa', 'depr', 'egr', 'roe', - 'chato', 'chpm', 'chtx', 'noa', 'rna', 'pm', 'ato', 'stdcf', - 'grltnoa', 'ala', 'alm', 'rsup', 'stdacc', 'sgrvol', 'roavol', 'scf', 'cinvest', - 'mom1m', 'mom6m', 'mom12m', 'mom60m', 'mom36m', 'seas1a', 'me', 'pscore', 'nincr', - 'turn', 'dolvol', 'iaq', 'almq', 'olq', 'rds']] - -chars_q.reset_index(drop=True, inplace=True) -print(chars_q) -print('ME quarterly') -with open('chars_a_60.pkl', 'wb') as f: - pkl.dump(chars_a, f) -print('pkl a') -with open('chars_q_60.pkl', 'wb') as f: - pkl.dump(chars_q, f) -print('pkl q') -print('Finished') \ No newline at end of file diff --git a/char60/accounting_60.py b/char60/accounting_60.py deleted file mode 100755 index d32b43c..0000000 --- a/char60/accounting_60.py +++ /dev/null @@ -1,1215 +0,0 @@ -import pandas as pd -import numpy as np -import wrds -from pandas.tseries.offsets 
import * -import pickle as pkl -from functions import * - -################### -# Connect to WRDS # -################### -conn = wrds.Connection() - -####################################################################################################################### -# TTM functions # -####################################################################################################################### - - -def ttm4(series, df): - """ - - :param series: variables' name - :param df: dataframe - :return: ttm4 - """ - lag = pd.DataFrame() - for i in range(1, 4): - lag['%(series)s%(lag)s' % {'series': series, 'lag': i}] = df.groupby('permno')['%s' % series].shift(i) - result = df['%s' % series] + lag['%s1' % series] + lag['%s2' % series] + lag['%s3' % series] - return result - - -def ttm12(series, df): - """ - - :param series: variables' name - :param df: dataframe - :return: ttm12 - """ - lag = pd.DataFrame() - for i in range(1, 12): - lag['%(series)s%(lag)s' % {'series': series, 'lag': i}] = df.groupby('permno')['%s' % series].shift(i) - result = df['%s' % series] + lag['%s1' % series] + lag['%s2' % series] + lag['%s3' % series] +\ - lag['%s4' % series] + lag['%s5' % series] + lag['%s6' % series] + lag['%s7' % series] +\ - lag['%s8' % series] + lag['%s9' % series] + lag['%s10' % series] + lag['%s11' % series] - return result - - -####################################################################################################################### -# Compustat Block # -####################################################################################################################### -comp = conn.raw_sql(""" - /*header info*/ - select c.gvkey, f.cusip, f.datadate, f.fyear, c.cik, substr(c.sic,1,2) as sic2, c.sic, c.naics, - - /*firm variables*/ - /*income statement*/ - f.sale, f.revt, f.cogs, f.xsga, f.dp, f.xrd, f.xad, f.ib, f.ebitda, - f.ebit, f.nopi, f.spi, f.pi, f.txp, f.ni, f.txfed, f.txfo, f.txt, f.xint, - - /*CF statement and others*/ - f.capx, f.oancf, 
f.dvt, f.ob, f.gdwlia, f.gdwlip, f.gwo, f.mib, f.oiadp, f.ivao, - - /*assets*/ - f.rect, f.act, f.che, f.ppegt, f.invt, f.at, f.aco, f.intan, f.ao, f.ppent, f.gdwl, f.fatb, f.fatl, - - /*liabilities*/ - f.lct, f.dlc, f.dltt, f.lt, f.dm, f.dcvt, f.cshrc, - f.dcpstk, f.pstk, f.ap, f.lco, f.lo, f.drc, f.drlt, f.txdi, - - /*equity and other*/ - f.ceq, f.scstkc, f.emp, f.csho, f.seq, f.txditc, f.pstkrv, f.pstkl, f.np, f.txdc, f.dpc, f.ajex, - - /*market*/ - abs(f.prcc_f) as prcc_f - - from comp.funda as f - left join comp.company as c - on f.gvkey = c.gvkey - - /*get consolidated, standardized, industrial format statements*/ - where f.indfmt = 'INDL' - and f.datafmt = 'STD' - and f.popsrc = 'D' - and f.consol = 'C' - and f.datadate >= '01/01/1959' - """) - -# convert datadate to date fmt -comp['datadate'] = pd.to_datetime(comp['datadate']) - -# sort and clean up -comp = comp.sort_values(by=['gvkey', 'datadate']).drop_duplicates() - -# clean up csho -comp['csho'] = np.where(comp['csho'] == 0, np.nan, comp['csho']) - -# calculate Compustat market equity -comp['mve_f'] = comp['csho'] * comp['prcc_f'] - -# do some clean up. 
several variables have lots of missing values -condlist = [comp['drc'].notna() & comp['drlt'].notna(), - comp['drc'].notna() & comp['drlt'].isnull(), - comp['drlt'].notna() & comp['drc'].isnull()] -choicelist = [comp['drc']+comp['drlt'], - comp['drc'], - comp['drlt']] -comp['dr'] = np.select(condlist, choicelist, default=np.nan) - -condlist = [comp['dcvt'].isnull() & comp['dcpstk'].notna() & comp['pstk'].notna() & comp['dcpstk'] > comp['pstk'], - comp['dcvt'].isnull() & comp['dcpstk'].notna() & comp['pstk'].isnull()] -choicelist = [comp['dcpstk']-comp['pstk'], - comp['dcpstk']] -comp['dc'] = np.select(condlist, choicelist, default=np.nan) -comp['dc'] = np.where(comp['dc'].isnull(), comp['dcvt'], comp['dc']) - -comp['xint0'] = np.where(comp['xint'].isnull(), 0, comp['xint']) -comp['xsga0'] = np.where(comp['xsga'].isnull, 0, 0) - -comp['ceq'] = np.where(comp['ceq'] == 0, np.nan, comp['ceq']) -comp['at'] = np.where(comp['at'] == 0, np.nan, comp['at']) -comp = comp.dropna(subset=['at']) - -####################################################################################################################### -# CRSP Block # -####################################################################################################################### -# Create a CRSP Subsample with Monthly Stock and Event Variables -# Restrictions will be applied later -# Select variables from the CRSP monthly stock and event datasets -crsp = conn.raw_sql(""" - select a.prc, a.ret, a.retx, a.shrout, a.vol, a.cfacpr, a.cfacshr, a.date, a.permno, a.permco, - b.ticker, b.ncusip, b.shrcd, b.exchcd - from crsp.msf as a - left join crsp.msenames as b - on a.permno=b.permno - and b.namedt<=a.date - and a.date<=b.nameendt - where a.date >= '01/01/1959' - and b.exchcd between 1 and 3 - """) - -# change variable format to int -crsp[['permco', 'permno', 'shrcd', 'exchcd']] = crsp[['permco', 'permno', 'shrcd', 'exchcd']].astype(int) - -# Line up date to be end of month -crsp['date'] = 
pd.to_datetime(crsp['date']) -crsp['monthend'] = crsp['date'] + MonthEnd(0) # set all the date to the standard end date of month - -crsp = crsp.dropna(subset=['prc']) -crsp['me'] = crsp['prc'].abs() * crsp['shrout'] # calculate market equity - -# if Market Equity is Nan then let return equals to 0 -crsp['ret'] = np.where(crsp['me'].isnull(), 0, crsp['ret']) -crsp['retx'] = np.where(crsp['me'].isnull(), 0, crsp['retx']) - -# impute me -crsp = crsp.sort_values(by=['permno', 'date']).drop_duplicates() -crsp['me'] = np.where(crsp['permno'] == crsp['permno'].shift(1), crsp['me'].fillna(method='ffill'), crsp['me']) - -# Aggregate Market Cap -''' -There are cases when the same firm (permco) has two or more securities (permno) at same date. -For the purpose of ME for the firm, we aggregated all ME for a given permco, date. -This aggregated ME will be assigned to the permno with the largest ME. -''' -# sum of me across different permno belonging to same permco a given date -crsp_summe = crsp.groupby(['monthend', 'permco'])['me'].sum().reset_index() -# largest mktcap within a permco/date -crsp_maxme = crsp.groupby(['monthend', 'permco'])['me'].max().reset_index() -# join by monthend/maxme to find the permno -crsp1 = pd.merge(crsp, crsp_maxme, how='inner', on=['monthend', 'permco', 'me']) -# drop me column and replace with the sum me -crsp1 = crsp1.drop(['me'], axis=1) -# join with sum of me to get the correct market cap info -crsp2 = pd.merge(crsp1, crsp_summe, how='inner', on=['monthend', 'permco']) -# sort by permno and date and also drop duplicates -crsp2 = crsp2.sort_values(by=['permno', 'monthend']).drop_duplicates() - -####################################################################################################################### -# CCM Block # -####################################################################################################################### -# merge CRSP and Compustat -# reference: 
https://wrds-www.wharton.upenn.edu/pages/support/applications/linking-databases/linking-crsp-and-compustat/ -ccm = conn.raw_sql(""" - select gvkey, lpermno as permno, linktype, linkprim, - linkdt, linkenddt - from crsp.ccmxpf_linktable - where substr(linktype,1,1)='L' - and (linkprim ='C' or linkprim='P') - """) - -ccm['linkdt'] = pd.to_datetime(ccm['linkdt']) -ccm['linkenddt'] = pd.to_datetime(ccm['linkenddt']) - -# if linkenddt is missing then set to today date -ccm['linkenddt'] = ccm['linkenddt'].fillna(pd.to_datetime('today')) - -# merge ccm and comp -ccm1 = pd.merge(comp, ccm, how='left', on=['gvkey']) - -# we can only get the accounting data after the firm public their report -# for annual data, we use 4, 5 or 6 months lagged data -ccm1['yearend'] = ccm1['datadate'] + YearEnd(0) -ccm1['jdate'] = ccm1['datadate'] + MonthEnd(4) - -# set link date bounds -ccm2 = ccm1[(ccm1['jdate'] >= ccm1['linkdt']) & (ccm1['jdate'] <= ccm1['linkenddt'])] - -# link comp and crsp -crsp2 = crsp2.rename(columns={'monthend': 'jdate'}) -data_rawa = pd.merge(crsp2, ccm2, how='inner', on=['permno', 'jdate']) - -# filter exchcd & shrcd -data_rawa = data_rawa[((data_rawa['exchcd'] == 1) | (data_rawa['exchcd'] == 2) | (data_rawa['exchcd'] == 3)) & - ((data_rawa['shrcd'] == 10) | (data_rawa['shrcd'] == 11))] - -# process Market Equity -''' -Note: me is CRSP market equity, mve_f is Compustat market equity. Please choose the me below. 
-''' -data_rawa['me'] = data_rawa['me']/1000 # CRSP ME -# data_rawa['me'] = data_rawa['mve_f'] # Compustat ME - -# there are some ME equal to zero since this company do not have price or shares data, we drop these observations -data_rawa['me'] = np.where(data_rawa['me'] == 0, np.nan, data_rawa['me']) -data_rawa = data_rawa.dropna(subset=['me']) - -# count single stock years -# data_rawa['count'] = data_rawa.groupby(['gvkey']).cumcount() - -# deal with the duplicates -data_rawa.loc[data_rawa.groupby(['datadate', 'permno', 'linkprim'], as_index=False).nth([0]).index, 'temp'] = 1 -data_rawa = data_rawa[data_rawa['temp'].notna()] -data_rawa.loc[data_rawa.groupby(['permno', 'yearend', 'datadate'], as_index=False).nth([-1]).index, 'temp'] = 1 -data_rawa = data_rawa[data_rawa['temp'].notna()] - -data_rawa = data_rawa.sort_values(by=['permno', 'jdate']) - -####################################################################################################################### -# Annual Variables # -####################################################################################################################### -# preferrerd stock -data_rawa['ps'] = np.where(data_rawa['pstkrv'].isnull(), data_rawa['pstkl'], data_rawa['pstkrv']) -data_rawa['ps'] = np.where(data_rawa['ps'].isnull(), data_rawa['pstk'], data_rawa['ps']) -data_rawa['ps'] = np.where(data_rawa['ps'].isnull(), 0, data_rawa['ps']) - -data_rawa['txditc'] = data_rawa['txditc'].fillna(0) - -# book equity -data_rawa['be'] = data_rawa['seq'] + data_rawa['txditc'] - data_rawa['ps'] -data_rawa['be'] = np.where(data_rawa['be'] > 0, data_rawa['be'], np.nan) - -# acc -data_rawa['act_l1'] = data_rawa.groupby(['permno'])['act'].shift(1) -data_rawa['lct_l1'] = data_rawa.groupby(['permno'])['lct'].shift(1) - -condlist = [data_rawa['np'].isnull(), - data_rawa['act'].isnull() | data_rawa['lct'].isnull()] -choicelist = [((data_rawa['act']-data_rawa['lct'])-(data_rawa['act_l1']-data_rawa['lct_l1'])/(10*data_rawa['be'])), - 
(data_rawa['ib']-data_rawa['oancf'])/(10*data_rawa['be'])] -data_rawa['acc'] = np.select(condlist, - choicelist, - default=((data_rawa['act']-data_rawa['lct']+data_rawa['np'])- - (data_rawa['act_l1']-data_rawa['lct_l1']+data_rawa['np'].shift(1)))/(10*data_rawa['be'])) - -# agr -data_rawa['at_l1'] = data_rawa.groupby(['permno'])['at'].shift(1) -data_rawa['agr'] = (data_rawa['at']-data_rawa['at_l1'])/data_rawa['at_l1'] - -# bm -# data_rawa['bm'] = data_rawa['be'] / data_rawa['me'] - -# cfp -# condlist = [data_rawa['dp'].isnull(), -# data_rawa['ib'].isnull()] -# choicelist = [data_rawa['ib']/data_rawa['me'], -# np.nan] -# data_rawa['cfp'] = np.select(condlist, choicelist, default=(data_rawa['ib']+data_rawa['dp'])/data_rawa['me']) - -# ep -# data_rawa['ep'] = data_rawa['ib']/data_rawa['me'] - -# ni -data_rawa['csho_l1'] = data_rawa.groupby(['permno'])['csho'].shift(1) -data_rawa['ajex_l1'] = data_rawa.groupby(['permno'])['ajex'].shift(1) -data_rawa['ni'] = np.where(data_rawa['gvkey'] != data_rawa['gvkey'].shift(1), - np.nan, - np.log(data_rawa['csho']*data_rawa['ajex']).replace(-np.inf, 0)- - np.log(data_rawa['csho_l1']*data_rawa['ajex_l1']).replace(-np.inf, 0)) - -# op -data_rawa['cogs0'] = np.where(data_rawa['cogs'].isnull(), 0, data_rawa['cogs']) -data_rawa['xint0'] = np.where(data_rawa['xint'].isnull(), 0, data_rawa['xint']) -data_rawa['xsga0'] = np.where(data_rawa['xsga'].isnull(), 0, data_rawa['xsga']) - -condlist = [data_rawa['revt'].isnull(), data_rawa['be'].isnull()] -choicelist = [np.nan, np.nan] -data_rawa['op'] = np.select(condlist, choicelist, - default=(data_rawa['revt'] - data_rawa['cogs0'] - data_rawa['xsga0'] - data_rawa['xint0'])/data_rawa['be']) - -# rsup -data_rawa['sale_l1'] = data_rawa.groupby(['permno'])['sale'].shift(1) -# data_rawa['rsup'] = (data_rawa['sale']-data_rawa['sale_l1'])/data_rawa['me'] - -# cash -data_rawa['cash'] = data_rawa['che']/data_rawa['at'] - -# lev -# data_rawa['lev'] = data_rawa['lt']/data_rawa['me'] - -# sp -# 
data_rawa['sp'] = data_rawa['sale']/data_rawa['me'] - -# rd_sale -data_rawa['rd_sale'] = data_rawa['xrd']/data_rawa['sale'] - -# rdm -# data_rawa['rdm'] = data_rawa['xrd']/data_rawa['me'] - -# adm hxz adm -# data_rawa['adm'] = data_rawa['xad']/data_rawa['me'] - -# gma -data_rawa['gma'] = (data_rawa['revt']-data_rawa['cogs'])/data_rawa['at_l1'] - -# chcsho -data_rawa['chcsho'] = (data_rawa['csho']/data_rawa['csho_l1'])-1 - -# lgr -data_rawa['lt_l1'] = data_rawa.groupby(['permno'])['lt'].shift(1) -data_rawa['lgr'] = (data_rawa['lt']/data_rawa['lt_l1'])-1 - -# pctacc -data_rawa['che_l1'] = data_rawa.groupby(['permno'])['che'].shift(1) -data_rawa['dlc_l1'] = data_rawa.groupby(['permno'])['dlc'].shift(1) -data_rawa['txp_l1'] = data_rawa.groupby(['permno'])['txp'].shift(1) - -condlist = [data_rawa['ib']==0, - data_rawa['oancf'].isnull(), - data_rawa['oancf'].isnull() & data_rawa['ib']==0] -choicelist = [(data_rawa['ib']-data_rawa['oancf'])/0.01, - ((data_rawa['act'] - data_rawa['act_l1']) - (data_rawa['che'] - data_rawa['che_l1']))- - ((data_rawa['lct'] - data_rawa['lct_l1']) - (data_rawa['dlc']) - data_rawa['dlc_l1']- - ((data_rawa['txp'] - data_rawa['txp_l1']) - data_rawa['dp']))/data_rawa['ib'].abs(), - ((data_rawa['act'] - data_rawa['act_l1']) - (data_rawa['che'] - data_rawa['che_l1'])) - - ((data_rawa['lct'] - data_rawa['lct_l1']) - (data_rawa['dlc']) - data_rawa['dlc_l1'] - - ((data_rawa['txp'] - data_rawa['txp_l1']) - data_rawa['dp']))] -data_rawa['pctacc'] = np.select(condlist, choicelist, default=(data_rawa['ib']-data_rawa['oancf'])/data_rawa['ib'].abs()) - -# sgr -data_rawa['sgr'] = (data_rawa['sale']/data_rawa['sale_l1'])-1 - -# chato -data_rawa['at_l2'] = data_rawa.groupby(['permno'])['at'].shift(2) -data_rawa['chato'] = (data_rawa['sale']/((data_rawa['at']+data_rawa['at_l1'])/2))-\ - (data_rawa['sale_l1']/((data_rawa['at']+data_rawa['at_l2'])/2)) - -# chtx -data_rawa['txt_l1'] = data_rawa.groupby(['permno'])['txt'].shift(1) -data_rawa['chtx'] = 
(data_rawa['txt']-data_rawa['txt_l1'])/data_rawa['at_l1'] - -# noa -data_rawa['noa'] = ((data_rawa['at']-data_rawa['che']-data_rawa['ivao'].fillna(0))- - (data_rawa['at']-data_rawa['dlc'].fillna(0)-data_rawa['dltt'].fillna(0)-data_rawa['mib'].fillna(0) - -data_rawa['pstk'].fillna(0)-data_rawa['ceq'])/data_rawa['at_l1']) - -# rna -data_rawa['noa_l1'] = data_rawa.groupby(['permno'])['noa'].shift(1) -data_rawa['rna'] = data_rawa['oiadp']/data_rawa['noa_l1'] - -# pm -data_rawa['pm'] = data_rawa['oiadp']/data_rawa['sale'] - -# ato -data_rawa['ato'] = data_rawa['sale']/data_rawa['noa_l1'] - -# depr -data_rawa['depr'] = data_rawa['dp']/data_rawa['ppent'] - -# invest -data_rawa['ppent_l1'] = data_rawa.groupby(['permno'])['ppent'].shift(1) -data_rawa['invt_l1'] = data_rawa.groupby(['permno'])['invt'].shift(1) - -data_rawa['invest'] = np.where(data_rawa['ppegt'].isnull(), ((data_rawa['ppent']-data_rawa['ppent_l1'])+ - (data_rawa['invt']-data_rawa['invt_l1']))/data_rawa['at_l1'], - ((data_rawa['ppegt']-data_rawa['ppent_l1'])+(data_rawa['invt']-data_rawa['invt_l1']))/data_rawa['at_l1']) - -# egr -data_rawa['ceq_l1'] = data_rawa.groupby(['permno'])['ceq'].shift(1) -data_rawa['egr'] = ((data_rawa['ceq']-data_rawa['ceq_l1'])/data_rawa['ceq_l1']) - -# cashdebt -data_rawa['cashdebt'] = (data_rawa['ib']+data_rawa['dp'])/((data_rawa['lt']+data_rawa['lt_l1'])/2) - -# rd -# if ((xrd/at)-(lag(xrd/lag(at))))/(lag(xrd/lag(at))) >.05 then rd=1 else rd=0 -data_rawa['xrd/at_l1'] = data_rawa['xrd']/data_rawa['at_l1'] -data_rawa['xrd/at_l1_l1'] = data_rawa.groupby(['permno'])['xrd/at_l1'].shift(1) -data_rawa['rd'] = np.where(((data_rawa['xrd']/data_rawa['at'])- - (data_rawa['xrd/at_l1_l1']))/data_rawa['xrd/at_l1_l1']>0.05, 1, 0) - -# roa -data_rawa['roa'] = data_rawa['ni']/((data_rawa['at']+data_rawa['at_l1'])/2) - -# roe -data_rawa['roe'] = data_rawa['ib']/data_rawa['ceq_l1'] - -# dy -# data_rawa['dy'] = data_rawa['dvt']/data_rawa['me'] - -################## Added on 2020.07.28 
################## - -# roic -data_rawa['roic'] = (data_rawa['ebit'] - data_rawa['nopi'])/(data_rawa['ceq'] + data_rawa['lt'] - data_rawa['che']) - -# chinv -data_rawa['chinv'] = (data_rawa['invt'] - data_rawa['invt_l1'])/((data_rawa['at'] + data_rawa['at_l2'])/2) - -# pchsale_pchinvt -data_rawa['pchsale_pchinvt'] = ((data_rawa['sale'] - data_rawa['sale_l1'])/data_rawa['sale_l1'])\ - - ((data_rawa['invt']-data_rawa['invt_l1'])/data_rawa['invt_l1']) - -# pchsale_pchrect -data_rawa['rect_l1'] = data_rawa.groupby(['permno'])['rect'].shift(1) -data_rawa['pchsale_pchrect'] = ((data_rawa['sale']-data_rawa['sale_l1'])/data_rawa['sale_l1'])\ - - ((data_rawa['rect']-data_rawa['rect_l1'])/data_rawa['rect_l1']) - -# pchgm_pchsale -data_rawa['cogs_l1'] = data_rawa.groupby(['permno'])['cogs'].shift(1) -data_rawa['pchgm_pchsale'] = (((data_rawa['sale']-data_rawa['cogs']) - - (data_rawa['sale_l1']-data_rawa['cogs_l1']))/(data_rawa['sale_l1']-data_rawa['cogs_l1']))\ - - ((data_rawa['sale']-data_rawa['sale_l1'])/data_rawa['sale']) - -# pchsale_pchxsga -data_rawa['xsga_l1'] = data_rawa.groupby(['permno'])['xsga'].shift(1) -data_rawa['pchsale_pchxsga'] = ((data_rawa['sale']-data_rawa['sale_l1'])/data_rawa['sale_l1'])\ - - ((data_rawa['xsga']-data_rawa['xsga_l1'])/data_rawa['xsga_l1']) - -# pchdepr -data_rawa['dp_l1'] = data_rawa.groupby(['permno'])['dp'].shift(1) -data_rawa['pchdepr'] = ((data_rawa['dp']/data_rawa['ppent'])-(data_rawa['dp_l1'] - /data_rawa['ppent_l1']))\ - / (data_rawa['dp_l1']/data_rawa['ppent']) - -# chadv -data_rawa['xad_l1'] = data_rawa.groupby(['permno'])['xad'].shift(1) -data_rawa['chadv'] = np.log(data_rawa['xad'] + 1) - np.log(data_rawa['xad_l1'] + 1) - -# pchcapx -data_rawa['capx_l1'] = data_rawa.groupby(['permno'])['capx'].shift(1) -data_rawa['pchcapx'] = (data_rawa['capx']-data_rawa['capx_l1'])/data_rawa['capx_l1'] - -# grcapx -data_rawa['capx_l2'] = data_rawa.groupby(['permno'])['capx'].shift(2) -data_rawa['grcapx'] = 
(data_rawa['capx']-data_rawa['capx_l2'])/data_rawa['capx_l2'] - -# grGW -data_rawa['gdwl_l1'] = data_rawa.groupby(['permno'])['gdwl'].shift(1) -data_rawa['grGW'] = (data_rawa['gdwl']-data_rawa['gdwl_l1'])/data_rawa['gdwl'] -condlist = [(data_rawa['gdwl']==0) | (data_rawa['gdwl'].isnull()), - (data_rawa['gdwl'].notna()) & (data_rawa['gdwl'] != 0) & (data_rawa['grGW'].isnull())] -choicelist = [0, 1] -data_rawa['grGW'] = np.select(condlist, choicelist, default=data_rawa['grGW']) - -# currat -data_rawa['currat'] = data_rawa['act']/data_rawa['lct'] - -# pchcurrat -data_rawa['pchcurrat'] = ((data_rawa['act']/data_rawa['lct'])-(data_rawa['act_l1']/data_rawa['lct_l1']))\ - /(data_rawa['act_l1']/data_rawa['lct_l1']) - -# quick -data_rawa['quick'] = (data_rawa['act']-data_rawa['invt'])/data_rawa['lct'] - -# pchquick -data_rawa['pchquick'] = ((data_rawa['act']-data_rawa['invt'])/data_rawa['lct'] - -(data_rawa['act_l1']-data_rawa['invt_l1'])/data_rawa['lct_l1'])\ - /((data_rawa['act_l1']-data_rawa['invt_l1'])/data_rawa['lct_l1']) - -# salecash -data_rawa['salecash'] = data_rawa['sale']/data_rawa['che'] - -# salerec -data_rawa['salerec']= data_rawa['sale']/data_rawa['rect'] - -# saleinv -data_rawa['saleinv'] = data_rawa['sale']/data_rawa['invt'] - -# pchsaleinv -data_rawa['pchsaleinv'] = ((data_rawa['sale']/data_rawa['invt'])-(data_rawa['sale_l1']/data_rawa['invt_l1']))\ - /(data_rawa['sale_l1']/data_rawa['invt_l1']) - -# realestate -data_rawa['realestate'] = (data_rawa['fatb']+data_rawa['fatl'])/data_rawa['ppegt'] -data_rawa['realestate'] = np.where(data_rawa['ppegt'].isnull(), - (data_rawa['fatb']+data_rawa['fatl'])/data_rawa['ppent'], data_rawa['realestate']) - -# obklg -data_rawa['obklg'] = data_rawa['ob']/((data_rawa['at']+data_rawa['at_l1'])/2) - -# chobklg -data_rawa['ob_l1'] = data_rawa.groupby(['permno'])['ob'].shift(1) -data_rawa['chobklg'] = (data_rawa['ob'] - data_rawa['ob_l1'])/((data_rawa['at']+data_rawa['at_l1'])/2) - -# grltnoa -data_rawa['aco_l1'] = 
data_rawa.groupby(['permno'])['aco'].shift(1) -data_rawa['intan_l1'] = data_rawa.groupby(['permno'])['intan'].shift(1) -data_rawa['ao_l1'] = data_rawa.groupby(['permno'])['ao'].shift(1) -data_rawa['ap_l1'] = data_rawa.groupby(['permno'])['ap'].shift(1) -data_rawa['lco_l1'] = data_rawa.groupby(['permno'])['lco'].shift(1) -data_rawa['lo_l1'] = data_rawa.groupby(['permno'])['lo'].shift(1) -data_rawa['rect_l1'] = data_rawa.groupby(['permno'])['rect'].shift(1) - -data_rawa['grltnoa'] = ((data_rawa['rect']+data_rawa['invt']+data_rawa['ppent']+data_rawa['aco']+data_rawa['intan']+ - data_rawa['ao']-data_rawa['ap']-data_rawa['lco']-data_rawa['lo']) - -(data_rawa['rect_l1']+data_rawa['invt_l1']+data_rawa['ppent_l1']+data_rawa['aco_l1'] - +data_rawa['intan_l1']+data_rawa['ao_l1']-data_rawa['ap_l1']-data_rawa['lco_l1'] - -data_rawa['lo_l1']) - -(data_rawa['rect']-data_rawa['rect_l1']+data_rawa['invt']-data_rawa['invt_l1'] - +data_rawa['aco']-data_rawa['aco_l1'] - -(data_rawa['ap']-data_rawa['ap_l1']+data_rawa['lco']-data_rawa['lco_l1'])-data_rawa['dp']))\ - /((data_rawa['at']+data_rawa['at_l1'])/2) - -# conv -data_rawa['conv'] = data_rawa['dc']/data_rawa['dltt'] - -# chdrc -data_rawa['dr_l1'] = data_rawa.groupby(['permno'])['dr'].shift(1) -data_rawa['chdrc'] = (data_rawa['dr']-data_rawa['dr_l1'])/((data_rawa['at']+data_rawa['at_l1'])/2) - -# rdbias -data_rawa['xrd_l1'] = data_rawa.groupby(['permno'])['xrd'].shift(1) -data_rawa['rdbias'] = (data_rawa['xrd']/data_rawa['xrd_l1'])-1-data_rawa['ib']/data_rawa['ceq_l1'] - -# operprof -data_rawa['operprof'] = (data_rawa['revt']-data_rawa['cogs']-data_rawa['xsga0']-data_rawa['xint0'])/data_rawa['ceq_l1'] - -# cfroa -data_rawa['cfroa'] = data_rawa['oancf']/((data_rawa['at']+data_rawa['at_l1'])/2) -data_rawa['cfroa'] = np.where(data_rawa['oancf'].isnull(), - (data_rawa['ib'] + data_rawa['dp'])/((data_rawa['at']+data_rawa['at_l1'])/2), - data_rawa['cfroa']) - -# xrdint -data_rawa['xrdint'] = 
data_rawa['xrd']/((data_rawa['at']+data_rawa['at_l1'])/2) - -# capxint -data_rawa['capxint'] = data_rawa['capx']/((data_rawa['at']+data_rawa['at_l1'])/2) - -# xadint -data_rawa['xadint'] = data_rawa['xad']/((data_rawa['at']+data_rawa['at_l1'])/2) - -# chpm -data_rawa['ib_l1'] = data_rawa.groupby(['permno'])['ib'].shift(1) -data_rawa['chpm'] = (data_rawa['ib']/data_rawa['sale'])-(data_rawa['ib_l1']/data_rawa['sale_l1']) - -# ala -data_rawa['gdwl'] = np.where(data_rawa['gdwl'].isnull(), 0, data_rawa['gdwl']) -data_rawa['intan'] = np.where(data_rawa['intan'].isnull(), 0, data_rawa['intan']) -data_rawa['ala'] = data_rawa['che']+0.75*(data_rawa['act']-data_rawa['che'])-\ - 0.5*(data_rawa['at']-data_rawa['act']-data_rawa['gdwl']-data_rawa['intan']) - -# alm -data_rawa['alm'] = data_rawa['ala']/(data_rawa['at']+data_rawa['prcc_f']*data_rawa['csho']-data_rawa['ceq']) - -# hire -data_rawa['emp_l1'] = data_rawa.groupby(['permno'])['emp'].shift(1) -data_rawa['hire'] = (data_rawa['emp'] - data_rawa['emp_l1'])/data_rawa['emp_l1'] -data_rawa['hire'] = np.where((data_rawa['emp'].isnull()) | (data_rawa['emp_l1'].isnull()), 0, data_rawa['hire']) - -# herf -data_rawa['sic'] = data_rawa['sic'].astype(int) -data_rawa['ffi49'] = ffi49(data_rawa) -data_rawa['ffi49'] = data_rawa['ffi49'].fillna(49) -data_rawa['ffi49'] = data_rawa['ffi49'].astype(int) -df_temp = data_rawa.groupby(['datadate', 'ffi49'], as_index=False)['sale'].sum() -df_temp = df_temp.rename(columns={'sale': 'indsale'}) -data_rawa = pd.merge(data_rawa, df_temp, how='left', on=['datadate', 'ffi49']) -data_rawa['herf'] = (data_rawa['sale']/data_rawa['indsale'])*(data_rawa['sale']/data_rawa['indsale']) -df_temp = data_rawa.groupby(['datadate', 'ffi49'], as_index=False)['herf'].sum() -data_rawa = data_rawa.drop(['herf'], axis=1) -data_rawa = pd.merge(data_rawa, df_temp, how='left', on=['datadate', 'ffi49']) - 
-####################################################################################################################### -# Compustat Quarterly Raw Info # -####################################################################################################################### -comp = conn.raw_sql(""" - /*header info*/ - select c.gvkey, f.cusip, f.datadate, f.fyearq, substr(c.sic,1,2) as sic2, c.sic, f.fqtr, f.rdq, - - /*income statement*/ - f.ibq, f.saleq, f.txtq, f.revtq, f.cogsq, f.xsgaq, f.revty, f.cogsy, f.saley, - - /*balance sheet items*/ - f.atq, f.actq, f.cheq, f.lctq, f.dlcq, f.ppentq, f.ppegtq, - - /*others*/ - abs(f.prccq) as prccq, abs(f.prccq)*f.cshoq as mveq_f, f.ceqq, f.seqq, f.pstkq, f.ltq, - f.pstkrq, f.gdwlq, f.intanq, f.mibq, f.oiadpq, f.ivaoq, - - /* v3 my formula add*/ - f.ajexq, f.cshoq, f.txditcq, f.npq, f.xrdy, f.xrdq, f.dpq, f.xintq, f.invtq, f.scstkcy, f.niq, - f.oancfy, f.dlttq, f.rectq, f.acoq, f.apq, f.lcoq, f.loq, f.aoq - - from comp.fundq as f - left join comp.company as c - on f.gvkey = c.gvkey - - /*get consolidated, standardized, industrial format statements*/ - where f.indfmt = 'INDL' - and f.datafmt = 'STD' - and f.popsrc = 'D' - and f.consol = 'C' - and f.datadate >= '01/01/1959' - """) - -# comp['cusip6'] = comp['cusip'].str.strip().str[0:6] -comp = comp.dropna(subset=['ibq']) - -# sort and clean up -comp = comp.sort_values(by=['gvkey', 'datadate']).drop_duplicates() -comp['cshoq'] = np.where(comp['cshoq'] == 0, np.nan, comp['cshoq']) -comp['ceqq'] = np.where(comp['ceqq'] == 0, np.nan, comp['ceqq']) -comp['atq'] = np.where(comp['atq'] == 0, np.nan, comp['atq']) -comp = comp.dropna(subset=['atq']) - -# convert datadate to date fmt -comp['datadate'] = pd.to_datetime(comp['datadate']) - -# merge ccm and comp -ccm1 = pd.merge(comp, ccm, how='left', on=['gvkey']) -ccm1['yearend'] = ccm1['datadate'] + YearEnd(0) -ccm1['jdate'] = ccm1['datadate'] + MonthEnd(3) # we change quarterly lag here -# ccm1['jdate'] = 
ccm1['datadate']+MonthEnd(4) - -# set link date bounds -ccm2 = ccm1[(ccm1['jdate'] >= ccm1['linkdt']) & (ccm1['jdate'] <= ccm1['linkenddt'])] - -# merge ccm2 and crsp2 -# crsp2['jdate'] = crsp2['monthend'] -data_rawq = pd.merge(crsp2, ccm2, how='inner', on=['permno', 'jdate']) - -# filter exchcd & shrcd -data_rawq = data_rawq[((data_rawq['exchcd'] == 1) | (data_rawq['exchcd'] == 2) | (data_rawq['exchcd'] == 3)) & - ((data_rawq['shrcd'] == 10) | (data_rawq['shrcd'] == 11))] - -# process Market Equity -''' -Note: me is CRSP market equity, mveq_f is Compustat market equity. Please choose the me below. -''' -data_rawq['me'] = data_rawq['me']/1000 # CRSP ME -# data_rawq['me'] = data_rawq['mveq_f'] # Compustat ME - -# there are some ME equal to zero since this company do not have price or shares data, we drop these observations -data_rawq['me'] = np.where(data_rawq['me'] == 0, np.nan, data_rawq['me']) -data_rawq = data_rawq.dropna(subset=['me']) - -# count single stock years -# data_rawq['count'] = data_rawq.groupby(['gvkey']).cumcount() - -# deal with the duplicates -data_rawq.loc[data_rawq.groupby(['datadate', 'permno', 'linkprim'], as_index=False).nth([0]).index, 'temp'] = 1 -data_rawq = data_rawq[data_rawq['temp'].notna()] -data_rawq.loc[data_rawq.groupby(['permno', 'yearend', 'datadate'], as_index=False).nth([-1]).index, 'temp'] = 1 -data_rawq = data_rawq[data_rawq['temp'].notna()] - -data_rawq = data_rawq.sort_values(by=['permno', 'jdate']) - -####################################################################################################################### -# Quarterly Variables # -####################################################################################################################### -# prepare be -data_rawq['beq'] = np.where(data_rawq['seqq']>0, data_rawq['seqq']+data_rawq['txditcq']-data_rawq['pstkq'], np.nan) -data_rawq['beq'] = np.where(data_rawq['beq']<=0, np.nan, data_rawq['beq']) - -# dy -# data_rawq['me_l1'] = 
data_rawq.groupby(['permno'])['me'].shift(1) -# data_rawq['retdy'] = data_rawq['ret'] - data_rawq['retx'] -# data_rawq['mdivpay'] = data_rawq['retdy']*data_rawq['me_l1'] -# -# data_rawq['dy'] = ttm12(series='mdivpay', df=data_rawq)/data_rawq['me'] - -# chtx -data_rawq['txtq_l4'] = data_rawq.groupby(['permno'])['txtq'].shift(4) -data_rawq['atq_l4'] = data_rawq.groupby(['permno'])['atq'].shift(4) -data_rawq['chtx'] = (data_rawq['txtq']-data_rawq['txtq_l4'])/data_rawq['atq_l4'] - -# roa -data_rawq['atq_l1'] = data_rawq.groupby(['permno'])['atq'].shift(1) -data_rawq['roa'] = data_rawq['ibq']/data_rawq['atq_l1'] - -# cash -data_rawq['cash'] = data_rawq['cheq']/data_rawq['atq'] - -# acc -data_rawq['actq_l4'] = data_rawq.groupby(['permno'])['actq'].shift(4) -data_rawq['lctq_l4'] = data_rawq.groupby(['permno'])['lctq'].shift(4) -data_rawq['npq_l4'] = data_rawq.groupby(['permno'])['npq'].shift(4) -condlist = [data_rawq['npq'].isnull(), - data_rawq['actq'].isnull() | data_rawq['lctq'].isnull()] -choicelist = [((data_rawq['actq']-data_rawq['lctq'])-(data_rawq['actq_l4']-data_rawq['lctq_l4']))/(10*data_rawq['beq']), - np.nan] -data_rawq['acc'] = np.select(condlist, choicelist, - default=((data_rawq['actq']-data_rawq['lctq']+data_rawq['npq'])- - (data_rawq['actq_l4']-data_rawq['lctq_l4']+data_rawq['npq_l4']))/(10*data_rawq['beq'])) - -# bm -# data_rawq['bm'] = data_rawq['beq']/data_rawq['me'] - -# cfp -data_rawq['ibq4'] = ttm4('ibq', data_rawq) -data_rawq['dpq4'] = ttm4('dpq', data_rawq) -# data_rawq['cfp'] = np.where(data_rawq['dpq'].isnull(), -# data_rawq['ibq4']/data_rawq['me'], -# (data_rawq['ibq4']+data_rawq['dpq4'])/data_rawq['me']) - -# ep -# data_rawq['ep'] = data_rawq['ibq4']/data_rawq['me'] - -# agr -data_rawq['agr'] = (data_rawq['atq']-data_rawq['atq_l4'])/data_rawq['atq_l4'] - -# ni -data_rawq['cshoq_l4'] = data_rawq.groupby(['permno'])['cshoq'].shift(4) -data_rawq['ajexq_l4'] = data_rawq.groupby(['permno'])['ajexq'].shift(4) -data_rawq['ni'] = 
np.where(data_rawq['cshoq'].isnull(), np.nan, - np.log(data_rawq['cshoq']*data_rawq['ajexq']).replace(-np.inf, 0)-np.log(data_rawq['cshoq_l4']*data_rawq['ajexq_l4'])) - -# op -data_rawq['xintq0'] = np.where(data_rawq['xintq'].isnull(), 0, data_rawq['xintq']) -data_rawq['xsgaq0'] = np.where(data_rawq['xsgaq'].isnull(), 0, data_rawq['xsgaq']) -data_rawq['beq_l4'] = data_rawq.groupby(['permno'])['beq'].shift(4) - -data_rawq['op'] = (ttm4('revtq', data_rawq)-ttm4('cogsq', data_rawq)-ttm4('xsgaq0', data_rawq)-ttm4('xintq0', data_rawq))/data_rawq['beq_l4'] - -# chcsho -data_rawq['chcsho'] = (data_rawq['cshoq']/data_rawq['cshoq_l4'])-1 - -# cashdebt -data_rawq['ltq_l4'] = data_rawq.groupby(['permno'])['ltq'].shift(4) -data_rawq['cashdebt'] = (ttm4('ibq', data_rawq) + ttm4('dpq', data_rawq))/((data_rawq['ltq']+data_rawq['ltq_l4'])/2) - -# rd -data_rawq['xrdq4'] = ttm4('xrdq', data_rawq) -data_rawq['xrdq4'] = np.where(data_rawq['xrdq4'].isnull(), data_rawq['xrdy'], data_rawq['xrdq4']) - -data_rawq['xrdq4/atq_l4'] = data_rawq['xrdq4']/data_rawq['atq_l4'] -data_rawq['xrdq4/atq_l4_l4'] = data_rawq.groupby(['permno'])['xrdq4/atq_l4'].shift(4) -data_rawq['rd'] = np.where(((data_rawq['xrdq4']/data_rawq['atq'])-data_rawq['xrdq4/atq_l4_l4'])/data_rawq['xrdq4/atq_l4_l4']>0.05, 1, 0) - -# pctacc -condlist = [data_rawq['npq'].isnull(), - data_rawq['actq'].isnull() | data_rawq['lctq'].isnull()] -choicelist = [((data_rawq['actq']-data_rawq['lctq'])-(data_rawq['actq_l4']-data_rawq['lctq_l4']))/abs(ttm4('ibq', data_rawq)), np.nan] -data_rawq['pctacc'] = np.select(condlist, choicelist, - default=((data_rawq['actq']-data_rawq['lctq']+data_rawq['npq'])-(data_rawq['actq_l4']-data_rawq['lctq_l4']+data_rawq['npq_l4']))/ - abs(ttm4('ibq', data_rawq))) - -# gma -data_rawq['revtq4'] = ttm4('revtq', data_rawq) -data_rawq['cogsq4'] = ttm4('cogsq', data_rawq) -data_rawq['gma'] = (data_rawq['revtq4']-data_rawq['cogsq4'])/data_rawq['atq_l4'] - -# lev -# data_rawq['lev'] = 
data_rawq['ltq']/data_rawq['me'] - -# rdm -# data_rawq['rdm'] = data_rawq['xrdq4']/data_rawq['me'] - -# sgr -data_rawq['saleq4'] = ttm4('saleq', data_rawq) -data_rawq['saleq4'] = np.where(data_rawq['saleq4'].isnull(), data_rawq['saley'], data_rawq['saleq4']) - -data_rawq['saleq4_l4'] = data_rawq.groupby(['permno'])['saleq4'].shift(4) -data_rawq['sgr'] = (data_rawq['saleq4']/data_rawq['saleq4_l4'])-1 - -# sp -# data_rawq['sp'] = data_rawq['saleq4']/data_rawq['me'] - -# invest -data_rawq['ppentq_l4'] = data_rawq.groupby(['permno'])['ppentq'].shift(4) -data_rawq['invtq_l4'] = data_rawq.groupby(['permno'])['invtq'].shift(4) -data_rawq['ppegtq_l4'] = data_rawq.groupby(['permno'])['ppegtq'].shift(4) - -data_rawq['invest'] = np.where(data_rawq['ppegtq'].isnull(), ((data_rawq['ppentq']-data_rawq['ppentq_l4'])+ - (data_rawq['invtq']-data_rawq['invtq_l4']))/data_rawq['atq_l4'], - ((data_rawq['ppegtq']-data_rawq['ppegtq_l4'])+(data_rawq['invtq']-data_rawq['invtq_l4']))/data_rawq['atq_l4']) - -# rd_sale -data_rawq['rd_sale'] = data_rawq['xrdq4']/data_rawq['saleq4'] - -# lgr -data_rawq['lgr'] = (data_rawq['ltq']/data_rawq['ltq_l4'])-1 - -# depr -data_rawq['depr'] = ttm4('dpq', data_rawq)/data_rawq['ppentq'] - -# egr -data_rawq['ceqq_l4'] = data_rawq.groupby(['permno'])['ceqq'].shift(4) -data_rawq['egr'] = (data_rawq['ceqq']-data_rawq['ceqq_l4'])/data_rawq['ceqq_l4'] - -# chpm -data_rawq['ibq4_l1'] = data_rawq.groupby(['permno'])['ibq4'].shift(1) -data_rawq['saleq4_l1'] = data_rawq.groupby(['permno'])['saleq4'].shift(1) - -data_rawq['chpm'] = (data_rawq['ibq4']/data_rawq['saleq4'])-(data_rawq['ibq4_l1']/data_rawq['saleq4_l1']) - -# chato -data_rawq['atq_l8'] = data_rawq.groupby(['permno'])['atq'].shift(8) -data_rawq['chato'] = (data_rawq['saleq4']/((data_rawq['atq']+data_rawq['atq_l4'])/2))-(data_rawq['saleq4_l4']/((data_rawq['atq_l4']+data_rawq['atq_l8'])/2)) - -# noa -data_rawq['ivaoq'] = np.where(data_rawq['ivaoq'].isnull(), 0, 1) -data_rawq['dlcq'] = 
np.where(data_rawq['dlcq'].isnull(), 0, 1) -data_rawq['dlttq'] = np.where(data_rawq['dlttq'].isnull(), 0, 1) -data_rawq['mibq'] = np.where(data_rawq['mibq'].isnull(), 0, 1) -data_rawq['pstkq'] = np.where(data_rawq['pstkq'].isnull(), 0, 1) -data_rawq['noa'] = (data_rawq['atq']-data_rawq['cheq']-data_rawq['ivaoq'])-\ - (data_rawq['atq']-data_rawq['dlcq']-data_rawq['dlttq']-data_rawq['mibq']-data_rawq['pstkq']-data_rawq['ceqq'])/data_rawq['atq_l4'] - -# rna -data_rawq['noa_l4'] = data_rawq.groupby(['permno'])['noa'].shift(4) -data_rawq['rna'] = data_rawq['oiadpq']/data_rawq['noa_l4'] - -# pm -data_rawq['pm'] = data_rawq['oiadpq']/data_rawq['saleq'] - -# ato -data_rawq['ato'] = data_rawq['saleq']/data_rawq['noa_l4'] - -# roe -data_rawq['ceqq_l1'] = data_rawq.groupby(['permno'])['ceqq'].shift(1) -data_rawq['roe'] = data_rawq['ibq']/data_rawq['ceqq_l1'] - -################################## New Added ################################## - -# grltnoa -data_rawq['rectq_l4'] = data_rawq.groupby(['permno'])['rectq'].shift(4) -data_rawq['acoq_l4'] = data_rawq.groupby(['permno'])['acoq'].shift(4) -data_rawq['apq_l4'] = data_rawq.groupby(['permno'])['apq'].shift(4) -data_rawq['lcoq_l4'] = data_rawq.groupby(['permno'])['lcoq'].shift(4) -data_rawq['loq_l4'] = data_rawq.groupby(['permno'])['loq'].shift(4) -data_rawq['invtq_l4'] = data_rawq.groupby(['permno'])['invtq'].shift(4) -data_rawq['ppentq_l4'] = data_rawq.groupby(['permno'])['ppentq'].shift(4) -data_rawq['atq_l4'] = data_rawq.groupby(['permno'])['atq'].shift(4) - -data_rawq['grltnoa'] = ((data_rawq['rectq']+data_rawq['invtq']+data_rawq['ppentq']+data_rawq['acoq']+data_rawq['intanq']+ - data_rawq['aoq']-data_rawq['apq']-data_rawq['lcoq']-data_rawq['loq'])- - (data_rawq['rectq_l4']+data_rawq['invtq_l4']+data_rawq['ppentq_l4']+data_rawq['acoq_l4']-data_rawq['apq_l4']-data_rawq['lcoq_l4']-data_rawq['loq_l4'])-\ - (data_rawq['rectq']-data_rawq['rectq_l4']+data_rawq['invtq']-data_rawq['invtq_l4']+data_rawq['acoq']- - 
(data_rawq['apq']-data_rawq['apq_l4']+data_rawq['lcoq']-data_rawq['lcoq_l4'])- - ttm4('dpq', data_rawq)))/((data_rawq['atq']+data_rawq['atq_l4'])/2) - -# scal -# condlist = [data_rawq['seqq'].isnull(), -# data_rawq['seqq'].isnull() & (data_rawq['ceqq'].isnull() | data_rawq['pstk'].isnull())] -# choicelist = [data_rawq['ceqq']+data_rawq['pstk'], -# data_rawq['atq']-data_rawq['ltq']] -# data_rawq['scal'] = np.select(condlist, choicelist, default=data_rawq['seqq']) - -# ala -data_rawq['gdwlq'] = np.where(data_rawq['gdwlq'].isnull(), 0, data_rawq['gdwlq']) -data_rawq['intanq'] = np.where(data_rawq['intanq'].isnull(), 0, data_rawq['intanq']) -data_rawq['ala'] = data_rawq['cheq'] + 0.75*(data_rawq['actq']-data_rawq['cheq'])+\ - 0.5*(data_rawq['atq']-data_rawq['actq']-data_rawq['gdwlq']-data_rawq['intanq']) - -# alm -# data_rawq['alm'] = data_rawq['ala']/(data_rawq['atq']+data_rawq['me']-data_rawq['ceqq']) - -# rsup -data_rawq['saleq_l4'] = data_rawq.groupby(['permno'])['saleq'].shift(4) -# data_rawq['rsup'] = (data_rawq['saleq'] - data_rawq['saleq_l4'])/data_rawq['me'] - -# stdsacc -data_rawq['actq_l1'] = data_rawq.groupby(['permno'])['actq'].shift(1) -data_rawq['cheq_l1'] = data_rawq.groupby(['permno'])['cheq'].shift(1) -data_rawq['lctq_l1'] = data_rawq.groupby(['permno'])['lctq'].shift(1) -data_rawq['dlcq_l1'] = data_rawq.groupby(['permno'])['dlcq'].shift(1) - -data_rawq['sacc'] = ((data_rawq['actq']-data_rawq['actq_l1'] - (data_rawq['cheq']-data_rawq['cheq_l1'])) - -((data_rawq['lctq']-data_rawq['lctq_l1'])-(data_rawq['dlcq']-data_rawq['dlcq_l1'])))/data_rawq['saleq'] -data_rawq['sacc'] = np.where(data_rawq['saleq']<=0, ((data_rawq['actq']-data_rawq['actq_l1'] - (data_rawq['cheq']-data_rawq['cheq_l1'])) - -((data_rawq['lctq']-data_rawq['lctq_l1'])-(data_rawq['dlcq']-data_rawq['dlcq_l1'])))/0.01, data_rawq['sacc']) - - -def chars_std(start, end, df, chars): - """ - - :param start: Order of starting lag - :param end: Order of ending lag - :param df: Dataframe - :param 
chars: lag chars - :return: std of factor - """ - lag = pd.DataFrame() - lag_list = [] - for i in range(start, end): - lag['chars_l%s' % i] = df.groupby(['permno'])['%s' % chars].shift(i) - lag_list.append('chars_l%s' % i) - result = lag[lag_list].std(axis=1) - return result - -data_rawq['stdacc'] = chars_std(0, 16, data_rawq, 'sacc') - -# sgrvol -# data_rawq['sgrvol'] = chars_std(0, 15, data_rawq, 'rsup') - -# roavol -data_rawq['roavol'] = chars_std(0, 16, data_rawq, 'roa') - -# stdcf -data_rawq['scf'] = (data_rawq['ibq']/data_rawq['saleq']) - data_rawq['sacc'] -data_rawq['scf'] = np.where(data_rawq['saleq']<=0, (data_rawq['ibq']/0.01) - data_rawq['sacc'], data_rawq['sacc']) - -data_rawq['stdcf'] = chars_std(0, 16, data_rawq, 'scf') - -# cinvest -data_rawq['ppentq_l1'] = data_rawq.groupby(['permno'])['ppentq'].shift(1) -data_rawq['ppentq_l2'] = data_rawq.groupby(['permno'])['ppentq'].shift(2) -data_rawq['ppentq_l3'] = data_rawq.groupby(['permno'])['ppentq'].shift(3) -data_rawq['ppentq_l4'] = data_rawq.groupby(['permno'])['ppentq'].shift(4) -data_rawq['saleq_l1'] = data_rawq.groupby(['permno'])['saleq'].shift(1) -data_rawq['saleq_l2'] = data_rawq.groupby(['permno'])['saleq'].shift(2) -data_rawq['saleq_l3'] = data_rawq.groupby(['permno'])['saleq'].shift(3) - -data_rawq['c_temp1'] = (data_rawq['ppentq_l1'] - data_rawq['ppentq_l2']) / data_rawq['saleq_l1'] -data_rawq['c_temp2'] = (data_rawq['ppentq_l2'] - data_rawq['ppentq_l3']) / data_rawq['saleq_l2'] -data_rawq['c_temp3'] = (data_rawq['ppentq_l3'] - data_rawq['ppentq_l4']) / data_rawq['saleq_l3'] - -data_rawq['cinvest'] = ((data_rawq['ppentq'] - data_rawq['ppentq_l1']) / data_rawq['saleq'])\ - -(data_rawq[['c_temp1', 'c_temp2', 'c_temp3']].mean(axis=1)) - -data_rawq['c_temp1'] = (data_rawq['ppentq_l1'] - data_rawq['ppentq_l2']) / 0.01 -data_rawq['c_temp2'] = (data_rawq['ppentq_l2'] - data_rawq['ppentq_l3']) / 0.01 -data_rawq['c_temp3'] = (data_rawq['ppentq_l3'] - data_rawq['ppentq_l4']) / 0.01 - 
-data_rawq['cinvest'] = np.where(data_rawq['saleq']<=0, ((data_rawq['ppentq'] - data_rawq['ppentq_l1']) / 0.01) - -(data_rawq[['c_temp1', 'c_temp2', 'c_temp3']].mean(axis=1)), data_rawq['cinvest']) - -data_rawq = data_rawq.drop(['c_temp1', 'c_temp2', 'c_temp3'], axis=1) - -# nincr -data_rawq['ibq_l1'] = data_rawq.groupby(['permno'])['ibq'].shift(1) -data_rawq['ibq_l2'] = data_rawq.groupby(['permno'])['ibq'].shift(2) -data_rawq['ibq_l3'] = data_rawq.groupby(['permno'])['ibq'].shift(3) -data_rawq['ibq_l4'] = data_rawq.groupby(['permno'])['ibq'].shift(4) -data_rawq['ibq_l5'] = data_rawq.groupby(['permno'])['ibq'].shift(5) -data_rawq['ibq_l6'] = data_rawq.groupby(['permno'])['ibq'].shift(6) -data_rawq['ibq_l7'] = data_rawq.groupby(['permno'])['ibq'].shift(7) -data_rawq['ibq_l8'] = data_rawq.groupby(['permno'])['ibq'].shift(8) - -data_rawq['nincr_temp1'] = np.where(data_rawq['ibq'] > data_rawq['ibq_l1'], 1, 0) -data_rawq['nincr_temp2'] = np.where(data_rawq['ibq_l1'] > data_rawq['ibq_l2'], 1, 0) -data_rawq['nincr_temp3'] = np.where(data_rawq['ibq_l2'] > data_rawq['ibq_l3'], 1, 0) -data_rawq['nincr_temp4'] = np.where(data_rawq['ibq_l3'] > data_rawq['ibq_l4'], 1, 0) -data_rawq['nincr_temp5'] = np.where(data_rawq['ibq_l4'] > data_rawq['ibq_l5'], 1, 0) -data_rawq['nincr_temp6'] = np.where(data_rawq['ibq_l5'] > data_rawq['ibq_l6'], 1, 0) -data_rawq['nincr_temp7'] = np.where(data_rawq['ibq_l6'] > data_rawq['ibq_l7'], 1, 0) -data_rawq['nincr_temp8'] = np.where(data_rawq['ibq_l7'] > data_rawq['ibq_l8'], 1, 0) - -data_rawq['nincr'] = (data_rawq['nincr_temp1'] - + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']) - + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']) - + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']) - + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']*data_rawq['nincr_temp5']) - + 
(data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']*data_rawq['nincr_temp5']*data_rawq['nincr_temp6']) - + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']*data_rawq['nincr_temp5']*data_rawq['nincr_temp6']*data_rawq['nincr_temp7']) - + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']*data_rawq['nincr_temp5']*data_rawq['nincr_temp6']*data_rawq['nincr_temp7']*data_rawq['nincr_temp8'])) - -data_rawq = data_rawq.drop(['ibq_l1', 'ibq_l2', 'ibq_l3', 'ibq_l4', 'ibq_l5', 'ibq_l6', 'ibq_l7', 'ibq_l8', 'nincr_temp1', - 'nincr_temp2', 'nincr_temp3', 'nincr_temp4', 'nincr_temp5', 'nincr_temp6', 'nincr_temp7', - 'nincr_temp8'], axis=1) - -# performance score -data_rawq['niq4'] = ttm4(series='niq', df=data_rawq) -data_rawq['niq4_l4'] = data_rawq.groupby(['permno'])['niq4'].shift(4) -data_rawq['dlttq_l4'] = data_rawq.groupby(['permno'])['dlttq'].shift(4) -data_rawq['p_temp1'] = np.where(data_rawq['niq4']>0, 1, 0) -data_rawq['p_temp2'] = np.where(data_rawq['oancfy']>0, 1, 0) -data_rawq['p_temp3'] = np.where(data_rawq['niq4']/data_rawq['atq']>data_rawq['niq4_l4']/data_rawq['atq_l4'], 1, 0) -data_rawq['p_temp4'] = np.where(data_rawq['oancfy']>data_rawq['niq4'], 1, 0) -data_rawq['p_temp5'] = np.where(data_rawq['dlttq']/data_rawq['atq']<data_rawq['dlttq_l4']/data_rawq['atq_l4'], 1, 0) -data_rawq['p_temp6'] = np.where(data_rawq['actq']/data_rawq['lctq']>data_rawq['actq_l4']/data_rawq['lctq_l4'], 1, 0) -data_rawq['cogsq4_l4'] = data_rawq.groupby(['permno'])['cogsq4'].shift(4) -data_rawq['p_temp7'] = np.where((data_rawq['saleq4']-data_rawq['cogsq4']/data_rawq['saleq4'])>(data_rawq['saleq4_l4']-data_rawq['cogsq4_l4']/data_rawq['saleq4_l4']), 1, 0) -data_rawq['p_temp8'] = np.where(data_rawq['saleq4']/data_rawq['atq']>data_rawq['saleq4_l4']/data_rawq['atq_l4'], 1, 0) -data_rawq['p_temp9'] = np.where(data_rawq['scstkcy']==0, 1, 0) - -data_rawq['pscore'] = data_rawq['p_temp1']+data_rawq['p_temp2']+data_rawq['p_temp3']+data_rawq['p_temp4']\ - 
+data_rawq['p_temp5']+data_rawq['p_temp6']+data_rawq['p_temp7']+data_rawq['p_temp8']\ - +data_rawq['p_temp9'] - -data_rawq = data_rawq.drop(['p_temp1', 'p_temp2', 'p_temp3', 'p_temp4', 'p_temp5', 'p_temp6', 'p_temp7', 'p_temp8', - 'p_temp9'], axis=1) - -####################################################################################################################### -# Momentum # -####################################################################################################################### -crsp_mom = conn.raw_sql(""" - select permno, date, ret, retx, prc, shrout, vol - from crsp.msf - where date >= '01/01/1959' - """) - -crsp_mom['permno'] = crsp_mom['permno'].astype(int) -crsp_mom['jdate'] = pd.to_datetime(crsp_mom['date']) + MonthEnd(0) -crsp_mom = crsp_mom.dropna(subset=['ret', 'retx', 'prc']) - -# add delisting return -dlret = conn.raw_sql(""" - select permno, dlret, dlstdt - from crsp.msedelist - """) - -dlret.permno = dlret.permno.astype(int) -dlret['dlstdt'] = pd.to_datetime(dlret['dlstdt']) -dlret['jdate'] = dlret['dlstdt'] + MonthEnd(0) - -# merge delisting return to crsp return -crsp_mom = pd.merge(crsp_mom, dlret, how='left', on=['permno', 'jdate']) -crsp_mom['dlret'] = crsp_mom['dlret'].fillna(0) -crsp_mom['ret'] = crsp_mom['ret'].fillna(0) -crsp_mom['retadj'] = (1 + crsp_mom['ret']) * (1 + crsp_mom['dlret']) - 1 -crsp_mom['me'] = crsp_mom['prc'].abs() * crsp_mom['shrout'] # calculate market equity - - -def mom(start, end, df): - """ - - :param start: Order of starting lag - :param end: Order of ending lag - :param df: Dataframe - :return: Momentum factor - """ - lag = pd.DataFrame() - result = 1 - for i in range(start, end): - lag['mom%s' % i] = df.groupby(['permno'])['ret'].shift(i) - result = result * (1+lag['mom%s' % i]) - result = result - 1 - return result - - -crsp_mom['mom60m'] = mom(12, 60, crsp_mom) -crsp_mom['mom12m'] = mom(1, 12, crsp_mom) -crsp_mom['mom1m'] = crsp_mom['ret'] -crsp_mom['mom6m'] = mom(1, 6, crsp_mom) 
-crsp_mom['mom36m'] = mom(1, 36, crsp_mom) -crsp_mom['seas1a'] = crsp_mom.groupby(['permno'])['ret'].shift(11) - -crsp_mom['vol_l1'] = crsp_mom.groupby(['permno'])['vol'].shift(1) -crsp_mom['vol_l2'] = crsp_mom.groupby(['permno'])['vol'].shift(2) -crsp_mom['vol_l3'] = crsp_mom.groupby(['permno'])['vol'].shift(3) -crsp_mom['prc_l2'] = crsp_mom.groupby(['permno'])['prc'].shift(2) -crsp_mom['dolvol'] = np.log(crsp_mom['vol_l2']*crsp_mom['prc_l2']).replace([np.inf, -np.inf], np.nan) -crsp_mom['turn'] = ((crsp_mom['vol_l1']+crsp_mom['vol_l2']+crsp_mom['vol_l3'])/3)/crsp_mom['shrout'] - -# dy -crsp_mom['me_l1'] = crsp_mom.groupby(['permno'])['me'].shift(1) -crsp_mom['retdy'] = crsp_mom['ret'] - crsp_mom['retx'] -crsp_mom['mdivpay'] = crsp_mom['retdy']*crsp_mom['me_l1'] - -crsp_mom['dy'] = ttm12(series='mdivpay', df=crsp_mom)/crsp_mom['me'] - -# def moms(start, end, df): -# """ -# -# :param start: Order of starting lag -# :param end: Order of ending lag -# :param df: Dataframe -# :return: Momentum factor -# """ -# lag = pd.DataFrame() -# result = 1 -# for i in range(start, end): -# lag['moms%s' % i] = df.groupby['permno']['ret'].shift(i) -# result = result + lag['moms%s' % i] -# result = result/11 -# return result -# -# -# crsp_mom['moms12m'] = moms(1, 12, crsp_mom) - -# populate the chars to monthly - -# data_rawa -data_rawa = data_rawa.drop(['date', 'ret', 'retx', 'me'], axis=1) -data_rawa = pd.merge(crsp_mom, data_rawa, how='left', on=['permno', 'jdate']) -data_rawa['datadate'] = data_rawa.groupby(['permno'])['datadate'].fillna(method='ffill') -data_rawa = data_rawa.groupby(['permno', 'datadate'], as_index=False).fillna(method='ffill') -data_rawa = data_rawa[((data_rawa['exchcd'] == 1) | (data_rawa['exchcd'] == 2) | (data_rawa['exchcd'] == 3)) & - ((data_rawa['shrcd'] == 10) | (data_rawa['shrcd'] == 11))] - -# data_rawq -data_rawq = data_rawq.drop(['date', 'ret', 'retx', 'me'], axis=1) -data_rawq = pd.merge(crsp_mom, data_rawq, how='left', on=['permno', 'jdate']) 
-data_rawq['datadate'] = data_rawq.groupby(['permno'])['datadate'].fillna(method='ffill') -data_rawq = data_rawq.groupby(['permno', 'datadate'], as_index=False).fillna(method='ffill') -data_rawq = data_rawq[((data_rawq['exchcd'] == 1) | (data_rawq['exchcd'] == 2) | (data_rawq['exchcd'] == 3)) & - ((data_rawq['shrcd'] == 10) | (data_rawq['shrcd'] == 11))] - -####################################################################################################################### -# Monthly ME # -####################################################################################################################### - -######################################## -# Annual # -######################################## - -# bm -data_rawa['bm'] = data_rawa['be'] / data_rawa['me'] - -# bm_ia -df_temp = data_rawa.groupby(['datadate', 'ffi49'], as_index=False)['bm'].mean() -df_temp = df_temp.rename(columns={'bm': 'bm_ind'}) -data_rawa = pd.merge(data_rawa, df_temp, how='left', on=['datadate', 'ffi49']) -data_rawa['bm_ia'] = data_rawa['bm']/data_rawa['bm_ind'] - -# me_ia -df_temp = data_rawa.groupby(['datadate', 'ffi49'], as_index=False)['me'].mean() -df_temp = df_temp.rename(columns={'me': 'me_ind'}) -data_rawa = pd.merge(data_rawa, df_temp, how='left', on=['datadate', 'ffi49']) -data_rawa['me_ia'] = data_rawa['me']/data_rawa['me_ind'] - -# cfp -condlist = [data_rawa['dp'].isnull(), - data_rawa['ib'].isnull()] -choicelist = [data_rawa['ib']/data_rawa['me'], - np.nan] -data_rawa['cfp'] = np.select(condlist, choicelist, default=(data_rawa['ib']+data_rawa['dp'])/data_rawa['me']) - -# ep -data_rawa['ep'] = data_rawa['ib']/data_rawa['me'] - -# rsup -# data_rawa['sale_l1'] = data_rawa.groupby(['permno'])['sale'].shift(1) -data_rawa['rsup'] = (data_rawa['sale']-data_rawa['sale_l1'])/data_rawa['me'] - -# lev -data_rawa['lev'] = data_rawa['lt']/data_rawa['me'] - -# sp -data_rawa['sp'] = data_rawa['sale']/data_rawa['me'] - -# rdm -data_rawa['rdm'] = data_rawa['xrd']/data_rawa['me'] - -# adm 
hxz adm -data_rawa['adm'] = data_rawa['xad']/data_rawa['me'] - -# dy -data_rawa['dy'] = data_rawa['dvt']/data_rawa['me'] - -# Annual Accounting Variables -chars_a = data_rawa[['cusip', 'ncusip', 'gvkey', 'permno', 'exchcd', 'shrcd', 'datadate', 'jdate', - 'sic', 'ret', 'retx', 'retadj', 'acc', 'agr', 'bm', 'cfp', 'ep', 'ni', 'op', - 'rsup', 'cash', 'chcsho', - 'rd', 'cashdebt', 'pctacc', 'gma', 'lev', 'rdm', 'adm', 'sgr', 'sp', 'invest', 'roe', - 'rd_sale', 'lgr', 'roa', 'depr', 'egr', 'chato', 'chtx', 'noa', 'rna', 'pm', 'ato', 'dy', - 'roic', 'chinv', 'pchsale_pchinvt', 'pchsale_pchrect', 'pchgm_pchsale', 'pchsale_pchxsga', - 'pchdepr', 'chadv', 'pchcapx', 'grcapx', 'grGW', 'currat', 'pchcurrat', 'quick', 'pchquick', - 'salecash', 'salerec', 'saleinv', 'pchsaleinv', 'realestate', 'obklg', 'chobklg', 'grltnoa', - 'conv', 'chdrc', 'rdbias', 'operprof', 'capxint', 'xadint', 'chpm', 'ala', 'alm', - 'mom1m', 'mom6m', 'mom12m', 'mom60m', 'mom36m', 'seas1a', 'me', 'hire', 'herf', 'bm_ia', - 'me_ia', 'turn', 'dolvol']] -chars_a.reset_index(drop=True, inplace=True) - -######################################## -# Quarterly # -######################################## -# bm -data_rawq['bm'] = data_rawq['beq']/data_rawq['me'] - -# cfp -data_rawq['cfp'] = np.where(data_rawq['dpq'].isnull(), - data_rawq['ibq4']/data_rawq['me'], - (data_rawq['ibq4']+data_rawq['dpq4'])/data_rawq['me']) - -# ep -data_rawq['ep'] = data_rawq['ibq4']/data_rawq['me'] - -# lev -data_rawq['lev'] = data_rawq['ltq']/data_rawq['me'] - -# rdm -data_rawq['rdm'] = data_rawq['xrdq4']/data_rawq['me'] - -# sp -data_rawq['sp'] = data_rawq['saleq4']/data_rawq['me'] - -# alm -data_rawq['alm'] = data_rawq['ala']/(data_rawq['atq']+data_rawq['me']-data_rawq['ceqq']) - -# rsup -# data_rawq['saleq_l4'] = data_rawq.groupby(['permno'])['saleq'].shift(4) -data_rawq['rsup'] = (data_rawq['saleq'] - data_rawq['saleq_l4'])/data_rawq['me'] - -# sgrvol -data_rawq['sgrvol'] = chars_std(0, 15, data_rawq, 'rsup') - -# Quarterly 
Accounting Variables -chars_q = data_rawq[['gvkey', 'permno', 'datadate', 'jdate', 'sic', 'exchcd', 'shrcd', - 'ret', 'retx', 'retadj', 'acc', 'bm', 'cfp', - 'ep', 'agr', 'ni', 'op', 'cash', 'chcsho', 'rd', 'cashdebt', 'pctacc', 'gma', 'lev', - 'rdm', 'sgr', 'sp', 'invest', 'rd_sale', 'lgr', 'roa', 'depr', 'egr', 'roe', - 'chato', 'chpm', 'chtx', 'noa', 'rna', 'pm', 'ato', 'stdcf', - 'grltnoa', 'ala', 'alm', 'rsup', 'stdacc', 'sgrvol', 'roavol', 'scf', 'cinvest', - 'mom1m', 'mom6m', 'mom12m', 'mom60m', 'mom36m', 'seas1a', 'me', 'pscore', 'nincr', - 'turn', 'dolvol']] -chars_q.reset_index(drop=True, inplace=True) - -with open('chars_a_60.pkl', 'wb') as f: - pkl.dump(chars_a, f) - -with open('chars_q_60.pkl', 'wb') as f: - pkl.dump(chars_q, f) diff --git a/char60/beta.py b/char60/beta.py deleted file mode 100755 index ff5cca5..0000000 --- a/char60/beta.py +++ /dev/null @@ -1,164 +0,0 @@ -# Fama & French 3 factors residual variance -# Note: Please use the latest version of pandas, this version should support returning to pd.Series after rolling -# To get a faster speed, we split the big dataframe into small ones -# Then using different process to calculate the variance -# We use 20 process to calculate variance, you can change the number of process according to your CPU situation -# You can use the following code to check your CPU situation -# import multiprocessing -# multiprocessing.cpu_count() - -import pandas as pd -import numpy as np -import datetime as dt -import wrds -from dateutil.relativedelta import * -from pandas.tseries.offsets import * -import datetime -import pickle as pkl -import multiprocessing as mp - -################### -# Connect to WRDS # -################### -conn = wrds.Connection() - -# CRSP Block -crsp = conn.raw_sql(""" - select a.permno, a.date, a.ret, (a.ret - b.rf) as exret, b.mktrf, b.smb, b.hml - from crsp.dsf as a - left join ff.factors_daily as b - on a.date=b.date - where a.date > '01/01/1959' - """) - -# sort variables by permno and 
date -crsp = crsp.sort_values(by=['permno', 'date']) - -# change variable format to int -crsp['permno'] = crsp['permno'].astype(int) - -# Line up date to be end of month -crsp['date'] = pd.to_datetime(crsp['date']) - -# find the closest trading day to the end of the month -crsp['monthend'] = crsp['date'] + MonthEnd(0) -crsp['date_diff'] = crsp['monthend'] - crsp['date'] -date_temp = crsp.groupby(['permno', 'monthend'])['date_diff'].min() -date_temp = pd.DataFrame(date_temp) # convert Series to DataFrame -date_temp.reset_index(inplace=True) -date_temp.rename(columns={'date_diff': 'min_diff'}, inplace=True) -crsp = pd.merge(crsp, date_temp, how='left', on=['permno', 'monthend']) -crsp['sig'] = np.where(crsp['date_diff'] == crsp['min_diff'], 1, np.nan) - -# label every date of month end -crsp['month_count'] = crsp[crsp['sig'] == 1].groupby(['permno']).cumcount() - -# label numbers of months for a firm -month_num = crsp[crsp['sig'] == 1].groupby(['permno'])['month_count'].tail(1) -month_num = month_num.astype(int) -month_num = month_num.reset_index(drop=True) - -# mark the number of each month to each day of this month -crsp['month_count'] = crsp.groupby(['permno'])['month_count'].fillna(method='bfill') - -# crate a firm list -df_firm = crsp.drop_duplicates(['permno']) -df_firm = df_firm[['permno']] -df_firm['permno'] = df_firm['permno'].astype(int) -df_firm = df_firm.reset_index(drop=True) -df_firm = df_firm.reset_index() -df_firm = df_firm.rename(columns={'index': 'count'}) -df_firm['month_num'] = month_num - -###################### -# Calculate the beta # -###################### - - -def get_beta(df, firm_list): - """ - - :param df: stock dataframe - :param firm_list: list of firms matching stock dataframe - :return: dataframe with variance of residual - """ - for firm, count, prog in zip(firm_list['permno'], firm_list['month_num'], range(firm_list['permno'].count()+1)): - prog = prog + 1 - print('processing permno %s' % firm, '/', 'finished', '%.2f%%' % 
((prog/firm_list['permno'].count())*100)) - for i in range(count + 1): - # if you want to change the rolling window, please change here: i - 2 means 3 months is a window. - temp = df[(df['permno'] == firm) & (i - 2 <= df['month_count']) & (df['month_count'] <= i)] - # if observations in last 3 months are less 21, we drop the rvar of this month - if temp['permno'].count() < 21: - pass - else: - rolling_window = temp['permno'].count() - index = temp.tail(1).index - X = np.mat(temp[['mktrf']]) - Y = np.mat(temp[['exret']]) - ones = np.mat(np.ones(rolling_window)).T - M = np.identity(rolling_window) - ones.dot((ones.T.dot(ones)).I).dot(ones.T) - beta = (X.T.dot(M).dot(X)).I.dot((X.T.dot(M).dot(Y))) - df.loc[index, 'beta'] = beta - return df - - -def sub_df(start, end, step): - """ - - :param start: the quantile to start cutting, usually it should be 0 - :param end: the quantile to end cutting, usually it should be 1 - :param step: quantile step - :return: a dictionary including all the 'firm_list' dataframe and 'stock data' dataframe - """ - # we use dict to store different sub dataframe - temp = {} - for i, h in zip(np.arange(start, end, step), range(int((end-start)/step))): - print('processing splitting dataframe:', round(i, 2), 'to', round(i + step, 2)) - if i == 0: # to get the left point - temp['firm' + str(h)] = df_firm[df_firm['count'] <= df_firm['count'].quantile(i + step)] - temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', - on='permno').dropna(subset=['count']) - else: - temp['firm' + str(h)] = df_firm[(df_firm['count'].quantile(i) < df_firm['count']) & ( - df_firm['count'] <= df_firm['count'].quantile(i + step))] - temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', - on='permno').dropna(subset=['count']) - return temp - - -def main(start, end, step): - """ - - :param start: the quantile to start cutting, usually it should be 0 - :param end: the quantile to end cutting, usually it should be 1 - :param step: 
quantile step - :return: a dataframe with calculated variance of residual - """ - df = sub_df(start, end, step) - pool = mp.Pool() - p_dict = {} - for i in range(int((end-start)/step)): - p_dict['p' + str(i)] = pool.apply_async(get_beta, (df['crsp%s' % i], df['firm%s' % i],)) - pool.close() - pool.join() - result = pd.DataFrame() - print('processing pd.concat') - for h in range(int((end-start)/step)): - result = pd.concat([result, p_dict['p%s' % h].get()]) - return result - - -# calculate variance of residual through rolling window -# Note: please split dataframe according to your CPU situation. For example, we split dataframe to (1-0)/0.05 = 20 sub -# dataframes here, so the function will use 20 cores to calculate variance of residual. -if __name__ == '__main__': - crsp = main(0, 1, 0.05) - -# process dataframe -crsp = crsp.dropna(subset=['beta']) # drop NA due to rolling -crsp = crsp.reset_index(drop=True) -crsp = crsp[['permno', 'date', 'beta']] - -with open('beta.pkl', 'wb') as f: - pkl.dump(crsp, f) \ No newline at end of file diff --git a/char60/bid_ask_spread.py b/char60/bid_ask_spread.py deleted file mode 100755 index 5281099..0000000 --- a/char60/bid_ask_spread.py +++ /dev/null @@ -1,160 +0,0 @@ -# Fama & French 3 factors residual variance -# Note: Please use the latest version of pandas, this version should support returning to pd.Series after rolling -# To get a faster speed, we split the big dataframe into small ones -# Then using different process to calculate the variance -# We use 20 process to calculate variance, you can change the number of process according to your CPU situation -# You can use the following code to check your CPU situation -# import multiprocessing -# multiprocessing.cpu_count() - -import pandas as pd -import numpy as np -import datetime as dt -import wrds -from dateutil.relativedelta import * -from pandas.tseries.offsets import * -import datetime -import pickle as pkl -import multiprocessing as mp - -################### -# 
Connect to WRDS # -################### -conn = wrds.Connection() - -# CRSP Block -crsp = conn.raw_sql(""" - select a.permno, a.date, a.ret, (a.ret - b.rf) as exret, a.askhi, a.bidlo - from crsp.dsf as a - left join ff.factors_daily as b - on a.date=b.date - where a.date > '01/01/1959' - """) - -# sort variables by permno and date -crsp = crsp.sort_values(by=['permno', 'date']) - -# change variable format to int -crsp['permno'] = crsp['permno'].astype(int) - -# Line up date to be end of month -crsp['date'] = pd.to_datetime(crsp['date']) - -# find the closest trading day to the end of the month -crsp['monthend'] = crsp['date'] + MonthEnd(0) -crsp['date_diff'] = crsp['monthend'] - crsp['date'] -date_temp = crsp.groupby(['permno', 'monthend'])['date_diff'].min() -date_temp = pd.DataFrame(date_temp) # convert Series to DataFrame -date_temp.reset_index(inplace=True) -date_temp.rename(columns={'date_diff': 'min_diff'}, inplace=True) -crsp = pd.merge(crsp, date_temp, how='left', on=['permno', 'monthend']) -crsp['sig'] = np.where(crsp['date_diff'] == crsp['min_diff'], 1, np.nan) - -# label every date of month end -crsp['month_count'] = crsp[crsp['sig'] == 1].groupby(['permno']).cumcount() - -# label numbers of months for a firm -month_num = crsp[crsp['sig'] == 1].groupby(['permno'])['month_count'].tail(1) -month_num = month_num.astype(int) -month_num = month_num.reset_index(drop=True) - -# mark the number of each month to each day of this month -crsp['month_count'] = crsp.groupby(['permno'])['month_count'].fillna(method='bfill') - -# crate a firm list -df_firm = crsp.drop_duplicates(['permno']) -df_firm = df_firm[['permno']] -df_firm['permno'] = df_firm['permno'].astype(int) -df_firm = df_firm.reset_index(drop=True) -df_firm = df_firm.reset_index() -df_firm = df_firm.rename(columns={'index': 'count'}) -df_firm['month_num'] = month_num - -###################### -# Calculate residual # -###################### - - -def get_baspread(df, firm_list): - """ - - :param df: stock 
dataframe - :param firm_list: list of firms matching stock dataframe - :return: dataframe with variance of residual - """ - for firm, count, prog in zip(firm_list['permno'], firm_list['month_num'], range(firm_list['permno'].count()+1)): - prog = prog + 1 - print('processing permno %s' % firm, '/', 'finished', '%.2f%%' % ((prog/firm_list['permno'].count())*100)) - for i in range(count + 1): - # if you want to change the rolling window, please change here: i - 2 means 3 months is a window. - temp = df[(df['permno'] == firm) & (i - 2 <= df['month_count']) & (df['month_count'] <= i)] - if temp['permno'].count() < 21: - pass - else: - index = temp.tail(1).index - X = pd.DataFrame() - X[['askhi', 'bidlo']] = temp[['askhi', 'bidlo']] - bid = (X['askhi'] - X['bidlo'])/((X['askhi'] + X['bidlo'])/2).mean() - df.loc[index, 'baspread'] = bid - return df - - -def sub_df(start, end, step): - """ - - :param start: the quantile to start cutting, usually it should be 0 - :param end: the quantile to end cutting, usually it should be 1 - :param step: quantile step - :return: a dictionary including all the 'firm_list' dataframe and 'stock data' dataframe - """ - # we use dict to store different sub dataframe - temp = {} - for i, h in zip(np.arange(start, end, step), range(int((end-start)/step))): - print('processing splitting dataframe:', round(i, 2), 'to', round(i + step, 2)) - if i == 0: # to get the left point - temp['firm' + str(h)] = df_firm[df_firm['count'] <= df_firm['count'].quantile(i + step)] - temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', - on='permno').dropna(subset=['count']) - else: - temp['firm' + str(h)] = df_firm[(df_firm['count'].quantile(i) < df_firm['count']) & ( - df_firm['count'] <= df_firm['count'].quantile(i + step))] - temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', - on='permno').dropna(subset=['count']) - return temp - - -def main(start, end, step): - """ - - :param start: the quantile to start cutting, 
usually it should be 0 - :param end: the quantile to end cutting, usually it should be 1 - :param step: quantile step - :return: a dataframe with calculated variance of residual - """ - df = sub_df(start, end, step) - pool = mp.Pool() - p_dict = {} - for i in range(int((end-start)/step)): - p_dict['p' + str(i)] = pool.apply_async(get_baspread, (df['crsp%s' % i], df['firm%s' % i],)) - pool.close() - pool.join() - result = pd.DataFrame() - print('processing pd.concat') - for h in range(int((end-start)/step)): - result = pd.concat([result, p_dict['p%s' % h].get()]) - return result - - -# calculate variance of residual through rolling window -# Note: please split dataframe according to your CPU situation. For example, we split dataframe to (1-0)/0.05 = 20 sub -# dataframes here, so the function will use 20 cores to calculate variance of residual. -if __name__ == '__main__': - crsp = main(0, 1, 0.05) - -# process dataframe -crsp = crsp.dropna(subset=['baspread']) # drop NA due to rolling -crsp = crsp.reset_index(drop=True) -crsp = crsp[['permno', 'date', 'baspread']] - -with open('baspread.pkl', 'wb') as f: - pkl.dump(crsp, f) \ No newline at end of file diff --git a/char60/functions.py b/char60/functions.py deleted file mode 100755 index 34cd370..0000000 --- a/char60/functions.py +++ /dev/null @@ -1,452 +0,0 @@ -import pandas as pd -import pickle as pkl -import numpy as np -from tqdm import tqdm -import re - -def ffi49(df): - condlist = [((100 <= df['sic']) & (df['sic'] <= 199)) | ((200 <= df['sic']) & (df['sic'] <= 299)) | - ((700 <= df['sic']) & (df['sic'] <= 799)) | ((910 <= df['sic']) & (df['sic'] <= 919)) | - ((2048 <= df['sic']) & (df['sic'] <= 2048)), - ((2000 <= df['sic']) & (df['sic'] <= 2009)) | ((2010 <= df['sic']) & (df['sic'] <= 2019)) | - ((2020 <= df['sic']) & (df['sic'] <= 2029)) | ((2030 <= df['sic']) & (df['sic'] <= 2039)) | - ((2040 <= df['sic']) & (df['sic'] <= 2046)) | ((2050 <= df['sic']) & (df['sic'] <= 2059)) | - ((2060 <= df['sic']) & 
(df['sic'] <= 2063)) | ((2070 <= df['sic']) & (df['sic'] <= 2079)) | - ((2090 <= df['sic']) & (df['sic'] <= 2092)) | ((2095 <= df['sic']) & (df['sic'] <= 2095)) | - ((2098 <= df['sic']) & (df['sic'] <= 2099)), - ((2064 <= df['sic']) & (df['sic'] <= 2068)) | ((2086 <= df['sic']) & (df['sic'] <= 2086)) | - ((2087 <= df['sic']) & (df['sic'] <= 2087)) | ((2096 <= df['sic']) & (df['sic'] <= 2096)) | - ((2097 <= df['sic']) & (df['sic'] <= 2097)), - ((2080 <= df['sic']) & (df['sic'] <= 2080)) | ((2082 <= df['sic']) & (df['sic'] <= 2082)) | - ((2083 <= df['sic']) & (df['sic'] <= 2083)) | ((2084 <= df['sic']) & (df['sic'] <= 2084)) | - ((2085 <= df['sic']) & (df['sic'] <= 2085)), - ((2100 <= df['sic']) & (df['sic'] <= 2199)), - ((920 <= df['sic']) & (df['sic'] <= 999)) | ((3650 <= df['sic']) & (df['sic'] <= 3651)) | - ((3652 <= df['sic']) & (df['sic'] <= 3652)) | ((3732 <= df['sic']) & (df['sic'] <= 3732)) | - ((3930 <= df['sic']) & (df['sic'] <= 3931)) | ((3940 <= df['sic']) & (df['sic'] <= 3949)), - ((7800 <= df['sic']) & (df['sic'] <= 7829)) | ((7830 <= df['sic']) & (df['sic'] <= 7833)) | - ((7840 <= df['sic']) & (df['sic'] <= 7841)) | ((7900 <= df['sic']) & (df['sic'] <= 7900)) | - ((7910 <= df['sic']) & (df['sic'] <= 7911)) | ((7920 <= df['sic']) & (df['sic'] <= 7929)) | - ((7930 <= df['sic']) & (df['sic'] <= 7933)) | ((7940 <= df['sic']) & (df['sic'] <= 7949)) | - ((7980 <= df['sic']) & (df['sic'] <= 7980)) | ((7990 <= df['sic']) & (df['sic'] <= 7999)), - ((2700 <= df['sic']) & (df['sic'] <= 2709)) | ((2710 <= df['sic']) & (df['sic'] <= 2719)) | - ((2720 <= df['sic']) & (df['sic'] <= 2729)) | ((2730 <= df['sic']) & (df['sic'] <= 2739)) | - ((2740 <= df['sic']) & (df['sic'] <= 2749)) | ((2770 <= df['sic']) & (df['sic'] <= 2771)) | - ((2780 <= df['sic']) & (df['sic'] <= 2789)) | ((2790 <= df['sic']) & (df['sic'] <= 2799)), - ((2047 <= df['sic']) & (df['sic'] <= 2047)) | ((2391 <= df['sic']) & (df['sic'] <= 2392)) | - ((2510 <= df['sic']) & (df['sic'] <= 2519)) | ((2590 
<= df['sic']) & (df['sic'] <= 2599)) | - ((2840 <= df['sic']) & (df['sic'] <= 2843)) | ((2844 <= df['sic']) & (df['sic'] <= 2844)) | - ((3160 <= df['sic']) & (df['sic'] <= 3161)) | ((3170 <= df['sic']) & (df['sic'] <= 3171)) | - ((3172 <= df['sic']) & (df['sic'] <= 3172)) | ((3190 <= df['sic']) & (df['sic'] <= 3199)) | - ((3229 <= df['sic']) & (df['sic'] <= 3229)) | ((3260 <= df['sic']) & (df['sic'] <= 3260)) | - ((3262 <= df['sic']) & (df['sic'] <= 3263)) | ((3269 <= df['sic']) & (df['sic'] <= 3269)) | - ((3230 <= df['sic']) & (df['sic'] <= 3231)) | ((3630 <= df['sic']) & (df['sic'] <= 3639)) | - ((3750 <= df['sic']) & (df['sic'] <= 3751)) | ((3800 <= df['sic']) & (df['sic'] <= 3800)) | - ((3860 <= df['sic']) & (df['sic'] <= 3861)) | ((3870 <= df['sic']) & (df['sic'] <= 3873)) | - ((3910 <= df['sic']) & (df['sic'] <= 3911)) | ((3914 <= df['sic']) & (df['sic'] <= 3914)) | - ((3915 <= df['sic']) & (df['sic'] <= 3915)) | ((3960 <= df['sic']) & (df['sic'] <= 3962)) | - ((3991 <= df['sic']) & (df['sic'] <= 3991)) | ((3995 <= df['sic']) & (df['sic'] <= 3995)), - ((2300 <= df['sic']) & (df['sic'] <= 2390)) | ((3020 <= df['sic']) & (df['sic'] <= 3021)) | - ((3100 <= df['sic']) & (df['sic'] <= 3111)) | ((3130 <= df['sic']) & (df['sic'] <= 3131)) | - ((3140 <= df['sic']) & (df['sic'] <= 3149)) | ((3150 <= df['sic']) & (df['sic'] <= 3151)) | - ((3963 <= df['sic']) & (df['sic'] <= 3965)), - ((8000 <= df['sic']) & (df['sic'] <= 8099)), - ((3693 <= df['sic']) & (df['sic'] <= 3693)) | ((3840 <= df['sic']) & (df['sic'] <= 3849)) | - ((3850 <= df['sic']) & (df['sic'] <= 3851)), - ((2830 <= df['sic']) & (df['sic'] <= 2830)) | ((2831 <= df['sic']) & (df['sic'] <= 2831)) | - ((2833 <= df['sic']) & (df['sic'] <= 2833)) | ((2834 <= df['sic']) & (df['sic'] <= 2834)) | - ((2835 <= df['sic']) & (df['sic'] <= 2835)) | ((2836 <= df['sic']) & (df['sic'] <= 2836)), - ((2800 <= df['sic']) & (df['sic'] <= 2809)) | ((2810 <= df['sic']) & (df['sic'] <= 2819)) | - ((2820 <= df['sic']) & (df['sic'] 
<= 2829)) | ((2850 <= df['sic']) & (df['sic'] <= 2859)) | - ((2860 <= df['sic']) & (df['sic'] <= 2869)) | ((2870 <= df['sic']) & (df['sic'] <= 2879)) | - ((2890 <= df['sic']) & (df['sic'] <= 2899)), - ((3031 <= df['sic']) & (df['sic'] <= 3031)) | ((3041 <= df['sic']) & (df['sic'] <= 3041)) | - ((3050 <= df['sic']) & (df['sic'] <= 3053)) | ((3060 <= df['sic']) & (df['sic'] <= 3069)) | - ((3070 <= df['sic']) & (df['sic'] <= 3079)) | ((3080 <= df['sic']) & (df['sic'] <= 3089)) | - ((3090 <= df['sic']) & (df['sic'] <= 3099)), - ((2200 <= df['sic']) & (df['sic'] <= 2269)) | ((2270 <= df['sic']) & (df['sic'] <= 2279)) | - ((2280 <= df['sic']) & (df['sic'] <= 2284)) | ((2290 <= df['sic']) & (df['sic'] <= 2295)) | - ((2297 <= df['sic']) & (df['sic'] <= 2297)) | ((2298 <= df['sic']) & (df['sic'] <= 2298)) | - ((2299 <= df['sic']) & (df['sic'] <= 2299)) | ((2393 <= df['sic']) & (df['sic'] <= 2395)) | - ((2397 <= df['sic']) & (df['sic'] <= 2399)), - ((800 <= df['sic']) & (df['sic'] <= 899)) | ((2400 <= df['sic']) & (df['sic'] <= 2439)) | - ((2450 <= df['sic']) & (df['sic'] <= 2459)) | ((2490 <= df['sic']) & (df['sic'] <= 2499)) | - ((2660 <= df['sic']) & (df['sic'] <= 2661)) | ((2950 <= df['sic']) & (df['sic'] <= 2952)) | - ((3200 <= df['sic']) & (df['sic'] <= 3200)) | ((3210 <= df['sic']) & (df['sic'] <= 3211)) | - ((3240 <= df['sic']) & (df['sic'] <= 3241)) | ((3250 <= df['sic']) & (df['sic'] <= 3259)) | - ((3261 <= df['sic']) & (df['sic'] <= 3261)) | ((3264 <= df['sic']) & (df['sic'] <= 3264)) | - ((3270 <= df['sic']) & (df['sic'] <= 3275)) | ((3280 <= df['sic']) & (df['sic'] <= 3281)) | - ((3290 <= df['sic']) & (df['sic'] <= 3293)) | ((3295 <= df['sic']) & (df['sic'] <= 3299)) | - ((3420 <= df['sic']) & (df['sic'] <= 3429)) | ((3430 <= df['sic']) & (df['sic'] <= 3433)) | - ((3440 <= df['sic']) & (df['sic'] <= 3441)) | ((3442 <= df['sic']) & (df['sic'] <= 3442)) | - ((3446 <= df['sic']) & (df['sic'] <= 3446)) | ((3448 <= df['sic']) & (df['sic'] <= 3448)) | - ((3449 <= 
df['sic']) & (df['sic'] <= 3449)) | ((3450 <= df['sic']) & (df['sic'] <= 3451)) | - ((3452 <= df['sic']) & (df['sic'] <= 3452)) | ((3490 <= df['sic']) & (df['sic'] <= 3499)) | - ((3996 <= df['sic']) & (df['sic'] <= 3996)), - ((1500 <= df['sic']) & (df['sic'] <= 1511)) | ((1520 <= df['sic']) & (df['sic'] <= 1529)) | - ((1530 <= df['sic']) & (df['sic'] <= 1539)) | ((1540 <= df['sic']) & (df['sic'] <= 1549)) | - ((1600 <= df['sic']) & (df['sic'] <= 1699)) | ((1700 <= df['sic']) & (df['sic'] <= 1799)), - ((3300 <= df['sic']) & (df['sic'] <= 3300)) | ((3310 <= df['sic']) & (df['sic'] <= 3317)) | - ((3320 <= df['sic']) & (df['sic'] <= 3325)) | ((3330 <= df['sic']) & (df['sic'] <= 3339)) | - ((3340 <= df['sic']) & (df['sic'] <= 3341)) | ((3350 <= df['sic']) & (df['sic'] <= 3357)) | - ((3360 <= df['sic']) & (df['sic'] <= 3369)) | ((3370 <= df['sic']) & (df['sic'] <= 3379)) | - ((3390 <= df['sic']) & (df['sic'] <= 3399)), - ((3400 <= df['sic']) & (df['sic'] <= 3400)) | ((3443 <= df['sic']) & (df['sic'] <= 3443)) | - ((3444 <= df['sic']) & (df['sic'] <= 3444)) | ((3460 <= df['sic']) & (df['sic'] <= 3469)) | - ((3470 <= df['sic']) & (df['sic'] <= 3479)), - ((3510 <= df['sic']) & (df['sic'] <= 3519)) | ((3520 <= df['sic']) & (df['sic'] <= 3529)) | - ((3530 <= df['sic']) & (df['sic'] <= 3530)) | ((3531 <= df['sic']) & (df['sic'] <= 3531)) | - ((3532 <= df['sic']) & (df['sic'] <= 3532)) | ((3533 <= df['sic']) & (df['sic'] <= 3533)) | - ((3534 <= df['sic']) & (df['sic'] <= 3534)) | ((3535 <= df['sic']) & (df['sic'] <= 3535)) | - ((3536 <= df['sic']) & (df['sic'] <= 3536)) | ((3538 <= df['sic']) & (df['sic'] <= 3538)) | - ((3540 <= df['sic']) & (df['sic'] <= 3549)) | ((3550 <= df['sic']) & (df['sic'] <= 3559)) | - ((3560 <= df['sic']) & (df['sic'] <= 3569)) | ((3580 <= df['sic']) & (df['sic'] <= 3580)) | - ((3581 <= df['sic']) & (df['sic'] <= 3581)) | ((3582 <= df['sic']) & (df['sic'] <= 3582)) | - ((3585 <= df['sic']) & (df['sic'] <= 3585)) | ((3586 <= df['sic']) & (df['sic'] <= 
3586)) | - ((3589 <= df['sic']) & (df['sic'] <= 3589)) | ((3590 <= df['sic']) & (df['sic'] <= 3599)), - ((3600 <= df['sic']) & (df['sic'] <= 3600)) | ((3610 <= df['sic']) & (df['sic'] <= 3613)) | - ((3620 <= df['sic']) & (df['sic'] <= 3621)) | ((3623 <= df['sic']) & (df['sic'] <= 3629)) | - ((3640 <= df['sic']) & (df['sic'] <= 3644)) | ((3645 <= df['sic']) & (df['sic'] <= 3645)) | - ((3646 <= df['sic']) & (df['sic'] <= 3646)) | ((3648 <= df['sic']) & (df['sic'] <= 3649)) | - ((3660 <= df['sic']) & (df['sic'] <= 3660)) | ((3690 <= df['sic']) & (df['sic'] <= 3690)) | - ((3691 <= df['sic']) & (df['sic'] <= 3692)) | ((3699 <= df['sic']) & (df['sic'] <= 3699)), - ((2296 <= df['sic']) & (df['sic'] <= 2296)) | ((2396 <= df['sic']) & (df['sic'] <= 2396)) | - ((3010 <= df['sic']) & (df['sic'] <= 3011)) | ((3537 <= df['sic']) & (df['sic'] <= 3537)) | - ((3647 <= df['sic']) & (df['sic'] <= 3647)) | ((3694 <= df['sic']) & (df['sic'] <= 3694)) | - ((3700 <= df['sic']) & (df['sic'] <= 3700)) | ((3710 <= df['sic']) & (df['sic'] <= 3710)) | - ((3711 <= df['sic']) & (df['sic'] <= 3711)) | ((3713 <= df['sic']) & (df['sic'] <= 3713)) | - ((3714 <= df['sic']) & (df['sic'] <= 3714)) | ((3715 <= df['sic']) & (df['sic'] <= 3715)) | - ((3716 <= df['sic']) & (df['sic'] <= 3716)) | ((3792 <= df['sic']) & (df['sic'] <= 3792)) | - ((3790 <= df['sic']) & (df['sic'] <= 3791)) | ((3799 <= df['sic']) & (df['sic'] <= 3799)), - ((3720 <= df['sic']) & (df['sic'] <= 3720)) | ((3721 <= df['sic']) & (df['sic'] <= 3721)) | - ((3723 <= df['sic']) & (df['sic'] <= 3724)) | ((3725 <= df['sic']) & (df['sic'] <= 3725)) | - ((3728 <= df['sic']) & (df['sic'] <= 3729)), - ((3730 <= df['sic']) & (df['sic'] <= 3731)) | ((3740 <= df['sic']) & (df['sic'] <= 3743)), - ((3760 <= df['sic']) & (df['sic'] <= 3769)) | ((3795 <= df['sic']) & (df['sic'] <= 3795)) | - ((3480 <= df['sic']) & (df['sic'] <= 3489)), - ((1040 <= df['sic']) & (df['sic'] <= 1049)), - ((1000 <= df['sic']) & (df['sic'] <= 1009)) | ((1010 <= 
df['sic']) & (df['sic'] <= 1019)) | - ((1020 <= df['sic']) & (df['sic'] <= 1029)) | ((1030 <= df['sic']) & (df['sic'] <= 1039)) | - ((1050 <= df['sic']) & (df['sic'] <= 1059)) | ((1060 <= df['sic']) & (df['sic'] <= 1069)) | - ((1070 <= df['sic']) & (df['sic'] <= 1079)) | ((1080 <= df['sic']) & (df['sic'] <= 1089)) | - ((1090 <= df['sic']) & (df['sic'] <= 1099)) | ((1100 <= df['sic']) & (df['sic'] <= 1119)) | - ((1400 <= df['sic']) & (df['sic'] <= 1499)), - ((1200 <= df['sic']) & (df['sic'] <= 1299)), - ((1300 <= df['sic']) & (df['sic'] <= 1300)) | ((1310 <= df['sic']) & (df['sic'] <= 1319)) | - ((1320 <= df['sic']) & (df['sic'] <= 1329)) | ((1330 <= df['sic']) & (df['sic'] <= 1339)) | - ((1370 <= df['sic']) & (df['sic'] <= 1379)) | ((1380 <= df['sic']) & (df['sic'] <= 1380)) | - ((1381 <= df['sic']) & (df['sic'] <= 1381)) | ((1382 <= df['sic']) & (df['sic'] <= 1382)) | - ((1389 <= df['sic']) & (df['sic'] <= 1389)) | ((2900 <= df['sic']) & (df['sic'] <= 2912)) | - ((2990 <= df['sic']) & (df['sic'] <= 2999)), - ((4900 <= df['sic']) & (df['sic'] <= 4900)) | ((4910 <= df['sic']) & (df['sic'] <= 4911)) | - ((4920 <= df['sic']) & (df['sic'] <= 4922)) | ((4923 <= df['sic']) & (df['sic'] <= 4923)) | - ((4924 <= df['sic']) & (df['sic'] <= 4925)) | ((4930 <= df['sic']) & (df['sic'] <= 4931)) | - ((4932 <= df['sic']) & (df['sic'] <= 4932)) | ((4939 <= df['sic']) & (df['sic'] <= 4939)) | - ((4940 <= df['sic']) & (df['sic'] <= 4942)), - ((4800 <= df['sic']) & (df['sic'] <= 4800)) | ((4810 <= df['sic']) & (df['sic'] <= 4813)) | - ((4820 <= df['sic']) & (df['sic'] <= 4822)) | ((4830 <= df['sic']) & (df['sic'] <= 4839)) | - ((4840 <= df['sic']) & (df['sic'] <= 4841)) | ((4880 <= df['sic']) & (df['sic'] <= 4889)) | - ((4890 <= df['sic']) & (df['sic'] <= 4890)) | ((4891 <= df['sic']) & (df['sic'] <= 4891)) | - ((4892 <= df['sic']) & (df['sic'] <= 4892)) | ((4899 <= df['sic']) & (df['sic'] <= 4899)), - ((7020 <= df['sic']) & (df['sic'] <= 7021)) | ((7030 <= df['sic']) & (df['sic'] <= 
7033)) | - ((7200 <= df['sic']) & (df['sic'] <= 7200)) | ((7210 <= df['sic']) & (df['sic'] <= 7212)) | - ((7214 <= df['sic']) & (df['sic'] <= 7214)) | ((7215 <= df['sic']) & (df['sic'] <= 7216)) | - ((7217 <= df['sic']) & (df['sic'] <= 7217)) | ((7219 <= df['sic']) & (df['sic'] <= 7219)) | - ((7220 <= df['sic']) & (df['sic'] <= 7221)) | ((7230 <= df['sic']) & (df['sic'] <= 7231)) | - ((7240 <= df['sic']) & (df['sic'] <= 7241)) | ((7250 <= df['sic']) & (df['sic'] <= 7251)) | - ((7260 <= df['sic']) & (df['sic'] <= 7269)) | ((7270 <= df['sic']) & (df['sic'] <= 7290)) | - ((7291 <= df['sic']) & (df['sic'] <= 7291)) | ((7292 <= df['sic']) & (df['sic'] <= 7299)) | - ((7395 <= df['sic']) & (df['sic'] <= 7395)) | ((7500 <= df['sic']) & (df['sic'] <= 7500)) | - ((7520 <= df['sic']) & (df['sic'] <= 7529)) | ((7530 <= df['sic']) & (df['sic'] <= 7539)) | - ((7540 <= df['sic']) & (df['sic'] <= 7549)) | ((7600 <= df['sic']) & (df['sic'] <= 7600)) | - ((7620 <= df['sic']) & (df['sic'] <= 7620)) | ((7622 <= df['sic']) & (df['sic'] <= 7622)) | - ((7623 <= df['sic']) & (df['sic'] <= 7623)) | ((7629 <= df['sic']) & (df['sic'] <= 7629)) | - ((7630 <= df['sic']) & (df['sic'] <= 7631)) | ((7640 <= df['sic']) & (df['sic'] <= 7641)) | - ((7690 <= df['sic']) & (df['sic'] <= 7699)) | ((8100 <= df['sic']) & (df['sic'] <= 8199)) | - ((8200 <= df['sic']) & (df['sic'] <= 8299)) | ((8300 <= df['sic']) & (df['sic'] <= 8399)) | - ((8400 <= df['sic']) & (df['sic'] <= 8499)) | ((8600 <= df['sic']) & (df['sic'] <= 8699)) | - ((8800 <= df['sic']) & (df['sic'] <= 8899)) | ((7510 <= df['sic']) & (df['sic'] <= 7515)), - ((2750 <= df['sic']) & (df['sic'] <= 2759)) | ((3993 <= df['sic']) & (df['sic'] <= 3993)) | - ((7218 <= df['sic']) & (df['sic'] <= 7218)) | ((7300 <= df['sic']) & (df['sic'] <= 7300)) | - ((7310 <= df['sic']) & (df['sic'] <= 7319)) | ((7320 <= df['sic']) & (df['sic'] <= 7329)) | - ((7330 <= df['sic']) & (df['sic'] <= 7339)) | ((7340 <= df['sic']) & (df['sic'] <= 7342)) | - ((7349 <= 
df['sic']) & (df['sic'] <= 7349)) | ((7350 <= df['sic']) & (df['sic'] <= 7351)) | - ((7352 <= df['sic']) & (df['sic'] <= 7352)) | ((7353 <= df['sic']) & (df['sic'] <= 7353)) | - ((7359 <= df['sic']) & (df['sic'] <= 7359)) | ((7360 <= df['sic']) & (df['sic'] <= 7369)) | - ((7374 <= df['sic']) & (df['sic'] <= 7374)) | ((7376 <= df['sic']) & (df['sic'] <= 7376)) | - ((7377 <= df['sic']) & (df['sic'] <= 7377)) | ((7378 <= df['sic']) & (df['sic'] <= 7378)) | - ((7379 <= df['sic']) & (df['sic'] <= 7379)) | ((7380 <= df['sic']) & (df['sic'] <= 7380)) | - ((7381 <= df['sic']) & (df['sic'] <= 7382)) | ((7383 <= df['sic']) & (df['sic'] <= 7383)) | - ((7384 <= df['sic']) & (df['sic'] <= 7384)) | ((7385 <= df['sic']) & (df['sic'] <= 7385)) | - ((7389 <= df['sic']) & (df['sic'] <= 7390)) | ((7391 <= df['sic']) & (df['sic'] <= 7391)) | - ((7392 <= df['sic']) & (df['sic'] <= 7392)) | ((7393 <= df['sic']) & (df['sic'] <= 7393)) | - ((7394 <= df['sic']) & (df['sic'] <= 7394)) | ((7396 <= df['sic']) & (df['sic'] <= 7396)) | - ((7397 <= df['sic']) & (df['sic'] <= 7397)) | ((7399 <= df['sic']) & (df['sic'] <= 7399)) | - ((7519 <= df['sic']) & (df['sic'] <= 7519)) | ((8700 <= df['sic']) & (df['sic'] <= 8700)) | - ((8710 <= df['sic']) & (df['sic'] <= 8713)) | ((8720 <= df['sic']) & (df['sic'] <= 8721)) | - ((8730 <= df['sic']) & (df['sic'] <= 8734)) | ((8740 <= df['sic']) & (df['sic'] <= 8748)) | - ((8900 <= df['sic']) & (df['sic'] <= 8910)) | ((8911 <= df['sic']) & (df['sic'] <= 8911)) | - ((8920 <= df['sic']) & (df['sic'] <= 8999)) | ((4220 <= df['sic']) & (df['sic'] <= 4229)), - ((3570 <= df['sic']) & (df['sic'] <= 3579)) | ((3680 <= df['sic']) & (df['sic'] <= 3680)) | - ((3681 <= df['sic']) & (df['sic'] <= 3681)) | ((3682 <= df['sic']) & (df['sic'] <= 3682)) | - ((3683 <= df['sic']) & (df['sic'] <= 3683)) | ((3684 <= df['sic']) & (df['sic'] <= 3684)) | - ((3685 <= df['sic']) & (df['sic'] <= 3685)) | ((3686 <= df['sic']) & (df['sic'] <= 3686)) | - ((3687 <= df['sic']) & (df['sic'] <= 
3687)) | ((3688 <= df['sic']) & (df['sic'] <= 3688)) | - ((3689 <= df['sic']) & (df['sic'] <= 3689)) | ((3695 <= df['sic']) & (df['sic'] <= 3695)), - ((7370 <= df['sic']) & (df['sic'] <= 7372)) | ((7375 <= df['sic']) & (df['sic'] <= 7375)) | - ((7373 <= df['sic']) & (df['sic'] <= 7373)), - ((3622 <= df['sic']) & (df['sic'] <= 3622)) | ((3661 <= df['sic']) & (df['sic'] <= 3661)) | - ((3662 <= df['sic']) & (df['sic'] <= 3662)) | ((3663 <= df['sic']) & (df['sic'] <= 3663)) | - ((3664 <= df['sic']) & (df['sic'] <= 3664)) | ((3665 <= df['sic']) & (df['sic'] <= 3665)) | - ((3666 <= df['sic']) & (df['sic'] <= 3666)) | ((3669 <= df['sic']) & (df['sic'] <= 3669)) | - ((3670 <= df['sic']) & (df['sic'] <= 3679)) | ((3810 <= df['sic']) & (df['sic'] <= 3810)) | - ((3812 <= df['sic']) & (df['sic'] <= 3812)), - ((3811 <= df['sic']) & (df['sic'] <= 3811)) | ((3820 <= df['sic']) & (df['sic'] <= 3820)) | - ((3821 <= df['sic']) & (df['sic'] <= 3821)) | ((3822 <= df['sic']) & (df['sic'] <= 3822)) | - ((3823 <= df['sic']) & (df['sic'] <= 3823)) | ((3824 <= df['sic']) & (df['sic'] <= 3824)) | - ((3825 <= df['sic']) & (df['sic'] <= 3825)) | ((3826 <= df['sic']) & (df['sic'] <= 3826)) | - ((3827 <= df['sic']) & (df['sic'] <= 3827)) | ((3829 <= df['sic']) & (df['sic'] <= 3829)) | - ((3830 <= df['sic']) & (df['sic'] <= 3839)), - ((2520 <= df['sic']) & (df['sic'] <= 2549)) | ((2600 <= df['sic']) & (df['sic'] <= 2639)) | - ((2670 <= df['sic']) & (df['sic'] <= 2699)) | ((2760 <= df['sic']) & (df['sic'] <= 2761)) | - ((3950 <= df['sic']) & (df['sic'] <= 3955)), - ((2440 <= df['sic']) & (df['sic'] <= 2449)) | ((2640 <= df['sic']) & (df['sic'] <= 2659)) | - ((3220 <= df['sic']) & (df['sic'] <= 3221)) | ((3410 <= df['sic']) & (df['sic'] <= 3412)), - ((4000 <= df['sic']) & (df['sic'] <= 4013)) | ((4040 <= df['sic']) & (df['sic'] <= 4049)) | - ((4100 <= df['sic']) & (df['sic'] <= 4100)) | ((4110 <= df['sic']) & (df['sic'] <= 4119)) | - ((4120 <= df['sic']) & (df['sic'] <= 4121)) | ((4130 <= 
df['sic']) & (df['sic'] <= 4131)) | - ((4140 <= df['sic']) & (df['sic'] <= 4142)) | ((4150 <= df['sic']) & (df['sic'] <= 4151)) | - ((4170 <= df['sic']) & (df['sic'] <= 4173)) | ((4190 <= df['sic']) & (df['sic'] <= 4199)) | - ((4200 <= df['sic']) & (df['sic'] <= 4200)) | ((4210 <= df['sic']) & (df['sic'] <= 4219)) | - ((4230 <= df['sic']) & (df['sic'] <= 4231)) | ((4240 <= df['sic']) & (df['sic'] <= 4249)) | - ((4400 <= df['sic']) & (df['sic'] <= 4499)) | ((4500 <= df['sic']) & (df['sic'] <= 4599)) | - ((4600 <= df['sic']) & (df['sic'] <= 4699)) | ((4700 <= df['sic']) & (df['sic'] <= 4700)) | - ((4710 <= df['sic']) & (df['sic'] <= 4712)) | ((4720 <= df['sic']) & (df['sic'] <= 4729)) | - ((4730 <= df['sic']) & (df['sic'] <= 4739)) | ((4740 <= df['sic']) & (df['sic'] <= 4749)) | - ((4780 <= df['sic']) & (df['sic'] <= 4780)) | ((4782 <= df['sic']) & (df['sic'] <= 4782)) | - ((4783 <= df['sic']) & (df['sic'] <= 4783)) | ((4784 <= df['sic']) & (df['sic'] <= 4784)) | - ((4785 <= df['sic']) & (df['sic'] <= 4785)) | ((4789 <= df['sic']) & (df['sic'] <= 4789)), - ((5000 <= df['sic']) & (df['sic'] <= 5000)) | ((5010 <= df['sic']) & (df['sic'] <= 5015)) | - ((5020 <= df['sic']) & (df['sic'] <= 5023)) | ((5030 <= df['sic']) & (df['sic'] <= 5039)) | - ((5040 <= df['sic']) & (df['sic'] <= 5042)) | ((5043 <= df['sic']) & (df['sic'] <= 5043)) | - ((5044 <= df['sic']) & (df['sic'] <= 5044)) | ((5045 <= df['sic']) & (df['sic'] <= 5045)) | - ((5046 <= df['sic']) & (df['sic'] <= 5046)) | ((5047 <= df['sic']) & (df['sic'] <= 5047)) | - ((5048 <= df['sic']) & (df['sic'] <= 5048)) | ((5049 <= df['sic']) & (df['sic'] <= 5049)) | - ((5050 <= df['sic']) & (df['sic'] <= 5059)) | ((5060 <= df['sic']) & (df['sic'] <= 5060)) | - ((5063 <= df['sic']) & (df['sic'] <= 5063)) | ((5064 <= df['sic']) & (df['sic'] <= 5064)) | - ((5065 <= df['sic']) & (df['sic'] <= 5065)) | ((5070 <= df['sic']) & (df['sic'] <= 5078)) | - ((5080 <= df['sic']) & (df['sic'] <= 5080)) | ((5081 <= df['sic']) & (df['sic'] <= 
5081)) | - ((5082 <= df['sic']) & (df['sic'] <= 5082)) | ((5083 <= df['sic']) & (df['sic'] <= 5083)) | - ((5084 <= df['sic']) & (df['sic'] <= 5084)) | ((5085 <= df['sic']) & (df['sic'] <= 5085)) | - ((5086 <= df['sic']) & (df['sic'] <= 5087)) | ((5088 <= df['sic']) & (df['sic'] <= 5088)) | - ((5090 <= df['sic']) & (df['sic'] <= 5090)) | ((5091 <= df['sic']) & (df['sic'] <= 5092)) | - ((5093 <= df['sic']) & (df['sic'] <= 5093)) | ((5094 <= df['sic']) & (df['sic'] <= 5094)) | - ((5099 <= df['sic']) & (df['sic'] <= 5099)) | ((5100 <= df['sic']) & (df['sic'] <= 5100)) | - ((5110 <= df['sic']) & (df['sic'] <= 5113)) | ((5120 <= df['sic']) & (df['sic'] <= 5122)) | - ((5130 <= df['sic']) & (df['sic'] <= 5139)) | ((5140 <= df['sic']) & (df['sic'] <= 5149)) | - ((5150 <= df['sic']) & (df['sic'] <= 5159)) | ((5160 <= df['sic']) & (df['sic'] <= 5169)) | - ((5170 <= df['sic']) & (df['sic'] <= 5172)) | ((5180 <= df['sic']) & (df['sic'] <= 5182)) | - ((5190 <= df['sic']) & (df['sic'] <= 5199)), - ((5200 <= df['sic']) & (df['sic'] <= 5200)) | ((5210 <= df['sic']) & (df['sic'] <= 5219)) | - ((5220 <= df['sic']) & (df['sic'] <= 5229)) | ((5230 <= df['sic']) & (df['sic'] <= 5231)) | - ((5250 <= df['sic']) & (df['sic'] <= 5251)) | ((5260 <= df['sic']) & (df['sic'] <= 5261)) | - ((5270 <= df['sic']) & (df['sic'] <= 5271)) | ((5300 <= df['sic']) & (df['sic'] <= 5300)) | - ((5310 <= df['sic']) & (df['sic'] <= 5311)) | ((5320 <= df['sic']) & (df['sic'] <= 5320)) | - ((5330 <= df['sic']) & (df['sic'] <= 5331)) | ((5334 <= df['sic']) & (df['sic'] <= 5334)) | - ((5340 <= df['sic']) & (df['sic'] <= 5349)) | ((5390 <= df['sic']) & (df['sic'] <= 5399)) | - ((5400 <= df['sic']) & (df['sic'] <= 5400)) | ((5410 <= df['sic']) & (df['sic'] <= 5411)) | - ((5412 <= df['sic']) & (df['sic'] <= 5412)) | ((5420 <= df['sic']) & (df['sic'] <= 5429)) | - ((5430 <= df['sic']) & (df['sic'] <= 5439)) | ((5440 <= df['sic']) & (df['sic'] <= 5449)) | - ((5450 <= df['sic']) & (df['sic'] <= 5459)) | ((5460 <= 
df['sic']) & (df['sic'] <= 5469)) | - ((5490 <= df['sic']) & (df['sic'] <= 5499)) | ((5500 <= df['sic']) & (df['sic'] <= 5500)) | - ((5510 <= df['sic']) & (df['sic'] <= 5529)) | ((5530 <= df['sic']) & (df['sic'] <= 5539)) | - ((5540 <= df['sic']) & (df['sic'] <= 5549)) | ((5550 <= df['sic']) & (df['sic'] <= 5559)) | - ((5560 <= df['sic']) & (df['sic'] <= 5569)) | ((5570 <= df['sic']) & (df['sic'] <= 5579)) | - ((5590 <= df['sic']) & (df['sic'] <= 5599)) | ((5600 <= df['sic']) & (df['sic'] <= 5699)) | - ((5700 <= df['sic']) & (df['sic'] <= 5700)) | ((5710 <= df['sic']) & (df['sic'] <= 5719)) | - ((5720 <= df['sic']) & (df['sic'] <= 5722)) | ((5730 <= df['sic']) & (df['sic'] <= 5733)) | - ((5734 <= df['sic']) & (df['sic'] <= 5734)) | ((5735 <= df['sic']) & (df['sic'] <= 5735)) | - ((5736 <= df['sic']) & (df['sic'] <= 5736)) | ((5750 <= df['sic']) & (df['sic'] <= 5799)) | - ((5900 <= df['sic']) & (df['sic'] <= 5900)) | ((5910 <= df['sic']) & (df['sic'] <= 5912)) | - ((5920 <= df['sic']) & (df['sic'] <= 5929)) | ((5930 <= df['sic']) & (df['sic'] <= 5932)) | - ((5940 <= df['sic']) & (df['sic'] <= 5940)) | ((5941 <= df['sic']) & (df['sic'] <= 5941)) | - ((5942 <= df['sic']) & (df['sic'] <= 5942)) | ((5943 <= df['sic']) & (df['sic'] <= 5943)) | - ((5944 <= df['sic']) & (df['sic'] <= 5944)) | ((5945 <= df['sic']) & (df['sic'] <= 5945)) | - ((5946 <= df['sic']) & (df['sic'] <= 5946)) | ((5947 <= df['sic']) & (df['sic'] <= 5947)) | - ((5948 <= df['sic']) & (df['sic'] <= 5948)) | ((5949 <= df['sic']) & (df['sic'] <= 5949)) | - ((5950 <= df['sic']) & (df['sic'] <= 5959)) | ((5960 <= df['sic']) & (df['sic'] <= 5969)) | - ((5970 <= df['sic']) & (df['sic'] <= 5979)) | ((5980 <= df['sic']) & (df['sic'] <= 5989)) | - ((5990 <= df['sic']) & (df['sic'] <= 5990)) | ((5992 <= df['sic']) & (df['sic'] <= 5992)) | - ((5993 <= df['sic']) & (df['sic'] <= 5993)) | ((5994 <= df['sic']) & (df['sic'] <= 5994)) | - ((5995 <= df['sic']) & (df['sic'] <= 5995)) | ((5999 <= df['sic']) & (df['sic'] 
<= 5999)), - ((5800 <= df['sic']) & (df['sic'] <= 5819)) | ((5820 <= df['sic']) & (df['sic'] <= 5829)) | - ((5890 <= df['sic']) & (df['sic'] <= 5899)) | ((7000 <= df['sic']) & (df['sic'] <= 7000)) | - ((7010 <= df['sic']) & (df['sic'] <= 7019)) | ((7040 <= df['sic']) & (df['sic'] <= 7049)) | - ((7213 <= df['sic']) & (df['sic'] <= 7213)), - ((6000 <= df['sic']) & (df['sic'] <= 6000)) | ((6010 <= df['sic']) & (df['sic'] <= 6019)) | - ((6020 <= df['sic']) & (df['sic'] <= 6020)) | ((6021 <= df['sic']) & (df['sic'] <= 6021)) | - ((6022 <= df['sic']) & (df['sic'] <= 6022)) | ((6023 <= df['sic']) & (df['sic'] <= 6024)) | - ((6025 <= df['sic']) & (df['sic'] <= 6025)) | ((6026 <= df['sic']) & (df['sic'] <= 6026)) | - ((6027 <= df['sic']) & (df['sic'] <= 6027)) | ((6028 <= df['sic']) & (df['sic'] <= 6029)) | - ((6030 <= df['sic']) & (df['sic'] <= 6036)) | ((6040 <= df['sic']) & (df['sic'] <= 6059)) | - ((6060 <= df['sic']) & (df['sic'] <= 6062)) | ((6080 <= df['sic']) & (df['sic'] <= 6082)) | - ((6090 <= df['sic']) & (df['sic'] <= 6099)) | ((6100 <= df['sic']) & (df['sic'] <= 6100)) | - ((6110 <= df['sic']) & (df['sic'] <= 6111)) | ((6112 <= df['sic']) & (df['sic'] <= 6113)) | - ((6120 <= df['sic']) & (df['sic'] <= 6129)) | ((6130 <= df['sic']) & (df['sic'] <= 6139)) | - ((6140 <= df['sic']) & (df['sic'] <= 6149)) | ((6150 <= df['sic']) & (df['sic'] <= 6159)) | - ((6160 <= df['sic']) & (df['sic'] <= 6169)) | ((6170 <= df['sic']) & (df['sic'] <= 6179)) | - ((6190 <= df['sic']) & (df['sic'] <= 6199)), - ((6300 <= df['sic']) & (df['sic'] <= 6300)) | ((6310 <= df['sic']) & (df['sic'] <= 6319)) | - ((6320 <= df['sic']) & (df['sic'] <= 6329)) | ((6330 <= df['sic']) & (df['sic'] <= 6331)) | - ((6350 <= df['sic']) & (df['sic'] <= 6351)) | ((6360 <= df['sic']) & (df['sic'] <= 6361)) | - ((6370 <= df['sic']) & (df['sic'] <= 6379)) | ((6390 <= df['sic']) & (df['sic'] <= 6399)) | - ((6400 <= df['sic']) & (df['sic'] <= 6411)), - ((6500 <= df['sic']) & (df['sic'] <= 6500)) | ((6510 <= 
df['sic']) & (df['sic'] <= 6510)) | - ((6512 <= df['sic']) & (df['sic'] <= 6512)) | ((6513 <= df['sic']) & (df['sic'] <= 6513)) | - ((6514 <= df['sic']) & (df['sic'] <= 6514)) | ((6515 <= df['sic']) & (df['sic'] <= 6515)) | - ((6517 <= df['sic']) & (df['sic'] <= 6519)) | ((6520 <= df['sic']) & (df['sic'] <= 6529)) | - ((6530 <= df['sic']) & (df['sic'] <= 6531)) | ((6532 <= df['sic']) & (df['sic'] <= 6532)) | - ((6540 <= df['sic']) & (df['sic'] <= 6541)) | ((6550 <= df['sic']) & (df['sic'] <= 6553)) | - ((6590 <= df['sic']) & (df['sic'] <= 6599)) | ((6610 <= df['sic']) & (df['sic'] <= 6611)), - ((6200 <= df['sic']) & (df['sic'] <= 6299)) | ((6700 <= df['sic']) & (df['sic'] <= 6700)) | - ((6710 <= df['sic']) & (df['sic'] <= 6719)) | ((6720 <= df['sic']) & (df['sic'] <= 6722)) | - ((6723 <= df['sic']) & (df['sic'] <= 6723)) | ((6724 <= df['sic']) & (df['sic'] <= 6724)) | - ((6725 <= df['sic']) & (df['sic'] <= 6725)) | ((6726 <= df['sic']) & (df['sic'] <= 6726)) | - ((6730 <= df['sic']) & (df['sic'] <= 6733)) | ((6740 <= df['sic']) & (df['sic'] <= 6779)) | - ((6790 <= df['sic']) & (df['sic'] <= 6791)) | ((6792 <= df['sic']) & (df['sic'] <= 6792)) | - ((6793 <= df['sic']) & (df['sic'] <= 6793)) | ((6794 <= df['sic']) & (df['sic'] <= 6794)) | - ((6795 <= df['sic']) & (df['sic'] <= 6795)) | ((6798 <= df['sic']) & (df['sic'] <= 6798)) | - ((6799 <= df['sic']) & (df['sic'] <= 6799)), - ((4950 <= df['sic']) & (df['sic'] <= 4959)) | ((4960 <= df['sic']) & (df['sic'] <= 4961)) | - ((4970 <= df['sic']) & (df['sic'] <= 4971)) | ((4990 <= df['sic']) & (df['sic'] <= 4991))] - choicelist = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, - 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49] - return np.select(condlist, choicelist, default=np.nan) - - -def fillna_atq(df_q, df_a): - # fina columns are na in df_q and exist in df_a - df_q_na_list = df_q.columns[df_q.isna().any()].tolist() - 
df_a_columns_list = df_a.columns.values.tolist() - list_temp = list(set(df_q_na_list) & set(df_a_columns_list)) - # remove mom columns, mom chars are same in annual and quarterly - na_columns_list = [] - for i in list_temp: - if re.match(r'mom.', i) is None: - na_columns_list.append(i) - # get annual columns from df_a - df_temp = df_a[na_columns_list].copy() - df_temp[['permno', 'date']] = df_a[['permno', 'date']].copy() - # rename annual columns in the form of 'chars_a' - for na_column in na_columns_list: - df_temp = df_temp.rename(columns={'%s' % na_column: '%s_a' % na_column}) - df_temp = df_temp.reset_index(drop=True) - # use annual chars to fill quarterly na - df_q = pd.merge(df_q, df_temp, how='left', on=['permno', 'date']) - for na_column in na_columns_list: - df_q['%s' % na_column] = np.where(df_q['%s' % na_column].isnull(), df_q['%s_a' % na_column], df_q['%s' % na_column]) - df_q = df_q.drop(['%s_a' % na_column], axis=1) - return df_q - - -def fillna_ind(df, method, ffi): - df_fill = pd.DataFrame() - na_columns_list = df.columns[df.isna().any()].tolist() - for na_column in na_columns_list: - if method == 'mean': - df_temp = df.groupby(['date', 'ffi%s' % ffi])['%s' % na_column].mean() - elif method == 'median': - df_temp = df.groupby(['date', 'ffi%s' % ffi])['%s' % na_column].median() - else: - None - df_fill = pd.concat([df_fill, df_temp], axis=1) - if method == 'mean': - df_fill = df_fill.rename(columns={'%s' % na_column: '%s_mean' % na_column}) - elif method == 'median': - df_fill = df_fill.rename(columns={'%s' % na_column: '%s_median' % na_column}) - else: - None - df_fill = df_fill.reset_index() - # reset multiple index to date and ffi code - df_fill['index'] = df_fill['index'].astype(str) - index_temp = df_fill['index'].str.split(',', expand=True) - index_temp.columns = ['date', 'ffi%s' % ffi] - index_temp['date'] = index_temp['date'].str.strip('(Timestamp(\' \')') - index_temp['ffi%s' % ffi] = index_temp['ffi%s' % ffi].str.strip(')') - 
df_fill[['date', 'ffi%s' % ffi]] = index_temp[['date', 'ffi%s' % ffi]] - df_fill = df_fill.drop(['index'], axis=1) - df_fill['date'] = pd.to_datetime(df_fill['date']) - df_fill['ffi49'] = df_fill['ffi49'].astype(int) - # fill na - df = pd.merge(df, df_fill, how='left', on=['date', 'ffi%s' % ffi]) - for na_column in na_columns_list: - if method == 'mean': - df['%s' % na_column] = df['%s' % na_column].fillna(df['%s_mean' % na_column]) - df = df.drop(['%s_mean' % na_column], axis=1) - elif method == 'median': - df['%s' % na_column] = df['%s' % na_column].fillna(df['%s_median' % na_column]) - df = df.drop(['%s_median' % na_column], axis=1) - else: - None - return df - - -def fillna_all(df, method): - df_fill = pd.DataFrame() - na_columns_list = df.columns[df.isna().any()].tolist() - for na_column in na_columns_list: - if method == 'mean': - df_temp = df.groupby(['date'])['%s' % na_column].mean() - elif method == 'median': - df_temp = df.groupby(['date'])['%s' % na_column].median() - else: - None - df_fill = pd.concat([df_fill, df_temp], axis=1) - if method == 'mean': - df_fill = df_fill.rename(columns={'%s' % na_column: '%s_mean' % na_column}) - elif method == 'median': - df_fill = df_fill.rename(columns={'%s' % na_column: '%s_median' % na_column}) - else: - None - df_fill = df_fill.reset_index() - # reset multiple index to date and ffi code - df_fill['index'] = df_fill['index'].astype(str) - index_temp = df_fill['index'].str.split(',', expand=True) - index_temp.columns = ['date'] - index_temp['date'] = index_temp['date'].str.strip('(Timestamp(\' \')') - df_fill[['date']] = index_temp[['date']] - df_fill = df_fill.drop(['index'], axis=1) - df_fill['date'] = pd.to_datetime(df_fill['date']) - # fill na - df = pd.merge(df, df_fill, how='left', on='date') - for na_column in na_columns_list: - if method == 'mean': - df['%s' % na_column] = df['%s' % na_column].fillna(df['%s_mean' % na_column]) - df = df.drop(['%s_mean' % na_column], axis=1) - elif method == 'median': - 
df['%s' % na_column] = df['%s' % na_column].fillna(df['%s_median' % na_column]) - df = df.drop(['%s_median' % na_column], axis=1) - else: - None - return df - - -def standardize(df): - # exclude the the information columns - col_names = df.columns.values.tolist() - list_to_remove = ['permno', 'date', 'date', 'datadate', 'gvkey', 'sic', 'count', 'exchcd', 'shrcd', 'ffi49', 'ret', - 'retadj', 'retx', 'lag_me'] - col_names = list(set(col_names).difference(set(list_to_remove))) - for col_name in tqdm(col_names): - print('processing %s' % col_name) - # count the non-missing number of factors, we only count non-missing values - unique_count = df.dropna(subset=['%s' % col_name]).groupby(['date'])['%s' % col_name].unique().apply(len) - unique_count = pd.DataFrame(unique_count).reset_index() - unique_count.columns = ['date', 'count'] - df = pd.merge(df, unique_count, how='left', on=['date']) - # ranking, and then standardize the data - df['%s_rank' % col_name] = df.groupby(['date'])['%s' % col_name].rank(method='dense') - df['rank_%s' % col_name] = (df['%s_rank' % col_name] - 1) / (df['count'] - 1) * 2 - 1 - df = df.drop(['%s_rank' % col_name, '%s' % col_name, 'count'], axis=1) - df = df.fillna(0) - return df diff --git a/char60/iclink.py b/char60/iclink.py deleted file mode 100755 index c630697..0000000 --- a/char60/iclink.py +++ /dev/null @@ -1,241 +0,0 @@ -import pandas as pd -import numpy as np -import datetime as dt -import wrds -from dateutil.relativedelta import * -from pandas.tseries.offsets import * -from pandasql import * -from fuzzywuzzy import fuzz - -# reference: https://wrds-www.wharton.upenn.edu/pages/support/applications/python-replications/linking-ibes-and-crsp-data-python/ -##################################### -# ICLINK: Link CRSP and IBES # -# June 2019 # -# Qingyi (Freda) Song Drechsler # -##################################### - -# This program replicates the SAS macro ICLINK -# to create a linking table between CRSP and IBES -# Output is a score 
reflecting the quality of the link -# Score = 0 (best link) to Score = 6 (worst link) -# -# More explanation on score system: -# - 0: BEST match: using (cusip, cusip dates and company names) -# or (exchange ticker, company names and 6-digit cusip) -# - 1: Cusips and cusip dates match but company names do not match -# - 2: Cusips and company names match but cusip dates do not match -# - 3: Cusips match but cusip dates and company names do not match -# - 4: tickers and 6-digit cusips match but company names do not match -# - 5: tickers and company names match but 6-digit cusips do not match -# - 6: tickers match but company names and 6-digit cusips do not match - -################### -# Connect to WRDS # -################### -conn = wrds.Connection() - -######################### -# Step 1: Link by CUSIP # -######################### - -# 1.1 IBES: Get the list of IBES Tickers for US firms in IBES -_ibes1 = conn.raw_sql(""" - select ticker, cusip, cname, sdates from ibes.id - where usfirm=1 and cusip != '' - """) - -# Create first and last 'start dates' for a given cusip -# Use agg min and max to find the first and last date per group -# then rename to fdate and ldate respectively - -_ibes1_date = _ibes1.groupby(['ticker','cusip']).sdates.agg(['min', 'max'])\ -.reset_index().rename(columns={'min':'fdate', 'max':'ldate'}) - -# merge fdate ldate back to _ibes1 data -_ibes2 = pd.merge(_ibes1, _ibes1_date,how='left', on =['ticker','cusip']) -_ibes2 = _ibes2.sort_values(by=['ticker','cusip','sdates']) - -# keep only the most recent company name -# determined by having sdates = ldate -_ibes2 = _ibes2.loc[_ibes2.sdates == _ibes2.ldate].drop(['sdates'], axis=1) - -# 1.2 CRSP: Get all permno-ncusip combinations -_crsp1 = conn.raw_sql(""" - select permno, ncusip, comnam, namedt, nameenddt - from crsp.stocknames - where ncusip != '' - """) - -# first namedt -_crsp1_fnamedt = _crsp1.groupby(['permno','ncusip']).namedt.min().reset_index() - -# last nameenddt -_crsp1_lnameenddt = 
_crsp1.groupby(['permno','ncusip']).nameenddt.max().reset_index() - -# merge both -_crsp1_dtrange = pd.merge(_crsp1_fnamedt, _crsp1_lnameenddt, \ - on = ['permno','ncusip'], how='inner') - -# replace namedt and nameenddt with the version from the dtrange -_crsp1 = _crsp1.drop(['namedt'],axis=1).rename(columns={'nameenddt':'enddt'}) -_crsp2 = pd.merge(_crsp1, _crsp1_dtrange, on =['permno','ncusip'], how='inner') - -# keep only most recent company name -_crsp2 = _crsp2.loc[_crsp2.enddt ==_crsp2.nameenddt].drop(['enddt'], axis=1) - -# 1.3 Create CUSIP Link Table - -# Link by full cusip, company names and dates -_link1_1 = pd.merge(_ibes2, _crsp2, how='inner', left_on='cusip', right_on='ncusip')\ -.sort_values(['ticker','permno','ldate']) - -# Keep link with most recent company name -_link1_1_tmp = _link1_1.groupby(['ticker','permno']).ldate.max().reset_index() -_link1_2 = pd.merge(_link1_1, _link1_1_tmp, how='inner', on =['ticker', 'permno', 'ldate']) - - -# Calculate name matching ratio using FuzzyWuzzy - -# Note: fuzz ratio = 100 -> match perfectly -# fuzz ratio = 0 -> do not match at all - -# Comment: token_set_ratio is more flexible in matching the strings: -# fuzz.token_set_ratio('AMAZON.COM INC', 'AMAZON COM INC') -# returns value of 100 - -# fuzz.ratio('AMAZON.COM INC', 'AMAZON COM INC') -# returns value of 93 - -_link1_2['name_ratio'] = _link1_2.apply(lambda x: fuzz.token_set_ratio(x.comnam, x.cname), axis=1) - -# Note on parameters: -# The following parameters are chosen to mimic the SAS macro %iclink -# In %iclink, name_dist < 30 is assigned score = 0 -# where name_dist=30 is roughly 90% percentile in total distribution -# and higher name_dist means more different names. 
-# In name_ratio, I mimic this by choosing 10% percentile as cutoff to assign -# score = 0 - -# 10% percentile of the company name distance -name_ratio_p10 = _link1_2.name_ratio.quantile(0.10) - -# Function to assign score for companies matched by: -# full cusip and passing name_ratio -# or meeting date range requirement - -def score1(row): - if (row['fdate']<=row['nameenddt']) & (row['ldate']>=row['namedt']) & (row['name_ratio'] >= name_ratio_p10): - score = 0 - elif (row['fdate']<=row['nameenddt']) & (row['ldate']>=row['namedt']): - score = 1 - elif row['name_ratio'] >= name_ratio_p10: - score = 2 - else: - score = 3 - return score - -# assign size portfolio -_link1_2['score']=_link1_2.apply(score1, axis=1) -_link1_2 = _link1_2[['ticker','permno','cname','comnam','name_ratio','score']] -_link1_2 = _link1_2.drop_duplicates() - -########################## -# Step 2: Link by TICKER # -########################## - -# Find links for the remaining unmatched cases using Exchange Ticker - -# Identify remaining unmatched cases -_nomatch1 = pd.merge(_ibes2[['ticker']], _link1_2[['permno','ticker']], on='ticker', how='left') -_nomatch1 = _nomatch1.loc[_nomatch1.permno.isnull()].drop(['permno'], axis=1).drop_duplicates() - -# Add IBES identifying information - -ibesid = conn.raw_sql(""" select ticker, cname, oftic, sdates, cusip from ibes.id """) -ibesid = ibesid.loc[ibesid.oftic.notna()] - -_nomatch2 = pd.merge(_nomatch1, ibesid, how='inner', on=['ticker']) - -# Create first and last 'start dates' for Exchange Tickers -# Label date range variables and keep only most recent company name - -_nomatch3 = _nomatch2.groupby(['ticker', 'oftic']).sdates.agg(['min', 'max'])\ -.reset_index().rename(columns={'min':'fdate', 'max':'ldate'}) - -_nomatch3 = pd.merge(_nomatch2, _nomatch3, how='left', on=['ticker','oftic']) - -_nomatch3 = _nomatch3.loc[_nomatch3.sdates == _nomatch3.ldate] - -# Get entire list of CRSP stocks with Exchange Ticker information - -_crsp_n1 = conn.raw_sql(""" 
select ticker, comnam, permno, ncusip, namedt, nameenddt - from crsp.stocknames """) - -_crsp_n1 = _crsp_n1.loc[_crsp_n1.ticker.notna()].sort_values(by=['permno','ticker','namedt']) - -# Arrange effective dates for link by Exchange Ticker - -_crsp_n1_namedt = _crsp_n1.groupby(['permno','ticker']).namedt.min().reset_index().rename(columns={'min':'namedt'}) -_crsp_n1_nameenddt = _crsp_n1.groupby(['permno','ticker']).nameenddt.max().reset_index().rename(columns={'max':'nameenddt'}) - -_crsp_n1_dt = pd.merge(_crsp_n1_namedt, _crsp_n1_nameenddt, how = 'inner', on=['permno','ticker']) - -_crsp_n1 = _crsp_n1.rename(columns={'namedt': 'namedt_ind', 'nameenddt':'nameenddt_ind'}) - -_crsp_n2 = pd.merge(_crsp_n1, _crsp_n1_dt, how ='left', on = ['permno','ticker']) - -_crsp_n2 = _crsp_n2.rename(columns={'ticker':'crsp_ticker'}) -_crsp_n2 = _crsp_n2.loc[_crsp_n2.nameenddt_ind == _crsp_n2.nameenddt].drop(['namedt_ind', 'nameenddt_ind'], axis=1) - -# Merge remaining unmatched cases using Exchange Ticker -# Note: Use ticker date ranges as exchange tickers are reused overtime - -_link2_1 = pd.merge(_nomatch3, _crsp_n2, how='inner', left_on=['oftic'], right_on=['crsp_ticker']) -_link2_1 = _link2_1.loc[(_link2_1.ldate>=_link2_1.namedt) & (_link2_1.fdate<=_link2_1.nameenddt)] - - -# Score using company name using 6-digit CUSIP and company name spelling distance -_link2_1['name_ratio'] = _link2_1.apply(lambda x: fuzz.token_set_ratio(x.comnam, x.cname), axis=1) - -_link2_2 = _link2_1 -_link2_2['cusip6'] = _link2_2.apply(lambda x: x.cusip[:6], axis=1) -_link2_2['ncusip6'] = _link2_2.apply(lambda x: x.ncusip[:6], axis=1) - -# Score using company name using 6-digit CUSIP and company name spelling distance - -def score2(row): - if (row['cusip6']==row['ncusip6']) & (row['name_ratio'] >= name_ratio_p10): - score = 0 - elif (row['cusip6']==row['ncusip6']): - score = 4 - elif row['name_ratio'] >= name_ratio_p10: - score = 5 - else: - score = 6 - return score - -# assign size portfolio 
-_link2_2['score']=_link2_2.apply(score2, axis=1) - -# Some companies may have more than one TICKER-PERMNO link -# so re-sort and keep the case (PERMNO & Company name from CRSP) -# that gives the lowest score for each IBES TICKER - -_link2_2 = _link2_2[['ticker','permno','cname','comnam', 'name_ratio', 'score']].sort_values(by=['ticker','score']) -_link2_2_score = _link2_2.groupby(['ticker']).score.min().reset_index() - -_link2_3 = pd.merge(_link2_2, _link2_2_score, how='inner', on=['ticker', 'score']) -_link2_3 = _link2_3[['ticker','permno','cname','comnam','score']].drop_duplicates() - -##################################### -# Step 3: Finalize LInks and Scores # -##################################### -# Combine the output from both linking procedures. Store the output data for future usage - -iclink = _link1_2.append(_link2_3) - -# Storing iclink for other program usage -import pickle as pkl - -with open('iclink.pkl', 'wb') as f: - pkl.dump(iclink, f) \ No newline at end of file diff --git a/char60/ill.py b/char60/ill.py deleted file mode 100755 index df1f871..0000000 --- a/char60/ill.py +++ /dev/null @@ -1,158 +0,0 @@ -# Fama & French 3 factors residual variance -# Note: Please use the latest version of pandas, this version should support returning to pd.Series after rolling -# To get a faster speed, we split the big dataframe into small ones -# Then using different process to calculate the variance -# We use 20 process to calculate variance, you can change the number of process according to your CPU situation -# You can use the following code to check your CPU situation -# import multiprocessing -# multiprocessing.cpu_count() - -import pandas as pd -import numpy as np -import datetime as dt -import wrds -from dateutil.relativedelta import * -from pandas.tseries.offsets import * -import datetime -import pickle as pkl -import multiprocessing as mp - -################### -# Connect to WRDS # -################### -conn = wrds.Connection() - -# CRSP Block -crsp = 
conn.raw_sql(""" - select a.permno, a.date, a.ret, a.vol, a.prc - from crsp.dsf as a - where a.date > '01/01/1959' - """) - -# sort variables by permno and date -crsp = crsp.sort_values(by=['permno', 'date']) - -# change variable format to int -crsp['permno'] = crsp['permno'].astype(int) - -# Line up date to be end of month -crsp['date'] = pd.to_datetime(crsp['date']) - -# find the closest trading day to the end of the month -crsp['monthend'] = crsp['date'] + MonthEnd(0) -crsp['date_diff'] = crsp['monthend'] - crsp['date'] -date_temp = crsp.groupby(['permno', 'monthend'])['date_diff'].min() -date_temp = pd.DataFrame(date_temp) # convert Series to DataFrame -date_temp.reset_index(inplace=True) -date_temp.rename(columns={'date_diff': 'min_diff'}, inplace=True) -crsp = pd.merge(crsp, date_temp, how='left', on=['permno', 'monthend']) -crsp['sig'] = np.where(crsp['date_diff'] == crsp['min_diff'], 1, np.nan) - -# label every date of month end -crsp['month_count'] = crsp[crsp['sig'] == 1].groupby(['permno']).cumcount() - -# label numbers of months for a firm -month_num = crsp[crsp['sig'] == 1].groupby(['permno'])['month_count'].tail(1) -month_num = month_num.astype(int) -month_num = month_num.reset_index(drop=True) - -# mark the number of each month to each day of this month -crsp['month_count'] = crsp.groupby(['permno'])['month_count'].fillna(method='bfill') - -# crate a firm list -df_firm = crsp.drop_duplicates(['permno']) -df_firm = df_firm[['permno']] -df_firm['permno'] = df_firm['permno'].astype(int) -df_firm = df_firm.reset_index(drop=True) -df_firm = df_firm.reset_index() -df_firm = df_firm.rename(columns={'index': 'count'}) -df_firm['month_num'] = month_num - -###################### -# Calculate residual # -###################### - - -def get_baspread(df, firm_list): - """ - - :param df: stock dataframe - :param firm_list: list of firms matching stock dataframe - :return: dataframe with variance of residual - """ - for firm, count, prog in zip(firm_list['permno'], 
firm_list['month_num'], range(firm_list['permno'].count()+1)): - prog = prog + 1 - print('processing permno %s' % firm, '/', 'finished', '%.2f%%' % ((prog/firm_list['permno'].count())*100)) - for i in range(count + 1): - # if you want to change the rolling window, please change here: i - 2 means 3 months is a window. - temp = df[(df['permno'] == firm) & (i - 2 <= df['month_count']) & (df['month_count'] <= i)] - if temp['permno'].count() < 21: - pass - else: - index = temp.tail(1).index - X = pd.DataFrame() - X[['vol', 'prc', 'ret']] = temp[['vol', 'prc', 'ret']] - ill = (abs(X['ret']) / abs(X['prc'])*X['vol']).mean() - df.loc[index, 'ill'] = ill - return df - - -def sub_df(start, end, step): - """ - - :param start: the quantile to start cutting, usually it should be 0 - :param end: the quantile to end cutting, usually it should be 1 - :param step: quantile step - :return: a dictionary including all the 'firm_list' dataframe and 'stock data' dataframe - """ - # we use dict to store different sub dataframe - temp = {} - for i, h in zip(np.arange(start, end, step), range(int((end-start)/step))): - print('processing splitting dataframe:', round(i, 2), 'to', round(i + step, 2)) - if i == 0: # to get the left point - temp['firm' + str(h)] = df_firm[df_firm['count'] <= df_firm['count'].quantile(i + step)] - temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', - on='permno').dropna(subset=['count']) - else: - temp['firm' + str(h)] = df_firm[(df_firm['count'].quantile(i) < df_firm['count']) & ( - df_firm['count'] <= df_firm['count'].quantile(i + step))] - temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', - on='permno').dropna(subset=['count']) - return temp - - -def main(start, end, step): - """ - - :param start: the quantile to start cutting, usually it should be 0 - :param end: the quantile to end cutting, usually it should be 1 - :param step: quantile step - :return: a dataframe with calculated variance of residual - """ - df 
= sub_df(start, end, step) - pool = mp.Pool() - p_dict = {} - for i in range(int((end-start)/step)): - p_dict['p' + str(i)] = pool.apply_async(get_baspread, (df['crsp%s' % i], df['firm%s' % i],)) - pool.close() - pool.join() - result = pd.DataFrame() - print('processing pd.concat') - for h in range(int((end-start)/step)): - result = pd.concat([result, p_dict['p%s' % h].get()]) - return result - - -# calculate variance of residual through rolling window -# Note: please split dataframe according to your CPU situation. For example, we split dataframe to (1-0)/0.05 = 20 sub -# dataframes here, so the function will use 20 cores to calculate variance of residual. -if __name__ == '__main__': - crsp = main(0, 1, 0.05) - -# process dataframe -crsp = crsp.dropna(subset=['ill']) # drop NA due to rolling -crsp = crsp.reset_index(drop=True) -crsp = crsp[['permno', 'date', 'ill']] - -with open('ill.pkl', 'wb') as f: - pkl.dump(crsp, f) \ No newline at end of file diff --git a/char60/impute_rank_output_bchmk_60.py b/char60/impute_rank_output_bchmk_60.py deleted file mode 100755 index dd7a242..0000000 --- a/char60/impute_rank_output_bchmk_60.py +++ /dev/null @@ -1,164 +0,0 @@ -import pandas as pd -import pickle as pkl -import numpy as np -from tqdm import tqdm -from functions import * - -#################### -# All Stocks # -#################### -with open('chars_q_raw.pkl', 'rb') as f: - chars_q = pkl.load(f) - -chars_q = chars_q.dropna(subset=['permno']) -chars_q[['permno', 'gvkey']] = chars_q[['permno', 'gvkey']].astype(int) -chars_q['jdate'] = pd.to_datetime(chars_q['jdate']) -chars_q = chars_q.drop_duplicates(['permno', 'jdate']) - -with open('chars_a_raw.pkl', 'rb') as f: - chars_a = pkl.load(f) - -chars_a = chars_a.dropna(subset=['permno']) -chars_a[['permno', 'gvkey']] = chars_a[['permno', 'gvkey']].astype(int) -chars_a['jdate'] = pd.to_datetime(chars_a['jdate']) -chars_a = chars_a.drop_duplicates(['permno', 'jdate']) - -# information list -obs_var_list = ['gvkey', 
'permno', 'jdate', 'sic', 'ret', 'retx', 'retadj', 'exchcd', 'shrcd'] -# characteristics with quarterly and annual frequency at the same time -accounting_var_list = ['datadate', 'acc', 'bm', 'agr', 'alm', 'ato', 'cash', 'cashdebt', 'cfp', 'chcsho', 'chpm', - 'chtx', 'depr', 'ep', 'gma', 'grltnoa', 'lev', 'lgr', 'ni', 'noa', 'op', 'pctacc', 'pm', - 'rd_sale', 'rdm', 'rna', 'roa', 'roe', 'rsup', 'sgr', 'sp'] -a_var_list = ['a_'+i for i in accounting_var_list] -q_var_list = ['q_'+i for i in accounting_var_list] -# annual frequency only list -a_only_list = ['adm', 'bm_ia', 'herf', 'hire', 'me_ia'] -# quarterly frequency only list -q_only_list = ['abr', 'sue', 'cinvest', 'nincr', 'pscore', - # 'turn', 'dolvol' - ] -# monthly frequency only list -m_var_list = ['baspread', 'beta', 'ill', 'maxret', 'mom12m', 'mom1m', 'mom36m', 'mom60m', 'mom6m', 're', 'rvar_capm', - 'rvar_ff3', 'rvar_mean', 'seas1a', 'std_dolvol', 'std_turn', 'zerotrade', 'me', 'dy', - 'turn', 'dolvol' # need to rerun the accounting to put them in to char_a - ] - -df_a = chars_a[obs_var_list + accounting_var_list + a_only_list + m_var_list] -df_a.columns = obs_var_list + a_var_list + a_only_list + m_var_list -df_a = df_a.sort_values(obs_var_list) - -df_q = chars_q[obs_var_list + accounting_var_list + q_only_list] -df_q.columns = obs_var_list + q_var_list + q_only_list -# drop the same information columns for merging -df_q = df_q.drop(['sic', 'ret', 'retx', 'retadj', 'exchcd', 'shrcd'], axis=1) - -df = df_a.merge(df_q, how='left', on=['gvkey', 'jdate', 'permno']) - -# first element in accounting_var_list is datadate -for i in tqdm(accounting_var_list[1:]): - print('processing %s' % i) - a = 'a_'+i - q = 'q_'+i - t1 = 'tmp1_'+i - t2 = 'tmp2_'+i - t3 = 'tmp3_'+i - t4 = 'tmp4_'+i - t5 = 'tmp5_'+i - - # tmp1: if the annual variable is available - df[t1] = np.where(df[a].isna(), False, True) - # tmp2: if the quarterly variable is available - df[t2] = np.where(df[q].isna(), False, True) - # tmp3: both - df[t3] = 
df[t1] & df[t2] - # tmp4: latest one - df[t4] = np.where(df['q_datadate'] < df['a_datadate'], df[a], df[q]) - # available one - df[t5] = np.where(df[t1], df[a], df[q]) - # final - df[i] = np.where(df[t3], df[t4], df[t5]) - df = df.drop([a, q, t1, t2, t3, t4, t5], axis=1) - -# drop the datadate of different frequency -df = df.drop(['a_datadate', 'q_datadate'], axis=1) - -# drop optional variables, you can adjust it by your selection -df = df.drop(['ret', 'retx'], axis=1) -df = df.rename(columns={'retadj': 'ret'}) # retadj is return adjusted by dividend -df['ret'] = df.groupby(['permno'])['ret'].shift(-1) # we shift return in t period to t+1 for prediction -df['date'] = df.groupby(['permno'])['jdate'].shift(-1) # date is return date, jdate is predictor date -df = df.drop(['jdate'], axis=1) # now we only keep the date of return -df = df.dropna(subset=['ret']).reset_index(drop=True) - -# save raw data -with open('chars60_raw_no_impute.pkl', 'wb') as f: - pkl.dump(df, f, protocol=4) - -# impute missing values, you can choose different func form functions.py, such as ffi49/ffi10 -df_impute = df.copy() -df_impute['sic'] = df_impute['sic'].astype(int) -df_impute['date'] = pd.to_datetime(df_impute['date']) - -df_impute['ffi49'] = ffi49(df_impute) -df_impute['ffi49'] = df_impute['ffi49'].fillna(49) # we treat na in ffi49 as 'other' -df_impute['ffi49'] = df_impute['ffi49'].astype(int) - -# there are two ways to impute: industrial median or mean -df_impute = fillna_ind(df_impute, method='median', ffi=49) - -df_impute = fillna_all(df_impute, method='median') -df_impute['re'] = df_impute['re'].fillna(0) # re use IBES database, there are lots of missing data - -df_impute['year'] = df_impute['date'].dt.year -df_impute = df_impute[df_impute['year'] >= 1972] -df_impute = df_impute.drop(['year'], axis=1) - -with open('chars60_raw_imputed.pkl', 'wb') as f: - pkl.dump(df_impute, f, protocol=4) - -# standardize raw data -df_rank = df.copy() -df_rank['lag_me'] = df_rank['me'] -df_rank = 
standardize(df_rank) -df_rank['year'] = df_rank['date'].dt.year -df_rank = df_rank[df_rank['year'] >= 1972] -df_rank = df_rank.drop(['year'], axis=1) -df_rank['log_me'] = np.log(df_rank['lag_me']) - -with open('chars60_rank_no_impute.pkl', 'wb') as f: - pkl.dump(df_rank, f, protocol=4) - -# standardize imputed data -df_rank = df_impute.copy() -df_rank['lag_me'] = df_rank['me'] -df_rank = standardize(df_rank) -df_rank['year'] = df_rank['date'].dt.year -df_rank = df_rank[df_rank['year'] >= 1972] -df_rank = df_rank.drop(['year'], axis=1) -df_rank['log_me'] = np.log(df_rank['lag_me']) - -with open('chars60_rank_imputed.pkl', 'wb') as f: - pkl.dump(df_rank, f, protocol=4) - - -#################### -# SP1500 # -#################### -with open('/home/jianxinma/chars/data/sp1500_impute_benchmark.pkl', 'rb') as f: - sp1500_index = pkl.load(f) - -sp1500_index = sp1500_index[['gvkey', 'date']] - -sp1500_impute = pd.merge(sp1500_index, df_impute, how='left', on=['gvkey', 'date']) - -# for test -# test = sp1500_rank.groupby(['jdate'])['gvkey'].nunique() - -with open('sp1500_impute_60.pkl', 'wb') as f: - pkl.dump(sp1500_impute, f, protocol=4) - -# standardize characteristics -sp1500_rank = pd.merge(sp1500_index, df_rank, how='left', on=['gvkey', 'date']) - -with open('sp1500_rank_60.pkl', 'wb') as f: - pkl.dump(sp1500_rank, f, protocol=4) diff --git a/char60/maxret_d.py b/char60/maxret_d.py deleted file mode 100755 index 69d208f..0000000 --- a/char60/maxret_d.py +++ /dev/null @@ -1,158 +0,0 @@ -# Fama & French 3 factors residual variance -# Note: Please use the latest version of pandas, this version should support returning to pd.Series after rolling -# To get a faster speed, we split the big dataframe into small ones -# Then using different process to calculate the variance -# We use 20 process to calculate variance, you can change the number of process according to your CPU situation -# You can use the following code to check your CPU situation -# import multiprocessing -# 
multiprocessing.cpu_count() - -import pandas as pd -import numpy as np -import datetime as dt -import wrds -from dateutil.relativedelta import * -from pandas.tseries.offsets import * -import datetime -import pickle as pkl -import multiprocessing as mp - -################### -# Connect to WRDS # -################### -conn = wrds.Connection() - -# CRSP Block -crsp = conn.raw_sql(""" - select a.permno, a.date, a.ret - from crsp.dsf as a - where a.date > '01/01/1959' - """) - -# sort variables by permno and date -crsp = crsp.sort_values(by=['permno', 'date']) - -# change variable format to int -crsp['permno'] = crsp['permno'].astype(int) - -# Line up date to be end of month -crsp['date'] = pd.to_datetime(crsp['date']) - -# find the closest trading day to the end of the month -crsp['monthend'] = crsp['date'] + MonthEnd(0) -crsp['date_diff'] = crsp['monthend'] - crsp['date'] -date_temp = crsp.groupby(['permno', 'monthend'])['date_diff'].min() -date_temp = pd.DataFrame(date_temp) # convert Series to DataFrame -date_temp.reset_index(inplace=True) -date_temp.rename(columns={'date_diff': 'min_diff'}, inplace=True) -crsp = pd.merge(crsp, date_temp, how='left', on=['permno', 'monthend']) -crsp['sig'] = np.where(crsp['date_diff'] == crsp['min_diff'], 1, np.nan) - -# label every date of month end -crsp['month_count'] = crsp[crsp['sig'] == 1].groupby(['permno']).cumcount() - -# label numbers of months for a firm -month_num = crsp[crsp['sig'] == 1].groupby(['permno'])['month_count'].tail(1) -month_num = month_num.astype(int) -month_num = month_num.reset_index(drop=True) - -# mark the number of each month to each day of this month -crsp['month_count'] = crsp.groupby(['permno'])['month_count'].fillna(method='bfill') - -# crate a firm list -df_firm = crsp.drop_duplicates(['permno']) -df_firm = df_firm[['permno']] -df_firm['permno'] = df_firm['permno'].astype(int) -df_firm = df_firm.reset_index(drop=True) -df_firm = df_firm.reset_index() -df_firm = df_firm.rename(columns={'index': 
'count'}) -df_firm['month_num'] = month_num - -###################### -# Calculate residual # -###################### - - -def get_baspread(df, firm_list): - """ - - :param df: stock dataframe - :param firm_list: list of firms matching stock dataframe - :return: dataframe with variance of residual - """ - for firm, count, prog in zip(firm_list['permno'], firm_list['month_num'], range(firm_list['permno'].count()+1)): - prog = prog + 1 - print('processing permno %s' % firm, '/', 'finished', '%.2f%%' % ((prog/firm_list['permno'].count())*100)) - for i in range(count + 1): - # if you want to change the rolling window, please change here: i - 2 means 3 months is a window. - temp = df[(df['permno'] == firm) & (i - 2 <= df['month_count']) & (df['month_count'] <= i)] - if temp['permno'].count() < 21: - pass - else: - index = temp.tail(1).index - X = pd.DataFrame() - X[['ret']] = temp[['ret']] - maxret = X['ret'].max() - df.loc[index, 'maxret'] = maxret - return df - - -def sub_df(start, end, step): - """ - - :param start: the quantile to start cutting, usually it should be 0 - :param end: the quantile to end cutting, usually it should be 1 - :param step: quantile step - :return: a dictionary including all the 'firm_list' dataframe and 'stock data' dataframe - """ - # we use dict to store different sub dataframe - temp = {} - for i, h in zip(np.arange(start, end, step), range(int((end-start)/step))): - print('processing splitting dataframe:', round(i, 2), 'to', round(i + step, 2)) - if i == 0: # to get the left point - temp['firm' + str(h)] = df_firm[df_firm['count'] <= df_firm['count'].quantile(i + step)] - temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', - on='permno').dropna(subset=['count']) - else: - temp['firm' + str(h)] = df_firm[(df_firm['count'].quantile(i) < df_firm['count']) & ( - df_firm['count'] <= df_firm['count'].quantile(i + step))] - temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', - 
on='permno').dropna(subset=['count']) - return temp - - -def main(start, end, step): - """ - - :param start: the quantile to start cutting, usually it should be 0 - :param end: the quantile to end cutting, usually it should be 1 - :param step: quantile step - :return: a dataframe with calculated variance of residual - """ - df = sub_df(start, end, step) - pool = mp.Pool() - p_dict = {} - for i in range(int((end-start)/step)): - p_dict['p' + str(i)] = pool.apply_async(get_baspread, (df['crsp%s' % i], df['firm%s' % i],)) - pool.close() - pool.join() - result = pd.DataFrame() - print('processing pd.concat') - for h in range(int((end-start)/step)): - result = pd.concat([result, p_dict['p%s' % h].get()]) - return result - - -# calculate variance of residual through rolling window -# Note: please split dataframe according to your CPU situation. For example, we split dataframe to (1-0)/0.05 = 20 sub -# dataframes here, so the function will use 20 cores to calculate variance of residual. -if __name__ == '__main__': - crsp = main(0, 1, 0.05) - -# process dataframe -crsp = crsp.dropna(subset=['maxret']) # drop NA due to rolling -crsp = crsp.reset_index(drop=True) -crsp = crsp[['permno', 'date', 'maxret']] - -with open('maxret.pkl', 'wb') as f: - pkl.dump(crsp, f) \ No newline at end of file diff --git a/char60/merge_chars_60.py b/char60/merge_chars_60.py deleted file mode 100755 index 763e9e6..0000000 --- a/char60/merge_chars_60.py +++ /dev/null @@ -1,294 +0,0 @@ -# Since some firms only have annual recording before 80s, we need to use annual data as merging benchmark in case -# there are some recordings are missing - -import pandas as pd -import pickle as pkl -from pandas.tseries.offsets import * - -with open('chars_a_60.pkl', 'rb') as f: - chars_a = pkl.load(f) - -chars_a = chars_a.dropna(subset=['permno']) -chars_a[['permno', 'gvkey']] = chars_a[['permno', 'gvkey']].astype(int) -chars_a['jdate'] = pd.to_datetime(chars_a['jdate']) -chars_a = 
chars_a.drop_duplicates(['permno', 'jdate']) - -with open('/home/jianxinma/chars/data/beta.pkl', 'rb') as f: - beta = pkl.load(f) - -beta['permno'] = beta['permno'].astype(int) -beta['jdate'] = pd.to_datetime(beta['date']) + MonthEnd(0) -beta = beta[['permno', 'jdate', 'beta']] -beta = beta.drop_duplicates(['permno', 'jdate']) - -chars_a = pd.merge(chars_a, beta, how='left', on=['permno', 'jdate']) - -with open('/home/jianxinma/chars/data/rvar_capm.pkl', 'rb') as f: - rvar_capm = pkl.load(f) - -rvar_capm['permno'] = rvar_capm['permno'].astype(int) -rvar_capm['jdate'] = pd.to_datetime(rvar_capm['date']) + MonthEnd(0) -rvar_capm = rvar_capm[['permno', 'jdate', 'rvar_capm']] -rvar_capm = rvar_capm.drop_duplicates(['permno', 'jdate']) - -chars_a = pd.merge(chars_a, rvar_capm, how='left', on=['permno', 'jdate']) - -with open('/home/jianxinma/chars/data/rvar_mean.pkl', 'rb') as f: - rvar_mean = pkl.load(f) - -rvar_mean['permno'] = rvar_mean['permno'].astype(int) -rvar_mean['jdate'] = pd.to_datetime(rvar_mean['date']) + MonthEnd(0) -rvar_mean = rvar_mean[['permno', 'jdate', 'rvar_mean']] -rvar_mean = rvar_mean.drop_duplicates(['permno', 'jdate']) - -chars_a = pd.merge(chars_a, rvar_mean, how='left', on=['permno', 'jdate']) - -with open('/home/jianxinma/chars/data/rvar_ff3.pkl', 'rb') as f: - rvar_ff3 = pkl.load(f) - -rvar_ff3['permno'] = rvar_ff3['permno'].astype(int) -rvar_ff3['jdate'] = pd.to_datetime(rvar_ff3['date']) + MonthEnd(0) -rvar_ff3 = rvar_ff3[['permno', 'jdate', 'rvar_ff3']] -rvar_ff3 = rvar_ff3.drop_duplicates(['permno', 'jdate']) - -chars_a = pd.merge(chars_a, rvar_ff3, how='left', on=['permno', 'jdate']) - -with open('/home/jianxinma/chars/data/sue.pkl', 'rb') as f: - sue = pkl.load(f) - -sue['permno'] = sue['permno'].astype(int) -sue['jdate'] = pd.to_datetime(sue['date']) + MonthEnd(0) -sue = sue[['permno', 'jdate', 'sue']] -sue = sue.drop_duplicates(['permno', 'jdate']) - -chars_a = pd.merge(chars_a, sue, how='left', on=['permno', 'jdate']) - -with 
open('/home/jianxinma/chars/data/re.pkl', 'rb') as f: - re = pkl.load(f) - -re['permno'] = re['permno'].astype(int) -re['jdate'] = pd.to_datetime(re['date']) + MonthEnd(0) -re = re[['permno', 'jdate', 're']] -re = re.drop_duplicates(['permno', 'jdate']) - -chars_a = pd.merge(chars_a, re, how='left', on=['permno', 'jdate']) - -with open('/home/jianxinma/chars/data/abr.pkl', 'rb') as f: - abr = pkl.load(f) - -abr['permno'] = abr['permno'].astype(int) -abr['jdate'] = pd.to_datetime(abr['date']) + MonthEnd(0) -abr = abr[['permno', 'jdate', 'abr']] -abr = abr.drop_duplicates(['permno', 'jdate']) - -chars_a = pd.merge(chars_a, abr, how='left', on=['permno', 'jdate']) - -with open('baspread.pkl', 'rb') as f: - baspread = pkl.load(f) - -baspread['permno'] = baspread['permno'].astype(int) -baspread['jdate'] = pd.to_datetime(baspread['date']) + MonthEnd(0) -baspread = baspread[['permno', 'jdate', 'baspread']] -baspread = baspread.drop_duplicates(['permno', 'jdate']) - -chars_a = pd.merge(chars_a, baspread, how='left', on=['permno', 'jdate']) - -with open('maxret.pkl', 'rb') as f: - maxret = pkl.load(f) - -maxret['permno'] = maxret['permno'].astype(int) -maxret['jdate'] = pd.to_datetime(maxret['date']) + MonthEnd(0) -maxret = maxret[['permno', 'jdate', 'maxret']] -maxret = maxret.drop_duplicates(['permno', 'jdate']) - -chars_a = pd.merge(chars_a, maxret, how='left', on=['permno', 'jdate']) - -with open('std_dolvol.pkl', 'rb') as f: - std_dolvol = pkl.load(f) - -std_dolvol['permno'] = std_dolvol['permno'].astype(int) -std_dolvol['jdate'] = pd.to_datetime(std_dolvol['date']) + MonthEnd(0) -std_dolvol = std_dolvol[['permno', 'jdate', 'std_dolvol']] -std_dolvol = std_dolvol.drop_duplicates(['permno', 'jdate']) - -chars_a = pd.merge(chars_a, std_dolvol, how='left', on=['permno', 'jdate']) - -with open('ill.pkl', 'rb') as f: - ill = pkl.load(f) - -ill['permno'] = ill['permno'].astype(int) -ill['jdate'] = pd.to_datetime(ill['date']) + MonthEnd(0) -ill = ill[['permno', 'jdate', 
'ill']] -ill = ill.drop_duplicates(['permno', 'jdate']) - -chars_a = pd.merge(chars_a, ill, how='left', on=['permno', 'jdate']) - -with open('std_turn.pkl', 'rb') as f: - std_turn = pkl.load(f) - -std_turn['permno'] = std_turn['permno'].astype(int) -std_turn['jdate'] = pd.to_datetime(std_turn['date']) + MonthEnd(0) -std_turn = std_turn[['permno', 'jdate', 'std_turn']] -std_turn = std_turn.drop_duplicates(['permno', 'jdate']) - -chars_a = pd.merge(chars_a, std_turn, how='left', on=['permno', 'jdate']) - -with open('zerotrade.pkl', 'rb') as f: - zerotrade = pkl.load(f) - -zerotrade['permno'] = zerotrade['permno'].astype(int) -zerotrade['jdate'] = pd.to_datetime(zerotrade['date']) + MonthEnd(0) -zerotrade = zerotrade[['permno', 'jdate', 'zerotrade']] -zerotrade = zerotrade.drop_duplicates(['permno', 'jdate']) - -chars_a = pd.merge(chars_a, zerotrade, how='left', on=['permno', 'jdate']) - -# save data -with open('chars_a_raw.pkl', 'wb') as f: - pkl.dump(chars_a, f, protocol=4) - -######################################################################################################################## -# In order to keep the naming tidy, we need to make another chars_q_raw, which is just a temporary dataframe # -######################################################################################################################## - -with open('chars_q_60.pkl', 'rb') as f: - chars_q = pkl.load(f) - -chars_q = chars_q.dropna(subset=['permno']) -chars_q[['permno', 'gvkey']] = chars_q[['permno', 'gvkey']].astype(int) -chars_q['jdate'] = pd.to_datetime(chars_q['jdate']) -chars_q = chars_q.drop_duplicates(['permno', 'jdate']) - -with open('/home/jianxinma/chars/data/beta.pkl', 'rb') as f: - beta = pkl.load(f) - -beta['permno'] = beta['permno'].astype(int) -beta['jdate'] = pd.to_datetime(beta['date']) + MonthEnd(0) -beta = beta[['permno', 'jdate', 'beta']] -beta = beta.drop_duplicates(['permno', 'jdate']) - -chars_q = pd.merge(chars_q, beta, how='left', on=['permno', 'jdate']) 
- -with open('/home/jianxinma/chars/data/rvar_capm.pkl', 'rb') as f: - rvar_capm = pkl.load(f) - -rvar_capm['permno'] = rvar_capm['permno'].astype(int) -rvar_capm['jdate'] = pd.to_datetime(rvar_capm['date']) + MonthEnd(0) -rvar_capm = rvar_capm[['permno', 'jdate', 'rvar_capm']] -rvar_capm = rvar_capm.drop_duplicates(['permno', 'jdate']) - -chars_q = pd.merge(chars_q, rvar_capm, how='left', on=['permno', 'jdate']) - -with open('/home/jianxinma/chars/data/rvar_mean.pkl', 'rb') as f: - rvar_mean = pkl.load(f) - -rvar_mean['permno'] = rvar_mean['permno'].astype(int) -rvar_mean['jdate'] = pd.to_datetime(rvar_mean['date']) + MonthEnd(0) -rvar_mean = rvar_mean[['permno', 'jdate', 'rvar_mean']] -rvar_mean = rvar_mean.drop_duplicates(['permno', 'jdate']) - -chars_q = pd.merge(chars_q, rvar_mean, how='left', on=['permno', 'jdate']) - -with open('/home/jianxinma/chars/data/rvar_ff3.pkl', 'rb') as f: - rvar_ff3 = pkl.load(f) - -rvar_ff3['permno'] = rvar_ff3['permno'].astype(int) -rvar_ff3['jdate'] = pd.to_datetime(rvar_ff3['date']) + MonthEnd(0) -rvar_ff3 = rvar_ff3[['permno', 'jdate', 'rvar_ff3']] -rvar_ff3 = rvar_ff3.drop_duplicates(['permno', 'jdate']) - -chars_q = pd.merge(chars_q, rvar_ff3, how='left', on=['permno', 'jdate']) - -with open('/home/jianxinma/chars/data/sue.pkl', 'rb') as f: - sue = pkl.load(f) - -sue['permno'] = sue['permno'].astype(int) -sue['jdate'] = pd.to_datetime(sue['date']) + MonthEnd(0) -sue = sue[['permno', 'jdate', 'sue']] -sue = sue.drop_duplicates(['permno', 'jdate']) - -chars_q = pd.merge(chars_q, sue, how='left', on=['permno', 'jdate']) - -with open('/home/jianxinma/chars/data/re.pkl', 'rb') as f: - re = pkl.load(f) - -re['permno'] = re['permno'].astype(int) -re['jdate'] = pd.to_datetime(re['date']) + MonthEnd(0) -re = re[['permno', 'jdate', 're']] -re = re.drop_duplicates(['permno', 'jdate']) - -chars_q = pd.merge(chars_q, re, how='left', on=['permno', 'jdate']) - -with open('/home/jianxinma/chars/data/abr.pkl', 'rb') as f: - abr = pkl.load(f) 
- -abr['permno'] = abr['permno'].astype(int) -abr['jdate'] = pd.to_datetime(abr['date']) + MonthEnd(0) -abr = abr[['permno', 'jdate', 'abr']] -abr = abr.drop_duplicates(['permno', 'jdate']) - -chars_q = pd.merge(chars_q, abr, how='left', on=['permno', 'jdate']) - -with open('baspread.pkl', 'rb') as f: - baspread = pkl.load(f) - -baspread['permno'] = baspread['permno'].astype(int) -baspread['jdate'] = pd.to_datetime(baspread['date']) + MonthEnd(0) -baspread = baspread[['permno', 'jdate', 'baspread']] -baspread = baspread.drop_duplicates(['permno', 'jdate']) - -chars_q = pd.merge(chars_q, baspread, how='left', on=['permno', 'jdate']) - -with open('maxret.pkl', 'rb') as f: - maxret = pkl.load(f) - -maxret['permno'] = maxret['permno'].astype(int) -maxret['jdate'] = pd.to_datetime(maxret['date']) + MonthEnd(0) -maxret = maxret[['permno', 'jdate', 'maxret']] -maxret = maxret.drop_duplicates(['permno', 'jdate']) - -chars_q = pd.merge(chars_q, maxret, how='left', on=['permno', 'jdate']) - -with open('std_dolvol.pkl', 'rb') as f: - std_dolvol = pkl.load(f) - -std_dolvol['permno'] = std_dolvol['permno'].astype(int) -std_dolvol['jdate'] = pd.to_datetime(std_dolvol['date']) + MonthEnd(0) -std_dolvol = std_dolvol[['permno', 'jdate', 'std_dolvol']] -std_dolvol = std_dolvol.drop_duplicates(['permno', 'jdate']) - -chars_q = pd.merge(chars_q, std_dolvol, how='left', on=['permno', 'jdate']) - -with open('ill.pkl', 'rb') as f: - ill = pkl.load(f) - -ill['permno'] = ill['permno'].astype(int) -ill['jdate'] = pd.to_datetime(ill['date']) + MonthEnd(0) -ill = ill[['permno', 'jdate', 'ill']] -ill = ill.drop_duplicates(['permno', 'jdate']) - -chars_q = pd.merge(chars_q, ill, how='left', on=['permno', 'jdate']) - -with open('std_turn.pkl', 'rb') as f: - std_turn = pkl.load(f) - -std_turn['permno'] = std_turn['permno'].astype(int) -std_turn['jdate'] = pd.to_datetime(std_turn['date']) + MonthEnd(0) -std_turn = std_turn[['permno', 'jdate', 'std_turn']] -std_turn = 
std_turn.drop_duplicates(['permno', 'jdate']) - -chars_q = pd.merge(chars_q, std_turn, how='left', on=['permno', 'jdate']) - -with open('zerotrade.pkl', 'rb') as f: - zerotrade = pkl.load(f) - -zerotrade['permno'] = zerotrade['permno'].astype(int) -zerotrade['jdate'] = pd.to_datetime(zerotrade['date']) + MonthEnd(0) -zerotrade = zerotrade[['permno', 'jdate', 'zerotrade']] -zerotrade = zerotrade.drop_duplicates(['permno', 'jdate']) - -chars_q = pd.merge(chars_q, zerotrade, how='left', on=['permno', 'jdate']) - -# save data -with open('chars_q_raw.pkl', 'wb') as f: - pkl.dump(chars_q, f, protocol=4) \ No newline at end of file diff --git a/char60/pkl_to_csv.py b/char60/pkl_to_csv.py deleted file mode 100755 index 74cefea..0000000 --- a/char60/pkl_to_csv.py +++ /dev/null @@ -1,29 +0,0 @@ -import pickle as pkl -import pandas as pd - -with open('/Users/eric/Downloads/chars_rank_60.pkl', 'rb') as f: - chars = pkl.load(f) - -print(chars.columns.values) - -chars['jdate'] = pd.to_datetime(chars['jdate']) -chars['year'] = chars['jdate'].dt.year -chars_1970s = chars[chars['year'] < 1980] -chars_1980s = chars[(chars['year'] >= 1980) & (chars['year'] < 1990)] -chars_1990s = chars[(chars['year'] >= 1990) & (chars['year'] < 2000)] -chars_2000s = chars[(chars['year'] >= 1990) & (chars['year'] < 2010)] -chars_2010s = chars[(chars['year'] >= 2000) & (chars['year'] < 2020)] - -# raw -# chars_1970s.to_csv('chars60_raw_1970s.csv', index=0) -# chars_1980s.to_csv('chars60_raw_1980s.csv', index=0) -# chars_1990s.to_csv('chars60_raw_1990s.csv', index=0) -# chars_2000s.to_csv('chars60_raw_2000s.csv', index=0) -# chars_2010s.to_csv('chars60_raw_2010s.csv', index=0) - -# rank -chars_1970s.to_csv('chars60_rank_1970s.csv', index=0) -chars_1980s.to_csv('chars60_rank_1980s.csv', index=0) -chars_1990s.to_csv('chars60_rank_1990s.csv', index=0) -chars_2000s.to_csv('chars60_rank_2000s.csv', index=0) -chars_2010s.to_csv('chars60_rank_2010s.csv', index=0) \ No newline at end of file diff --git 
a/char60/re.py b/char60/re.py deleted file mode 100755 index 7dab02f..0000000 --- a/char60/re.py +++ /dev/null @@ -1,120 +0,0 @@ -# Calculate HSZ Replicating Anomalies -# RE: Revisions in analysts’ earnings forecasts - -import pandas as pd -import numpy as np -import datetime as dt -import wrds -from dateutil.relativedelta import * -from pandas.tseries.offsets import * -from pandasql import * -import pickle as pkl - -################### -# Connect to WRDS # -################### -conn = wrds.Connection() - -######################################################################### -# Merging IBES and CRSP by using ICLINK table. Merging last month price # -######################################################################### - -with open('iclink.pkl', 'rb')as f: - iclink = pkl.load(f) - -ibes = conn.raw_sql(""" - select - ticker, statpers, meanest, fpedats, anndats_act, curr_act, fpi, medest - from ibes.statsum_epsus - where - /* filtering IBES */ - statpers=0 - and CURCODE='USD' - and fpi in ('1','2')""") - -# filtering IBES -ibes = ibes[(ibes['medest'].notna()) & (ibes['fpedats'].notna())] -ibes = ibes[(ibes['curr_act']=='USD') | (ibes['curr_act'].isnull())] -ibes['statpers'] = pd.to_datetime(ibes['statpers']) -ibes['merge_date'] = ibes['statpers']+MonthEnd(0) - -crsp_msf = conn.raw_sql(""" - select permno, date, prc, cfacpr - from crsp.msf - """) - -crsp_msf['date'] = pd.to_datetime(crsp_msf['date']) -crsp_msf['date'] = crsp_msf['date']+MonthEnd(0) -crsp_msf['merge_date'] = crsp_msf['date']+MonthEnd(1) - -ibes_iclink = pd.merge(ibes, iclink, how='left', on='ticker') -ibes_crsp = pd.merge(ibes_iclink, crsp_msf, how='inner', on=['permno', 'merge_date']) -ibes_crsp.sort_values(by=['ticker', 'fpedats', 'statpers'], inplace=True) -ibes_crsp.reset_index(inplace=True, drop=True) - -############################### -# Merging last month forecast # -############################### -ibes_crsp['statpers_last_month'] = np.where((ibes_crsp['ticker'] == 
ibes_crsp['ticker'].shift(1)) & - (ibes_crsp['permno'] == ibes_crsp['permno'].shift(1)) & - (ibes_crsp['fpedats'] == ibes_crsp['fpedats'].shift(1)), - ibes_crsp['statpers'].shift(1).astype(str), np.nan) - -ibes_crsp['meanest_last_month'] = np.where((ibes_crsp['ticker'] == ibes_crsp['ticker'].shift(1)) & - (ibes_crsp['permno'] == ibes_crsp['permno'].shift(1)) & - (ibes_crsp['fpedats'] == ibes_crsp['fpedats'].shift(1)), - ibes_crsp['meanest'].shift(1), np.nan) - -ibes_crsp.sort_values(by=['ticker', 'permno', 'fpedats', 'statpers'], inplace=True) -ibes_crsp.reset_index(inplace=True, drop=True) - -########################### -# Drop empty "last month" # -# Calculate HXZ RE # -########################### - -ibes_crsp = ibes_crsp[ibes_crsp['statpers_last_month'].notna()] -ibes_crsp['prc_adj'] = ibes_crsp['prc']/ibes_crsp['cfacpr'] -ibes_crsp = ibes_crsp[ibes_crsp['prc_adj']>0] -ibes_crsp['monthly_revision'] = (ibes_crsp['meanest'] - ibes_crsp['meanest_last_month'])/ibes_crsp['prc_adj'] - -ibes_crsp['permno'] = ibes_crsp['permno'].astype(int) -ibes_crsp['permno'] = ibes_crsp['permno'].astype(str) -ibes_crsp['fpedats'] = ibes_crsp['fpedats'].astype(str) -ibes_crsp['permno_fpedats'] = ibes_crsp['permno'].str.cat(ibes_crsp['fpedats'], sep='-') - -ibes_crsp = ibes_crsp.drop_duplicates(['permno_fpedats', 'statpers']) -ibes_crsp['count'] = ibes_crsp.groupby('permno_fpedats').cumcount() + 1 - -######################## -# Calculate RE (CJL) # -######################## - -ibes_crsp['monthly_revision_l1'] = ibes_crsp.groupby(['permno'])['monthly_revision'].shift(1) -ibes_crsp['monthly_revision_l2'] = ibes_crsp.groupby(['permno'])['monthly_revision'].shift(2) -ibes_crsp['monthly_revision_l3'] = ibes_crsp.groupby(['permno'])['monthly_revision'].shift(3) -ibes_crsp['monthly_revision_l4'] = ibes_crsp.groupby(['permno'])['monthly_revision'].shift(4) -ibes_crsp['monthly_revision_l5'] = ibes_crsp.groupby(['permno'])['monthly_revision'].shift(5) -ibes_crsp['monthly_revision_l6'] = 
ibes_crsp.groupby(['permno'])['monthly_revision'].shift(6) - -condlist = [ibes_crsp['count']==4, - ibes_crsp['count']==5, - ibes_crsp['count']==6, - ibes_crsp['count']>=7] -choicelist = [(ibes_crsp['monthly_revision_l1'] + ibes_crsp['monthly_revision_l2'] + ibes_crsp['monthly_revision_l3'])/3, - (ibes_crsp['monthly_revision_l1'] + ibes_crsp['monthly_revision_l2'] + ibes_crsp['monthly_revision_l3'] + ibes_crsp['monthly_revision_l4'])/4, - (ibes_crsp['monthly_revision_l1'] + ibes_crsp['monthly_revision_l2'] + ibes_crsp['monthly_revision_l3'] + ibes_crsp['monthly_revision_l4'] + ibes_crsp['monthly_revision_l5'])/5, - (ibes_crsp['monthly_revision_l1'] + ibes_crsp['monthly_revision_l2'] + ibes_crsp['monthly_revision_l3'] + ibes_crsp['monthly_revision_l4'] + ibes_crsp['monthly_revision_l5'] + ibes_crsp['monthly_revision_l6'])/6] -ibes_crsp['re'] = np.select(condlist, choicelist, default=np.nan) - -ibes_crsp = ibes_crsp[ibes_crsp['count']>=4] -ibes_crsp = ibes_crsp.sort_values(by=['ticker', 'statpers', 'fpedats']) -ibes_crsp = ibes_crsp.drop_duplicates(['ticker', 'statpers']) - -ibes_crsp = ibes_crsp[['ticker', 'statpers', 'fpedats', 'anndats_act', 'curr_act', 'permno', 're']] -ibes_crsp.rename(columns={'statpers': 'date'}, inplace=True) - -with open('re.pkl', 'wb') as f: - pkl.dump(ibes_crsp, f) \ No newline at end of file diff --git a/char60/rvar_capm.py b/char60/rvar_capm.py deleted file mode 100755 index fa3a01c..0000000 --- a/char60/rvar_capm.py +++ /dev/null @@ -1,168 +0,0 @@ -# CAPM residual variance -# Note: Please use the latest version of pandas, this version should support returning to pd.Series after rolling -# To get a faster speed, we split the big dataframe into small ones -# Then using different process to calculate the variance -# We use 20 process to calculate variance, you can change the number of process according to your CPU situation -# You can use the following code to check your CPU situation -# import multiprocessing -# multiprocessing.cpu_count() 
- -import pandas as pd -import numpy as np -import datetime as dt -import wrds -from dateutil.relativedelta import * -from pandas.tseries.offsets import * -import datetime -import pickle as pkl -import multiprocessing as mp - -################### -# Connect to WRDS # -################### -conn = wrds.Connection() - -# CRSP Block -crsp = conn.raw_sql(""" - select a.permno, a.date, a.ret, (a.ret - b.rf) as exret, b.mktrf - from crsp.dsf as a - left join ff.factors_daily as b - on a.date=b.date - where a.date >= '01/01/1959' - """) - -# sort variables by permno and date -crsp = crsp.sort_values(by=['permno', 'date']) - -# change variable format to int -crsp['permno'] = crsp['permno'].astype(int) - -# Line up date to be end of month -crsp['date'] = pd.to_datetime(crsp['date']) - -# find the closest trading day to the end of the month -crsp['monthend'] = crsp['date'] + MonthEnd(0) -crsp['date_diff'] = crsp['monthend'] - crsp['date'] -date_temp = crsp.groupby(['permno', 'monthend'])['date_diff'].min() -date_temp = pd.DataFrame(date_temp) # convert Series to DataFrame -date_temp.reset_index(inplace=True) -date_temp.rename(columns={'date_diff': 'min_diff'}, inplace=True) -crsp = pd.merge(crsp, date_temp, how='left', on=['permno', 'monthend']) -crsp['sig'] = np.where(crsp['date_diff'] == crsp['min_diff'], 1, np.nan) - -# label every date of month end -crsp['month_count'] = crsp[crsp['sig'] == 1].groupby(['permno']).cumcount() - -# label numbers of months for a firm -month_num = crsp[crsp['sig'] == 1].groupby(['permno'])['month_count'].tail(1) -month_num = month_num.astype(int) -month_num = month_num.reset_index(drop=True) - -# mark the number of each month to each day of this month -crsp['month_count'] = crsp.groupby(['permno'])['month_count'].fillna(method='bfill') - -# crate a firm list -df_firm = crsp.drop_duplicates(['permno']) -df_firm = df_firm[['permno']] -df_firm['permno'] = df_firm['permno'].astype(int) -df_firm = df_firm.reset_index(drop=True) -df_firm = 
df_firm.reset_index() -df_firm = df_firm.rename(columns={'index': 'count'}) -df_firm['month_num'] = month_num - -###################### -# Calculate residual # -###################### - - -def get_res_var(df, firm_list): - """ - - :param df: stock dataframe - :param firm_list: list of firms matching stock dataframe - :return: dataframe with variance of residual - """ - for firm, count, prog in zip(firm_list['permno'], firm_list['month_num'], range(firm_list['permno'].count()+1)): - prog = prog + 1 - print('processing permno %s' % firm, '/', 'finished', '%.2f%%' % ((prog/firm_list['permno'].count())*100)) - for i in range(count + 1): - # if you want to change the rolling window, please change here: i - 2 means 3 months is a window. - temp = df[(df['permno'] == firm) & (i - 2 <= df['month_count']) & (df['month_count'] <= i)] - # if observations in last 3 months are less 21, we drop the rvar of this month - if temp['permno'].count() < 21: - pass - else: - rolling_window = temp['permno'].count() - index = temp.tail(1).index - X = pd.DataFrame() - X[['mktrf']] = temp[['mktrf']] - X['intercept'] = 1 - X = X[['intercept', 'mktrf']] - X = np.mat(X) - Y = np.mat(temp[['exret']]) - res = (np.identity(rolling_window) - X.dot(X.T.dot(X).I).dot(X.T)).dot(Y) - res_var = res.var(ddof=1) - df.loc[index, 'rvar'] = res_var - return df - - -def sub_df(start, end, step): - """ - - :param start: the quantile to start cutting, usually it should be 0 - :param end: the quantile to end cutting, usually it should be 1 - :param step: quantile step - :return: a dictionary including all the 'firm_list' dataframe and 'stock data' dataframe - """ - # we use dict to store different sub dataframe - temp = {} - for i, h in zip(np.arange(start, end, step), range(int((end-start)/step))): - print('processing splitting dataframe:', round(i, 2), 'to', round(i + step, 2)) - if i == 0: # to get the left point - temp['firm' + str(h)] = df_firm[df_firm['count'] <= df_firm['count'].quantile(i + step)] - 
temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', - on='permno').dropna(subset=['count']) - else: - temp['firm' + str(h)] = df_firm[(df_firm['count'].quantile(i) < df_firm['count']) & ( - df_firm['count'] <= df_firm['count'].quantile(i + step))] - temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', - on='permno').dropna(subset=['count']) - return temp - - -def main(start, end, step): - """ - - :param start: the quantile to start cutting, usually it should be 0 - :param end: the quantile to end cutting, usually it should be 1 - :param step: quantile step - :return: a dataframe with calculated variance of residual - """ - df = sub_df(start, end, step) - pool = mp.Pool() - p_dict = {} - for i in range(int((end-start)/step)): - p_dict['p' + str(i)] = pool.apply_async(get_res_var, (df['crsp%s' % i], df['firm%s' % i],)) - pool.close() - pool.join() - result = pd.DataFrame() - print('processing pd.concat') - for h in range(int((end-start)/step)): - result = pd.concat([result, p_dict['p%s' % h].get()]) - return result - - -# calculate variance of residual through rolling window -# Note: please split dataframe according to your CPU situation. For example, we split dataframe to (1-0)/0.05 = 20 sub -# dataframes here, so the function will use 20 cores to calculate variance of residual. 
-if __name__ == '__main__': - crsp = main(0, 1, 0.05) - -# process dataframe -crsp = crsp.dropna(subset=['rvar']) # drop NA due to rolling -crsp = crsp.rename(columns={'rvar': 'rvar_capm'}) -crsp = crsp.reset_index(drop=True) -crsp = crsp[['permno', 'date', 'rvar_capm']] - -with open('rvar_capm.pkl', 'wb') as f: - pkl.dump(crsp, f) \ No newline at end of file diff --git a/char60/rvar_ff3.py b/char60/rvar_ff3.py deleted file mode 100755 index 36561a0..0000000 --- a/char60/rvar_ff3.py +++ /dev/null @@ -1,201 +0,0 @@ -# Fama & French 3 factors residual variance -# Note: Please use the latest version of pandas, this version should support returning to pd.Series after rolling -# To get a faster speed, we split the big dataframe into small ones -# Then using different process to calculate the variance -# We use 20 process to calculate variance, you can change the number of process according to your CPU situation -# You can use the following code to check your CPU situation -# import multiprocessing -# multiprocessing.cpu_count() - -import pandas as pd -import numpy as np -import datetime as dt -import wrds -from dateutil.relativedelta import * -from pandas.tseries.offsets import * -import datetime -import pickle as pkl -import multiprocessing as mp - -################### -# Connect to WRDS # -################### -conn = wrds.Connection() - -# CRSP Block -crsp = conn.raw_sql(""" - select a.permno, a.date, a.ret, (a.ret - b.rf) as exret, b.mktrf, b.smb, b.hml - from crsp.dsf as a - left join ff.factors_daily as b - on a.date=b.date - where a.date > '01/01/1959' - """) - -# sort variables by permno and date -crsp = crsp.sort_values(by=['permno', 'date']) - -# change variable format to int -crsp['permno'] = crsp['permno'].astype(int) - -# Line up date to be end of month -crsp['date'] = pd.to_datetime(crsp['date']) - -# find the closest trading day to the end of the month -crsp['monthend'] = crsp['date'] + MonthEnd(0) -crsp['date_diff'] = crsp['monthend'] - crsp['date'] 
-date_temp = crsp.groupby(['permno', 'monthend'])['date_diff'].min() -date_temp = pd.DataFrame(date_temp) # convert Series to DataFrame -date_temp.reset_index(inplace=True) -date_temp.rename(columns={'date_diff': 'min_diff'}, inplace=True) -crsp = pd.merge(crsp, date_temp, how='left', on=['permno', 'monthend']) -crsp['sig'] = np.where(crsp['date_diff'] == crsp['min_diff'], 1, np.nan) - -# label every date of month end -crsp['month_count'] = crsp[crsp['sig'] == 1].groupby(['permno']).cumcount() - -# label numbers of months for a firm -month_num = crsp[crsp['sig'] == 1].groupby(['permno'])['month_count'].tail(1) -month_num = month_num.astype(int) -month_num = month_num.reset_index(drop=True) - -# mark the number of each month to each day of this month -crsp['month_count'] = crsp.groupby(['permno'])['month_count'].fillna(method='bfill') - -# crate a firm list -df_firm = crsp.drop_duplicates(['permno']) -df_firm = df_firm[['permno']] -df_firm['permno'] = df_firm['permno'].astype(int) -df_firm = df_firm.reset_index(drop=True) -df_firm = df_firm.reset_index() -df_firm = df_firm.rename(columns={'index': 'count'}) -df_firm['month_num'] = month_num - -###################### -# Calculate the beta # -###################### -# function that get multiple beta -'''' -rolling_window = 60 # 60 trading days -crsp['beta_mktrf'] = np.nan -crsp['beta_smb'] = np.nan -crsp['beta_hml'] = np.nan - - -def get_beta(df): - """ - The original idea of calculate beta is using formula (X'MX)^(-1)X'MY, - where M = I - 1(1'1)^{-1}1, I is a identity matrix. 
- - """ - temp = crsp.loc[df.index] # extract the rolling sub dataframe from original dataframe - X = np.mat(temp[['mktrf', 'smb', 'hml']]) - Y = np.mat(temp[['exret']]) - ones = np.mat(np.ones(rolling_window)).T - M = np.identity(rolling_window) - ones.dot((ones.T.dot(ones)).I).dot(ones.T) - beta = (X.T.dot(M).dot(X)).I.dot((X.T.dot(M).dot(Y))) - crsp['beta_mktrf'].loc[df.index[-1:]] = beta[0] - crsp['beta_smb'].loc[df.index[-1:]] = beta[1] - crsp['beta_hml'].loc[df.index[-1:]] = beta[2] - return 0 # we do not need the rolling outcome since rolling cannot return different values in different columns - - -# calculate beta through rolling window -crsp_temp = crsp.groupby('permno').rolling(rolling_window).apply(get_beta, raw=False) -''' - -###################### -# Calculate residual # -###################### - - -def get_res_var(df, firm_list): - """ - - :param df: stock dataframe - :param firm_list: list of firms matching stock dataframe - :return: dataframe with variance of residual - """ - for firm, count, prog in zip(firm_list['permno'], firm_list['month_num'], range(firm_list['permno'].count()+1)): - prog = prog + 1 - print('processing permno %s' % firm, '/', 'finished', '%.2f%%' % ((prog/firm_list['permno'].count())*100)) - for i in range(count + 1): - # if you want to change the rolling window, please change here: i - 2 means 3 months is a window. 
- temp = df[(df['permno'] == firm) & (i - 2 <= df['month_count']) & (df['month_count'] <= i)] - # if observations in last 3 months are less 21, we drop the rvar of this month - if temp['permno'].count() < 21: - pass - else: - rolling_window = temp['permno'].count() - index = temp.tail(1).index - X = pd.DataFrame() - X[['mktrf', 'smb', 'hml']] = temp[['mktrf', 'smb', 'hml']] - X['intercept'] = 1 - X = X[['intercept', 'mktrf', 'smb', 'hml']] - X = np.mat(X) - Y = np.mat(temp[['exret']]) - res = (np.identity(rolling_window) - X.dot(X.T.dot(X).I).dot(X.T)).dot(Y) - res_var = res.var(ddof=1) - df.loc[index, 'rvar'] = res_var - return df - - -def sub_df(start, end, step): - """ - - :param start: the quantile to start cutting, usually it should be 0 - :param end: the quantile to end cutting, usually it should be 1 - :param step: quantile step - :return: a dictionary including all the 'firm_list' dataframe and 'stock data' dataframe - """ - # we use dict to store different sub dataframe - temp = {} - for i, h in zip(np.arange(start, end, step), range(int((end-start)/step))): - print('processing splitting dataframe:', round(i, 2), 'to', round(i + step, 2)) - if i == 0: # to get the left point - temp['firm' + str(h)] = df_firm[df_firm['count'] <= df_firm['count'].quantile(i + step)] - temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', - on='permno').dropna(subset=['count']) - else: - temp['firm' + str(h)] = df_firm[(df_firm['count'].quantile(i) < df_firm['count']) & ( - df_firm['count'] <= df_firm['count'].quantile(i + step))] - temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', - on='permno').dropna(subset=['count']) - return temp - - -def main(start, end, step): - """ - - :param start: the quantile to start cutting, usually it should be 0 - :param end: the quantile to end cutting, usually it should be 1 - :param step: quantile step - :return: a dataframe with calculated variance of residual - """ - df = sub_df(start, end, step) 
- pool = mp.Pool() - p_dict = {} - for i in range(int((end-start)/step)): - p_dict['p' + str(i)] = pool.apply_async(get_res_var, (df['crsp%s' % i], df['firm%s' % i],)) - pool.close() - pool.join() - result = pd.DataFrame() - print('processing pd.concat') - for h in range(int((end-start)/step)): - result = pd.concat([result, p_dict['p%s' % h].get()]) - return result - - -# calculate variance of residual through rolling window -# Note: please split dataframe according to your CPU situation. For example, we split dataframe to (1-0)/0.05 = 20 sub -# dataframes here, so the function will use 20 cores to calculate variance of residual. -if __name__ == '__main__': - crsp = main(0, 1, 0.05) - -# process dataframe -crsp = crsp.dropna(subset=['rvar']) # drop NA due to rolling -crsp = crsp.rename(columns={'rvar': 'rvar_ff3'}) -crsp = crsp.reset_index(drop=True) -crsp = crsp[['permno', 'date', 'rvar_ff3']] - -with open('rvar_ff3.pkl', 'wb') as f: - pkl.dump(crsp, f) \ No newline at end of file diff --git a/char60/rvar_mean.py b/char60/rvar_mean.py deleted file mode 100755 index 42297f4..0000000 --- a/char60/rvar_mean.py +++ /dev/null @@ -1,150 +0,0 @@ -# RVAR mean - -import pandas as pd -import numpy as np -import datetime as dt -import wrds -from dateutil.relativedelta import * -from pandas.tseries.offsets import * -import datetime -import pickle as pkl -import multiprocessing as mp - -################### -# Connect to WRDS # -################### -conn = wrds.Connection() - -# CRSP Block -crsp = conn.raw_sql(""" - select permno, date, ret - from crsp.dsf - where date >= '01/01/1959' - """) - -# sort variables by permno and date -crsp = crsp.sort_values(by=['permno', 'date']) - -# change variable format to int -crsp['permno'] = crsp['permno'].astype(int) - -# Line up date to be end of month -crsp['date'] = pd.to_datetime(crsp['date']) - -# find the closest trading day to the end of the month -crsp['monthend'] = crsp['date'] + MonthEnd(0) -crsp['date_diff'] = crsp['monthend'] - 
crsp['date'] -date_temp = crsp.groupby(['permno', 'monthend'])['date_diff'].min() -date_temp = pd.DataFrame(date_temp) # convert Series to DataFrame -date_temp.reset_index(inplace=True) -date_temp.rename(columns={'date_diff': 'min_diff'}, inplace=True) -crsp = pd.merge(crsp, date_temp, how='left', on=['permno', 'monthend']) -crsp['sig'] = np.where(crsp['date_diff'] == crsp['min_diff'], 1, np.nan) - -# label every date of month end -crsp['month_count'] = crsp[crsp['sig'] == 1].groupby(['permno']).cumcount() - -# label numbers of months for a firm -month_num = crsp[crsp['sig'] == 1].groupby(['permno'])['month_count'].tail(1) -month_num = month_num.astype(int) -month_num = month_num.reset_index(drop=True) - -# mark the number of each month to each day of this month -crsp['month_count'] = crsp.groupby(['permno'])['month_count'].fillna(method='bfill') - -# crate a firm list -df_firm = crsp.drop_duplicates(['permno']) -df_firm = df_firm[['permno']] -df_firm['permno'] = df_firm['permno'].astype(int) -df_firm = df_firm.reset_index(drop=True) -df_firm = df_firm.reset_index() -df_firm = df_firm.rename(columns={'index': 'count'}) -df_firm['month_num'] = month_num - -###################### -# Calculate variance # -###################### - - -def get_ret_var(df, firm_list): - """ - - :param df: stock dataframe - :param firm_list: list of firms matching stock dataframe - :return: dataframe with variance of residual - """ - for firm, count, prog in zip(firm_list['permno'], firm_list['month_num'], range(firm_list['permno'].count()+1)): - prog = prog + 1 - print('processing permno %s' % firm, '/', 'finished', '%.2f%%' % ((prog/firm_list['permno'].count())*100)) - for i in range(count + 1): - # if you want to change the rolling window, please change here: i - 2 means 3 months is a window. 
- temp = df[(df['permno'] == firm) & (i - 2 <= df['month_count']) & (df['month_count'] <= i)] - # if observations in last 3 months are less 21, we drop the rvar of this month - if temp['permno'].count() < 21: - pass - else: - index = temp.tail(1).index - ret_var = temp['ret'].var() - df.loc[index, 'rvar'] = ret_var - return df - -def sub_df(start, end, step): - """ - - :param start: the quantile to start cutting, usually it should be 0 - :param end: the quantile to end cutting, usually it should be 1 - :param step: quantile step - :return: a dictionary including all the 'firm_list' dataframe and 'stock data' dataframe - """ - # we use dict to store different sub dataframe - temp = {} - for i, h in zip(np.arange(start, end, step), range(int((end-start)/step))): - print('processing splitting dataframe:', round(i, 2), 'to', round(i + step, 2)) - if i == 0: # to get the left point - temp['firm' + str(h)] = df_firm[df_firm['count'] <= df_firm['count'].quantile(i + step)] - temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', - on='permno').dropna(subset=['count']) - else: - temp['firm' + str(h)] = df_firm[(df_firm['count'].quantile(i) < df_firm['count']) & ( - df_firm['count'] <= df_firm['count'].quantile(i + step))] - temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', - on='permno').dropna(subset=['count']) - return temp - - -def main(start, end, step): - """ - - :param start: the quantile to start cutting, usually it should be 0 - :param end: the quantile to end cutting, usually it should be 1 - :param step: quantile step - :return: a dataframe with calculated variance of residual - """ - df = sub_df(start, end, step) - pool = mp.Pool() - p_dict = {} - for i in range(int((end-start)/step)): - p_dict['p' + str(i)] = pool.apply_async(get_ret_var, (df['crsp%s' % i], df['firm%s' % i],)) - pool.close() - pool.join() - result = pd.DataFrame() - print('processing pd.concat') - for h in range(int((end-start)/step)): - result = 
pd.concat([result, p_dict['p%s' % h].get()]) - return result - - -# calculate variance of residual through rolling window -# Note: please split dataframe according to your CPU situation. For example, we split dataframe to (1-0)/0.05 = 20 sub -# dataframes here, so the function will use 20 cores to calculate variance of residual. -if __name__ == '__main__': - crsp = main(0, 1, 0.05) - -# process dataframe -crsp = crsp.dropna(subset=['rvar']) # drop NA due to rolling -crsp = crsp.rename(columns={'rvar': 'rvar_mean'}) -crsp = crsp.reset_index(drop=True) -crsp = crsp[['permno', 'date', 'rvar_mean']] - -with open('rvar_mean.pkl', 'wb') as f: - pkl.dump(crsp, f) \ No newline at end of file diff --git a/char60/std_dolvol.py b/char60/std_dolvol.py deleted file mode 100755 index fc3c2ef..0000000 --- a/char60/std_dolvol.py +++ /dev/null @@ -1,158 +0,0 @@ -# Fama & French 3 factors residual variance -# Note: Please use the latest version of pandas, this version should support returning to pd.Series after rolling -# To get a faster speed, we split the big dataframe into small ones -# Then using different process to calculate the variance -# We use 20 process to calculate variance, you can change the number of process according to your CPU situation -# You can use the following code to check your CPU situation -# import multiprocessing -# multiprocessing.cpu_count() - -import pandas as pd -import numpy as np -import datetime as dt -import wrds -from dateutil.relativedelta import * -from pandas.tseries.offsets import * -import datetime -import pickle as pkl -import multiprocessing as mp - -################### -# Connect to WRDS # -################### -conn = wrds.Connection() - -# CRSP Block -crsp = conn.raw_sql(""" - select a.permno, a.date, a.vol, a.prc - from crsp.dsf as a - where a.date > '01/01/1959' - """) - -# sort variables by permno and date -crsp = crsp.sort_values(by=['permno', 'date']) - -# change variable format to int -crsp['permno'] = crsp['permno'].astype(int) - 
-# Line up date to be end of month -crsp['date'] = pd.to_datetime(crsp['date']) - -# find the closest trading day to the end of the month -crsp['monthend'] = crsp['date'] + MonthEnd(0) -crsp['date_diff'] = crsp['monthend'] - crsp['date'] -date_temp = crsp.groupby(['permno', 'monthend'])['date_diff'].min() -date_temp = pd.DataFrame(date_temp) # convert Series to DataFrame -date_temp.reset_index(inplace=True) -date_temp.rename(columns={'date_diff': 'min_diff'}, inplace=True) -crsp = pd.merge(crsp, date_temp, how='left', on=['permno', 'monthend']) -crsp['sig'] = np.where(crsp['date_diff'] == crsp['min_diff'], 1, np.nan) - -# label every date of month end -crsp['month_count'] = crsp[crsp['sig'] == 1].groupby(['permno']).cumcount() - -# label numbers of months for a firm -month_num = crsp[crsp['sig'] == 1].groupby(['permno'])['month_count'].tail(1) -month_num = month_num.astype(int) -month_num = month_num.reset_index(drop=True) - -# mark the number of each month to each day of this month -crsp['month_count'] = crsp.groupby(['permno'])['month_count'].fillna(method='bfill') - -# crate a firm list -df_firm = crsp.drop_duplicates(['permno']) -df_firm = df_firm[['permno']] -df_firm['permno'] = df_firm['permno'].astype(int) -df_firm = df_firm.reset_index(drop=True) -df_firm = df_firm.reset_index() -df_firm = df_firm.rename(columns={'index': 'count'}) -df_firm['month_num'] = month_num - -###################### -# Calculate residual # -###################### - - -def get_baspread(df, firm_list): - """ - - :param df: stock dataframe - :param firm_list: list of firms matching stock dataframe - :return: dataframe with variance of residual - """ - for firm, count, prog in zip(firm_list['permno'], firm_list['month_num'], range(firm_list['permno'].count()+1)): - prog = prog + 1 - print('processing permno %s' % firm, '/', 'finished', '%.2f%%' % ((prog/firm_list['permno'].count())*100)) - for i in range(count + 1): - # if you want to change the rolling window, please change here: i - 2 
means 3 months is a window. - temp = df[(df['permno'] == firm) & (i - 2 <= df['month_count']) & (df['month_count'] <= i)] - if temp['permno'].count() < 21: - pass - else: - index = temp.tail(1).index - X = pd.DataFrame() - X[['prc', 'vol']] = temp[['prc', 'vol']] - std_dolvol = np.log(abs((X['vol']*X['prc']))).replace([np.inf, -np.inf], np.nan).std() - df.loc[index, 'std_dolvol'] = std_dolvol - return df - - -def sub_df(start, end, step): - """ - - :param start: the quantile to start cutting, usually it should be 0 - :param end: the quantile to end cutting, usually it should be 1 - :param step: quantile step - :return: a dictionary including all the 'firm_list' dataframe and 'stock data' dataframe - """ - # we use dict to store different sub dataframe - temp = {} - for i, h in zip(np.arange(start, end, step), range(int((end-start)/step))): - print('processing splitting dataframe:', round(i, 2), 'to', round(i + step, 2)) - if i == 0: # to get the left point - temp['firm' + str(h)] = df_firm[df_firm['count'] <= df_firm['count'].quantile(i + step)] - temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', - on='permno').dropna(subset=['count']) - else: - temp['firm' + str(h)] = df_firm[(df_firm['count'].quantile(i) < df_firm['count']) & ( - df_firm['count'] <= df_firm['count'].quantile(i + step))] - temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', - on='permno').dropna(subset=['count']) - return temp - - -def main(start, end, step): - """ - - :param start: the quantile to start cutting, usually it should be 0 - :param end: the quantile to end cutting, usually it should be 1 - :param step: quantile step - :return: a dataframe with calculated variance of residual - """ - df = sub_df(start, end, step) - pool = mp.Pool() - p_dict = {} - for i in range(int((end-start)/step)): - p_dict['p' + str(i)] = pool.apply_async(get_baspread, (df['crsp%s' % i], df['firm%s' % i],)) - pool.close() - pool.join() - result = pd.DataFrame() - 
print('processing pd.concat') - for h in range(int((end-start)/step)): - result = pd.concat([result, p_dict['p%s' % h].get()]) - return result - - -# calculate variance of residual through rolling window -# Note: please split dataframe according to your CPU situation. For example, we split dataframe to (1-0)/0.05 = 20 sub -# dataframes here, so the function will use 20 cores to calculate variance of residual. -if __name__ == '__main__': - crsp = main(0, 1, 0.05) - -# process dataframe -crsp = crsp.dropna(subset=['std_dolvol']) # drop NA due to rolling -crsp = crsp.reset_index(drop=True) -crsp = crsp[['permno', 'date', 'std_dolvol']] - -with open('std_dolvol.pkl', 'wb') as f: - pkl.dump(crsp, f) \ No newline at end of file diff --git a/char60/std_turn.py b/char60/std_turn.py deleted file mode 100755 index c5d30ec..0000000 --- a/char60/std_turn.py +++ /dev/null @@ -1,158 +0,0 @@ -# Fama & French 3 factors residual variance -# Note: Please use the latest version of pandas, this version should support returning to pd.Series after rolling -# To get a faster speed, we split the big dataframe into small ones -# Then using different process to calculate the variance -# We use 20 process to calculate variance, you can change the number of process according to your CPU situation -# You can use the following code to check your CPU situation -# import multiprocessing -# multiprocessing.cpu_count() - -import pandas as pd -import numpy as np -import datetime as dt -import wrds -from dateutil.relativedelta import * -from pandas.tseries.offsets import * -import datetime -import pickle as pkl -import multiprocessing as mp - -################### -# Connect to WRDS # -################### -conn = wrds.Connection() - -# CRSP Block -crsp = conn.raw_sql(""" - select a.permno, a.date, a.vol, a.shrout - from crsp.dsf as a - where a.date > '01/01/1959' - """) - -# sort variables by permno and date -crsp = crsp.sort_values(by=['permno', 'date']) - -# change variable format to int 
-crsp['permno'] = crsp['permno'].astype(int) - -# Line up date to be end of month -crsp['date'] = pd.to_datetime(crsp['date']) - -# find the closest trading day to the end of the month -crsp['monthend'] = crsp['date'] + MonthEnd(0) -crsp['date_diff'] = crsp['monthend'] - crsp['date'] -date_temp = crsp.groupby(['permno', 'monthend'])['date_diff'].min() -date_temp = pd.DataFrame(date_temp) # convert Series to DataFrame -date_temp.reset_index(inplace=True) -date_temp.rename(columns={'date_diff': 'min_diff'}, inplace=True) -crsp = pd.merge(crsp, date_temp, how='left', on=['permno', 'monthend']) -crsp['sig'] = np.where(crsp['date_diff'] == crsp['min_diff'], 1, np.nan) - -# label every date of month end -crsp['month_count'] = crsp[crsp['sig'] == 1].groupby(['permno']).cumcount() - -# label numbers of months for a firm -month_num = crsp[crsp['sig'] == 1].groupby(['permno'])['month_count'].tail(1) -month_num = month_num.astype(int) -month_num = month_num.reset_index(drop=True) - -# mark the number of each month to each day of this month -crsp['month_count'] = crsp.groupby(['permno'])['month_count'].fillna(method='bfill') - -# crate a firm list -df_firm = crsp.drop_duplicates(['permno']) -df_firm = df_firm[['permno']] -df_firm['permno'] = df_firm['permno'].astype(int) -df_firm = df_firm.reset_index(drop=True) -df_firm = df_firm.reset_index() -df_firm = df_firm.rename(columns={'index': 'count'}) -df_firm['month_num'] = month_num - -###################### -# Calculate residual # -###################### - - -def get_baspread(df, firm_list): - """ - - :param df: stock dataframe - :param firm_list: list of firms matching stock dataframe - :return: dataframe with variance of residual - """ - for firm, count, prog in zip(firm_list['permno'], firm_list['month_num'], range(firm_list['permno'].count()+1)): - prog = prog + 1 - print('processing permno %s' % firm, '/', 'finished', '%.2f%%' % ((prog/firm_list['permno'].count())*100)) - for i in range(count + 1): - # if you want to 
change the rolling window, please change here: i - 2 means 3 months is a window. - temp = df[(df['permno'] == firm) & (i - 2 <= df['month_count']) & (df['month_count'] <= i)] - if temp['permno'].count() < 21: - pass - else: - index = temp.tail(1).index - X = pd.DataFrame() - X[['vol', 'shrout']] = temp[['vol', 'shrout']] - std_turn = (X['vol'] / X['shrout']).std() - df.loc[index, 'std_turn'] = std_turn - return df - - -def sub_df(start, end, step): - """ - - :param start: the quantile to start cutting, usually it should be 0 - :param end: the quantile to end cutting, usually it should be 1 - :param step: quantile step - :return: a dictionary including all the 'firm_list' dataframe and 'stock data' dataframe - """ - # we use dict to store different sub dataframe - temp = {} - for i, h in zip(np.arange(start, end, step), range(int((end-start)/step))): - print('processing splitting dataframe:', round(i, 2), 'to', round(i + step, 2)) - if i == 0: # to get the left point - temp['firm' + str(h)] = df_firm[df_firm['count'] <= df_firm['count'].quantile(i + step)] - temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', - on='permno').dropna(subset=['count']) - else: - temp['firm' + str(h)] = df_firm[(df_firm['count'].quantile(i) < df_firm['count']) & ( - df_firm['count'] <= df_firm['count'].quantile(i + step))] - temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', - on='permno').dropna(subset=['count']) - return temp - - -def main(start, end, step): - """ - - :param start: the quantile to start cutting, usually it should be 0 - :param end: the quantile to end cutting, usually it should be 1 - :param step: quantile step - :return: a dataframe with calculated variance of residual - """ - df = sub_df(start, end, step) - pool = mp.Pool() - p_dict = {} - for i in range(int((end-start)/step)): - p_dict['p' + str(i)] = pool.apply_async(get_baspread, (df['crsp%s' % i], df['firm%s' % i],)) - pool.close() - pool.join() - result = 
pd.DataFrame() - print('processing pd.concat') - for h in range(int((end-start)/step)): - result = pd.concat([result, p_dict['p%s' % h].get()]) - return result - - -# calculate variance of residual through rolling window -# Note: please split dataframe according to your CPU situation. For example, we split dataframe to (1-0)/0.05 = 20 sub -# dataframes here, so the function will use 20 cores to calculate variance of residual. -if __name__ == '__main__': - crsp = main(0, 1, 0.05) - -# process dataframe -crsp = crsp.dropna(subset=['std_turn']) # drop NA due to rolling -crsp = crsp.reset_index(drop=True) -crsp = crsp[['permno', 'date', 'std_turn']] - -with open('std_turn.pkl', 'wb') as f: - pkl.dump(crsp, f) \ No newline at end of file diff --git a/char60/sue.py b/char60/sue.py deleted file mode 100755 index 8238cdb..0000000 --- a/char60/sue.py +++ /dev/null @@ -1,106 +0,0 @@ -# Calculate HSZ Replicating Anomalies -# SUE: Standardized Unexpected Earnings (Earnings surprise) - -import pandas as pd -import numpy as np -import datetime as dt -import wrds -from dateutil.relativedelta import * -from pandas.tseries.offsets import * -from pandasql import * -import pickle as pkl - -################### -# Connect to WRDS # -################### -conn = wrds.Connection() - -################### -# Compustat Block # -################### -comp = conn.raw_sql(""" - select gvkey, datadate, fyearq, fqtr, epspxq, ajexq - from comp.fundq - where indfmt = 'INDL' - and datafmt = 'STD' - and popsrc = 'D' - and consol = 'C' - and datadate >= '01/01/1959' - """) - -comp['datadate'] = pd.to_datetime(comp['datadate']) - -################### -# CCM Block # -################### -ccm = conn.raw_sql(""" - select gvkey, lpermno as permno, linktype, linkprim, - linkdt, linkenddt - from crsp.ccmxpf_linktable - where linktype in ('LU', 'LC') - """) - -ccm['linkdt'] = pd.to_datetime(ccm['linkdt']) -ccm['linkenddt'] = pd.to_datetime(ccm['linkenddt']) -# if linkenddt is missing then set to today date 
-ccm['linkenddt'] = ccm['linkenddt'].fillna(pd.to_datetime('today')) - -ccm1 = pd.merge(comp, ccm, how='left', on=['gvkey']) - -# set link date bounds -ccm2 = ccm1[(ccm1['datadate']>=ccm1['linkdt']) & (ccm1['datadate']<=ccm1['linkenddt'])] -ccm2 = ccm2[['gvkey', 'permno', 'datadate', 'fyearq', 'fqtr', 'epspxq', 'ajexq']] - -# the time series of exspxq/ajexq -ccm2['eps'] = ccm2['epspxq']/ccm2['ajexq'] -ccm2.drop_duplicates(['permno', 'datadate'], inplace=True) - -# merge lag1 to lag9, then calculate stand deviation -ccm2 = ccm2[ccm2['eps'].notna()] -ccm2['count'] = ccm2.groupby('permno').cumcount() + 1 -ccm2.sort_values(by=['permno', 'datadate'], inplace=True) - -ccm2['e1'] = ccm2.groupby(['permno'])['eps'].shift(1) -ccm2['e2'] = ccm2.groupby(['permno'])['eps'].shift(2) -ccm2['e3'] = ccm2.groupby(['permno'])['eps'].shift(3) -ccm2['e4'] = ccm2.groupby(['permno'])['eps'].shift(4) -ccm2['e5'] = ccm2.groupby(['permno'])['eps'].shift(5) -ccm2['e6'] = ccm2.groupby(['permno'])['eps'].shift(6) -ccm2['e7'] = ccm2.groupby(['permno'])['eps'].shift(7) -ccm2['e8'] = ccm2.groupby(['permno'])['eps'].shift(8) - -condlist = [ccm2['count']<=6, - ccm2['count']==7, - ccm2['count']==8, - ccm2['count']>=9] -choicelist = [np.nan, - ccm2[['e8', 'e7', 'e6', 'e5', 'e4', 'e3']].std(axis=1), - ccm2[['e8', 'e7', 'e6', 'e5', 'e4', 'e3', 'e2']].std(axis=1), - ccm2[['e8', 'e7', 'e6', 'e5', 'e4', 'e3', 'e2', 'e1']].std(axis=1)] -ccm2['sue_std'] = np.select(condlist, choicelist, default=np.nan) - -ccm2['sue'] = (ccm2['eps'] - ccm2['e4'])/ccm2['sue_std'] - -# populate the quarterly sue to monthly -crsp_msf = conn.raw_sql(""" - select distinct date - from crsp.msf - where date >= '01/01/1959' - """) - -ccm2['datadate'] = pd.to_datetime(ccm2['datadate']) -ccm2['plus12m'] = ccm2['datadate'] + np.timedelta64(12, 'M') -ccm2['plus12m'] = ccm2['plus12m'] + MonthEnd(0) - -df = sqldf("""select a.*, b.date - from ccm2 a left join crsp_msf b - on a.datadate <= b.date - and a.plus12m >= b.date - order by 
a.permno, b.date, a.datadate desc;""", globals()) - -df = df.drop_duplicates(['permno', 'date']) -df['datadate'] = pd.to_datetime(df['datadate']) -df = df[['gvkey', 'permno', 'datadate', 'date', 'sue']] - -with open('sue.pkl', 'wb') as f: - pkl.dump(df, f) \ No newline at end of file diff --git a/char60/zerotrade.py b/char60/zerotrade.py deleted file mode 100755 index 8f10d4f..0000000 --- a/char60/zerotrade.py +++ /dev/null @@ -1,161 +0,0 @@ -# Fama & French 3 factors residual variance -# Note: Please use the latest version of pandas, this version should support returning to pd.Series after rolling -# To get a faster speed, we split the big dataframe into small ones -# Then using different process to calculate the variance -# We use 20 process to calculate variance, you can change the number of process according to your CPU situation -# You can use the following code to check your CPU situation -# import multiprocessing -# multiprocessing.cpu_count() - -import pandas as pd -import numpy as np -import datetime as dt -import wrds -from dateutil.relativedelta import * -from pandas.tseries.offsets import * -import datetime -import pickle as pkl -import multiprocessing as mp - -################### -# Connect to WRDS # -################### -conn = wrds.Connection() - -# CRSP Block -crsp = conn.raw_sql(""" - select a.permno, a.date, a.vol, a.shrout - from crsp.dsf as a - where a.date > '01/01/1959' - """) - -# sort variables by permno and date -crsp = crsp.sort_values(by=['permno', 'date']) - -# change variable format to int -crsp['permno'] = crsp['permno'].astype(int) - -# Line up date to be end of month -crsp['date'] = pd.to_datetime(crsp['date']) - -# find the closest trading day to the end of the month -crsp['monthend'] = crsp['date'] + MonthEnd(0) -crsp['date_diff'] = crsp['monthend'] - crsp['date'] -date_temp = crsp.groupby(['permno', 'monthend'])['date_diff'].min() -date_temp = pd.DataFrame(date_temp) # convert Series to DataFrame 
-date_temp.reset_index(inplace=True) -date_temp.rename(columns={'date_diff': 'min_diff'}, inplace=True) -crsp = pd.merge(crsp, date_temp, how='left', on=['permno', 'monthend']) -crsp['sig'] = np.where(crsp['date_diff'] == crsp['min_diff'], 1, np.nan) - -# label every date of month end -crsp['month_count'] = crsp[crsp['sig'] == 1].groupby(['permno']).cumcount() - -# label numbers of months for a firm -month_num = crsp[crsp['sig'] == 1].groupby(['permno'])['month_count'].tail(1) -month_num = month_num.astype(int) -month_num = month_num.reset_index(drop=True) - -# mark the number of each month to each day of this month -crsp['month_count'] = crsp.groupby(['permno'])['month_count'].fillna(method='bfill') - -# crate a firm list -df_firm = crsp.drop_duplicates(['permno']) -df_firm = df_firm[['permno']] -df_firm['permno'] = df_firm['permno'].astype(int) -df_firm = df_firm.reset_index(drop=True) -df_firm = df_firm.reset_index() -df_firm = df_firm.rename(columns={'index': 'count'}) -df_firm['month_num'] = month_num - -###################### -# Calculate residual # -###################### - - -def get_baspread(df, firm_list): - """ - - :param df: stock dataframe - :param firm_list: list of firms matching stock dataframe - :return: dataframe with variance of residual - """ - for firm, count, prog in zip(firm_list['permno'], firm_list['month_num'], range(firm_list['permno'].count()+1)): - prog = prog + 1 - print('processing permno %s' % firm, '/', 'finished', '%.2f%%' % ((prog/firm_list['permno'].count())*100)) - for i in range(count + 1): - # if you want to change the rolling window, please change here: i - 2 means 3 months is a window. 
- temp = df[(df['permno'] == firm) & (i - 2 <= df['month_count']) & (df['month_count'] <= i)] - if temp['permno'].count() < 21: - pass - else: - index = temp.tail(1).index - X = pd.DataFrame() - X[['vol', 'shrout']] = temp[['vol', 'shrout']] - X['countzero'] = np.where(X['vol'] == 0, 1, 0) - X['turn'] = (X['vol'] / X['shrout']) - X['turn'] = np.where(X['turn'] == 0, np.inf, X['turn']) - zerotrade = (X['countzero']+((1/X['turn'])/480000))*21/X['vol'].count() - df.loc[index, 'zerotrade'] = zerotrade - return df - - -def sub_df(start, end, step): - """ - - :param start: the quantile to start cutting, usually it should be 0 - :param end: the quantile to end cutting, usually it should be 1 - :param step: quantile step - :return: a dictionary including all the 'firm_list' dataframe and 'stock data' dataframe - """ - # we use dict to store different sub dataframe - temp = {} - for i, h in zip(np.arange(start, end, step), range(int((end-start)/step))): - print('processing splitting dataframe:', round(i, 2), 'to', round(i + step, 2)) - if i == 0: # to get the left point - temp['firm' + str(h)] = df_firm[df_firm['count'] <= df_firm['count'].quantile(i + step)] - temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', - on='permno').dropna(subset=['count']) - else: - temp['firm' + str(h)] = df_firm[(df_firm['count'].quantile(i) < df_firm['count']) & ( - df_firm['count'] <= df_firm['count'].quantile(i + step))] - temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', - on='permno').dropna(subset=['count']) - return temp - - -def main(start, end, step): - """ - - :param start: the quantile to start cutting, usually it should be 0 - :param end: the quantile to end cutting, usually it should be 1 - :param step: quantile step - :return: a dataframe with calculated variance of residual - """ - df = sub_df(start, end, step) - pool = mp.Pool() - p_dict = {} - for i in range(int((end-start)/step)): - p_dict['p' + str(i)] = 
pool.apply_async(get_baspread, (df['crsp%s' % i], df['firm%s' % i],)) - pool.close() - pool.join() - result = pd.DataFrame() - print('processing pd.concat') - for h in range(int((end-start)/step)): - result = pd.concat([result, p_dict['p%s' % h].get()]) - return result - - -# calculate variance of residual through rolling window -# Note: please split dataframe according to your CPU situation. For example, we split dataframe to (1-0)/0.05 = 20 sub -# dataframes here, so the function will use 20 cores to calculate variance of residual. -if __name__ == '__main__': - crsp = main(0, 1, 0.05) - -# process dataframe -crsp = crsp.dropna(subset=['zerotrade']) # drop NA due to rolling -crsp = crsp.reset_index(drop=True) -crsp = crsp[['permno', 'date', 'zerotrade']] - -with open('zerotrade.pkl', 'wb') as f: - pkl.dump(crsp, f) \ No newline at end of file diff --git a/py-dgtw/.DS_Store b/py-dgtw/.DS_Store deleted file mode 100755 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0= '01/01/1970' - """) - -comp['datadate']=pd.to_datetime(comp['datadate']) #convert datadate to date fmt -comp['year']=comp['datadate'].dt.year - -comp = comp[comp['seq']>0] - -# create preferrerd stock: -# 1st choice: Preferred stock - Redemption Value -# 2nd choice: Preferred stock - Liquidating Value -# 3rd choice: Preferred stock - Carrying Value, Stock (Capital) - Total -comp['pref']=np.where(comp['pstkrv'].isnull(), comp['pstkl'], comp['pstkrv']) -comp['pref']=np.where(comp['pref'].isnull(),comp['pstk'], comp['pref']) -comp['pref']=np.where(comp['pref'].isnull(),0,comp['pref']) - -# fill in missing values for deferred taxes and investment tax credit -comp['txdb']=comp['txdb'].fillna(0) -comp['itcb']=comp['itcb'].fillna(0) - -# create book equity -# 
Daniel and Titman (JF 1997): -# BE = stockholders' equity + deferred taxes + investment tax credit - Preferred Stock -comp['be']=comp['seq']+comp['txdb']+comp['itcb']-comp['pref'] - -# keep only records with non-negative book equity -comp = comp[comp['be']>=0] -comp=comp[['gvkey','datadate','year','be','sich']] - - -######################### -# Add Historical PERMCO # -######################### -ccm=conn.raw_sql(""" - select gvkey, lpermco as permco, linktype, linkprim, - linkdt, linkenddt - from crsp.ccmxpf_linktable - where (linktype ='LU' or linktype='LC') - """) - -ccm['linkdt']=pd.to_datetime(ccm['linkdt']) -ccm['linkenddt']=pd.to_datetime(ccm['linkenddt']) -# if linkenddt is missing then set to today date -ccm['linkenddt']=ccm['linkenddt'].fillna(pd.to_datetime('today')) - -ccm1=pd.merge(comp,ccm,how='left',on=['gvkey']) -ccm1['jdate']=ccm1['datadate']+MonthEnd(0) -ccm1['year']=ccm1.datadate.dt.year - -# set link date bounds -comp2=ccm1[(ccm1['datadate']>=ccm1['linkdt'])&(ccm1['datadate']<=ccm1['linkenddt'])] -comp2=comp2[['gvkey','permco','datadate', 'year','jdate', 'be', 'sich', 'linkprim']] - - -# link comp and crsp to calculate book-to-market ratio each fiscal year end -comp3=pd.merge(comp2, crsp_m[['permno','permco','date','jdate','siccd','me','me_comp']],\ - how='inner', on=['permco', 'jdate']) -comp3['bm']=comp3['be'].div(comp3['me_comp']) - -comp3 = comp3.sort_values(['permno', 'year', 'datadate', 'linkprim', 'bm'])\ - .drop_duplicates() - -# pick max datadate for a given permno year combo (firm changes fiscal period) -maxdatadate=comp3.groupby(['permno','year'])['datadate'].max()\ - .reset_index() - -comp3 = pd.merge(comp3, maxdatadate, how='inner', on=['permno','year','datadate']) - -######################### -# Assign Fama-French 48 # -######################### - -# function to assign ffi48 classification -def ffi48(row): - if (100<=row['sic'] <=299) or (700<=row['sic']<=799) or (910<=row['sic']<=919) or (row['sic']==2048): - ffi48=1 - 
ffi48_desc='Agric' - elif (2000<=row['sic']<=2046) or (2050<=row['sic']<=2063) or (2070<=row['sic']<=2079)\ - or (2090<=row['sic']<=2092) or (row['sic']==2095) or (2098<=row['sic']<=2099): - ffi48=2 - ffi48_desc='Food' - elif (2064<=row['sic']<=2068) or (2086<=row['sic']<=2087) or (2096<=row['sic']<=2097): - ffi48=3 - ffi48_desc='Soda' - elif (row['sic']==2080) or (2082<=row['sic']<=2085): - ffi48=4 - ffi48_desc='Beer' - elif (2100<=row['sic']<=2199): - ffi48=5 - ffi48_desc='Smoke' - elif (920<=row['sic']<=999) or (3650<=row['sic']<=3652) or (row['sic']==3732) or (3930<=row['sic']<=3931) or (3940<=row['sic']<=3949): - ffi48=6 - ffi48_desc='Toys' - elif (7800<=row['sic']<=7833) or (7840<=row['sic']<=7841) or(row['sic']==7900)or (7910<=row['sic']<=7911) or (7920<=row['sic']<=7933)\ - or (7940<=row['sic']<=7949) or (row['sic']==7980) or (7990<=row['sic']<=7999): - ffi48=7 - ffi48_desc='Fun' - elif (2700<=row['sic']<=2749) or (2770<=row['sic']<=2771) or (2780<=row['sic']<=2799): - ffi48=8 - ffi48_desc='Books' - elif (row['sic']==2047) or (2391<=row['sic']<=2392) or (2510<=row['sic']<=2519) or (2590<=row['sic']<=2599) or (2840<=row['sic']<=2844)\ - or (3160<=row['sic']<=3161) or (3170<=row['sic']<=3172) or (3190<=row['sic']<=3199) or (row['sic']==3229) or (row['sic']==3260)\ - or (3262<=row['sic']<=3263) or (row['sic']==3269) or (3230<=row['sic']<=3231) or(3630<=row['sic']<=3639) or (3750<=row['sic']<=3751)\ - or (row['sic']==3800) or (3860<=row['sic']<=3861) or (3870<=row['sic']<=3873) or (3910<=row['sic']<=3911) or (3914<=row['sic']<=3915)\ - or (3960<=row['sic']<=3962) or (row['sic']==3991) or (row['sic']==3995): - ffi48=9 - ffi48_desc='Hshld' - elif (2300<=row['sic']<=2390) or (3020<=row['sic']<=3021) or (3100<=row['sic']<=3111)\ - or (3130<=row['sic']<=3131) or (3140<=row['sic']<=3151) or (3963<=row['sic']<=3965): - ffi48=10 - ffi48_desc='Clths' - elif (8000<=row['sic']<=8099): - ffi48=11 - ffi48_desc='Hlth' - elif (row['sic']==3693) or (3840<=row['sic']<=3851): - 
ffi48=12 - ffi48_desc='MedEq' - elif (2830<=row['sic']<=2831) or (2833<=row['sic']<=2836): - ffi48=13 - ffi48_desc='Drugs' - elif (2800<=row['sic']<=2829) or (2850<=row['sic']<=2879) or (2890<=row['sic']<=2899): - ffi48=14 - ffi48_desc='Chems' - elif (row['sic']==3031) or (row['sic']==3041) or (3050<=row['sic']<=3053) or (3060<=row['sic']<=3069) or (3070<=row['sic']<=3099): - ffi48=15 - ffi48_desc='Rubbr' - elif (2200<=row['sic']<=2284) or (2290<=row['sic']<=2295) or (2297<=row['sic']<=2299) or (2393<=row['sic']<=2395) or (2397<=row['sic']<=2399): - ffi48=16 - ffi48_desc='Txtls' - elif (800<=row['sic']<=899) or (2400<=row['sic']<=2439) or (2450<=row['sic']<=2459) or (2490<=row['sic']<=2499) or (2660<=row['sic']<=2661)\ - or (2950<=row['sic']<=2952) or (row['sic']==3200) or (3210<=row['sic']<=3211) or (3240<=row['sic']<=3241) or (3250<=row['sic']<=3259)\ - or (row['sic']==3261) or (row['sic']==3264) or (3270<=row['sic']<=3275) or (3280<=row['sic']<=3281) or (3290<=row['sic']<=3293)\ - or (3295<=row['sic']<=3299) or (3420<=row['sic']<=3433) or (3440<=row['sic']<=3442) or (row['sic']==3446) or (3448<=row['sic']<=3452)\ - or (3490<=row['sic']<=3499) or (row['sic']==3996): - ffi48=17 - ffi48_desc='BldMt' - elif (1500<=row['sic']<=1511) or (1520<=row['sic']<=1549) or (1600<=row['sic']<=1799): - ffi48=18 - ffi48_desc='Cnstr' - elif (row['sic']==3300) or (3310<=row['sic']<=3317) or (3320<=row['sic']<=3325) or (3330<=row['sic']<=3341) or(3350<=row['sic']<=3357)\ - or (3360<=row['sic']<=3379) or (3390<=row['sic']<=3399): - ffi48=19 - ffi48_desc='Steel' - elif (row['sic']==3400) or (3443<=row['sic']<=3444) or (3460<=row['sic']<=3479): - ffi48=20 - ffi48_desc='FabPr' - elif (3510<=row['sic']<=3536) or (row['sic']==3538) or (3540<=row['sic']<=3569)\ - or (3580<=row['sic']<=3582) or (3585<=row['sic']<=3586) or (3589<=row['sic']<=3599): - ffi48=21 - ffi48_desc='Mach' - elif (row['sic']==3600) or (3610<=row['sic']<=3613) or (3620<=row['sic']<=3621) or (3623<=row['sic']<=3629) or 
(3640<=row['sic']<=3646)\ - or (3648<=row['sic']<=3649) or (row['sic']==3660) or (3690<=row['sic']<=3692) or (row['sic']==3699): - ffi48=22 - ffi48_desc='ElcEq' - elif (row['sic']==2296) or (row['sic']==2396) or (3010<=row['sic']<=3011) or (row['sic']==3537) or (row['sic']==3647) or (row['sic']==3694)\ - or (row['sic']==3700) or (3710<=row['sic']<=3711) or (3713<=row['sic']<=3716) or (3790<=row['sic']<=3792) or (row['sic']==3799): - ffi48=23 - ffi48_desc='Autos' - elif (3720<=row['sic']<=3721) or (3723<=row['sic']<=3725) or (3728<=row['sic']<=3729): - ffi48=24 - ffi48_desc='Aero' - elif (3730<=row['sic']<=3731) or (3740<=row['sic']<=3743): - ffi48=25 - ffi48_desc='Ships' - elif (3760<=row['sic']<=3769) or (row['sic']==3795) or (3480<=row['sic']<=3489): - ffi48=26 - ffi48_desc='Guns' - elif (1040<=row['sic']<=1049): - ffi48=27 - ffi48_desc='Gold' - elif (1000<=row['sic']<=1039) or (1050<=row['sic']<=1119) or (1400<=row['sic']<=1499): - ffi48=28 - ffi48_desc='Mines' - elif (1200<=row['sic']<=1299): - ffi48=29 - ffi48_desc='Coal' - elif (row['sic']==1300) or (1310<=row['sic']<=1339) or (1370<=row['sic']<=1382) or (row['sic']==1389) or (2900<=row['sic']<=2912) or (2990<=row['sic']<=2999): - ffi48=30 - ffi48_desc='Oil' - elif (row['sic']==4900) or (4910<=row['sic']<=4911) or (4920<=row['sic']<=4925) or (4930<=row['sic']<=4932) or (4939<=row['sic']<=4942): - ffi48=31 - ffi48_desc='Util' - elif (row['sic']==4800) or (4810<=row['sic']<=4813) or (4820<=row['sic']<=4822) or (4830<=row['sic']<=4841) or (4880<=row['sic']<=4892) or (row['sic']==4899): - ffi48=32 - ffi48_desc='Telcm' - elif (7020<=row['sic']<=7021) or (7030<=row['sic']<=7033) or (row['sic']==7200) or (7210<=row['sic']<=7212) or (7214<=row['sic']<=7217)\ - or (7219<=row['sic']<=7221) or (7230<=row['sic']<=7231) or (7240<=row['sic']<=7241) or (7250<=row['sic']<=7251) or (7260<=row['sic']<=7299)\ - or (row['sic']==7395) or (row['sic']==7500) or (7520<=row['sic']<=7549) or (row['sic']==7600) or (row['sic']==7620)\ - 
or (7622<=row['sic']<=7623) or (7629<=row['sic']<=7631) or (7640<=row['sic']<=7641) or (7690<=row['sic']<=7699) or (8100<=row['sic']<=8499)\ - or (8600<=row['sic']<=8699) or (8800<=row['sic']<=8899) or (7510<=row['sic']<=7515): - ffi48=33 - ffi48_desc='PerSv' - elif (2750<=row['sic']<=2759) or (row['sic']==3993) or (row['sic']==7218) or (row['sic']==7300) or (7310<=row['sic']<=7342)\ - or (7349<=row['sic']<=7353) or (7359<=row['sic']<=7372) or (7374<=row['sic']<=7385) or (7389<=row['sic']<=7394) or (7396<=row['sic']<=7397)\ - or (row['sic']==7399) or (row['sic']==7519) or (row['sic']==8700) or (8710<=row['sic']<=8713) or (8720<=row['sic']<=8721) \ - or (8730<=row['sic']<=8734) or (8740<=row['sic']<=8748) or (8900<=row['sic']<=8911) or (8920<=row['sic']<=8999) or (4220<=row['sic']<=4229): - ffi48=34 - ffi48_desc='BusSv' - elif (3570<=row['sic']<=3579) or (3680<=row['sic']<=3689) or (row['sic']==3695) or (row['sic']==7373): - ffi48=35 - ffi48_desc='Comps' - elif (row['sic']==3622) or (3661<=row['sic']<=3666) or (3669<=row['sic']<=3679) or (row['sic']==3810) or (row['sic']==3812): - ffi48=36 - ffi48_desc='Chips' - elif (row['sic']==3811) or (3820<=row['sic']<=3827) or (3829<=row['sic']<=3839): - ffi48=37 - ffi48_desc='LabEq' - elif (2520<=row['sic']<=2549) or (2600<=row['sic']<=2639) or (2670<=row['sic']<=2699) or (2760<=row['sic']<=2761) or (3950<=row['sic']<=3955): - ffi48=38 - ffi48_desc='Paper' - elif (2440<=row['sic']<=2449) or (2640<=row['sic']<=2659) or (3220<=row['sic']<=3221) or (3410<=row['sic']<=3412): - ffi48=39 - ffi48_desc='Boxes' - elif (4000<=row['sic']<=4013) or (4040<=row['sic']<=4049) or (row['sic']==4100) or (4110<=row['sic']<=4121) or (4130<=row['sic']<=4131)\ - or (4140<=row['sic']<=4142) or (4150<=row['sic']<=4151) or (4170<=row['sic']<=4173) or (4190<=row['sic']<=4200)\ - or (4210<=row['sic']<=4219) or (4230<=row['sic']<=4231) or (4240<=row['sic']<=4249) or (4400<=row['sic']<=4700) or (4710<=row['sic']<=4712)\ - or (4720<=row['sic']<=4749) or 
(row['sic']==4780) or (4782<=row['sic']<=4785) or (row['sic']==4789): - ffi48=40 - ffi48_desc='Trans' - elif (row['sic']==5000) or (5010<=row['sic']<=5015) or (5020<=row['sic']<=5023) or (5030<=row['sic']<=5060) or (5063<=row['sic']<=5065)\ - or (5070<=row['sic']<=5078) or (5080<=row['sic']<=5088) or (5090<=row['sic']<=5094) or (5099<=row['sic']<=5100)\ - or (5110<=row['sic']<=5113) or (5120<=row['sic']<=5122) or (5130<=row['sic']<=5172) or (5180<=row['sic']<=5182) or (5190<=row['sic']<=5199): - ffi48=41 - ffi48_desc='Whlsl' - elif (row['sic']==5200) or (5210<=row['sic']<=5231) or (5250<=row['sic']<=5251) or (5260<=row['sic']<=5261) or (5270<=row['sic']<=5271)\ - or (row['sic']==5300) or (5310<=row['sic']<=5311) or (row['sic']==5320) or (5330<=row['sic']<=5331) or (row['sic']==5334)\ - or (5340<=row['sic']<=5349) or (5390<=row['sic']<=5400) or (5410<=row['sic']<=5412) or (5420<=row['sic']<=5469) or (5490<=row['sic']<=5500)\ - or (5510<=row['sic']<=5579) or (5590<=row['sic']<=5700) or (5710<=row['sic']<=5722) or (5730<=row['sic']<=5736) or (5750<=row['sic']<=5799)\ - or (row['sic']==5900) or (5910<=row['sic']<=5912) or (5920<=row['sic']<=5932) or (5940<=row['sic']<=5990) or (5992<=row['sic']<=5995) or (row['sic']==5999): - ffi48=42 - ffi48_desc='Rtail' - elif (5800<=row['sic']<=5829) or (5890<=row['sic']<=5899) or (row['sic']==7000) or (7010<=row['sic']<=7019) or (7040<=row['sic']<=7049) or (row['sic']==7213): - ffi48=43 - ffi48_desc='Meals' - elif (row['sic']==6000) or (6010<=row['sic']<=6036) or (6040<=row['sic']<=6062) or (6080<=row['sic']<=6082) or (6090<=row['sic']<=6100)\ - or (6110<=row['sic']<=6113) or (6120<=row['sic']<=6179) or (6190<=row['sic']<=6199): - ffi48=44 - ffi48_desc='Banks' - elif (row['sic']==6300) or (6310<=row['sic']<=6331) or (6350<=row['sic']<=6351) or (6360<=row['sic']<=6361) or (6370<=row['sic']<=6379) or (6390<=row['sic']<=6411): - ffi48=45 - ffi48_desc='Insur' - elif (row['sic']==6500) or (row['sic']==6510) or (6512<=row['sic']<=6515) 
or (6517<=row['sic']<=6532) or (6540<=row['sic']<=6541)\ - or (6550<=row['sic']<=6553) or (6590<=row['sic']<=6599) or (6610<=row['sic']<=6611): - ffi48=46 - ffi48_desc='RlEst' - elif (6200<=row['sic']<=6299) or (row['sic']==6700) or (6710<=row['sic']<=6726) or (6730<=row['sic']<=6733) or (6740<=row['sic']<=6779)\ - or (6790<=row['sic']<=6795) or (6798<=row['sic']<=6799): - ffi48=47 - ffi48_desc='Fin' - elif (4950<=row['sic']<=4961) or (4970<=row['sic']<=4971) or (4990<=row['sic']<=4991) or (row['sic']==9999): - ffi48=48 - ffi48_desc='Other' - else: - ffi48=np.nan - ffi48_desc='' - return pd.Series({'sic': row['sic'], 'ffi48': ffi48, 'ffi48_desc': ffi48_desc}) - -# assign SIC code -comp4 = comp3 -# First use historical Compustat SIC Code -# Then if missing use historical CRSP SIC Code -comp4['sic']=np.where(comp4['sich']>0, comp4['sich'], comp4['siccd']) - -# and adjust some SIC code to fit F&F 48 ind delineation -comp4['sic']=np.where((comp4['sic'].isin([3990, 9995, 9997])) & (comp4['siccd']>0) & (comp4['sic'] != comp4['siccd']), \ - comp4['siccd'], comp4['sic']) -comp4['sic']=np.where(comp4['sic'].isin([3990,3999]), 3991, comp4['sic']) -comp4['sic']=comp4.sic.astype(int) - -# assign the ffi48 function to comp4 -_sic = comp4['sic'].unique() -_sicff = pd.DataFrame(_sic).rename(columns={0:'sic'}) -_sicff = _sicff.apply(ffi48, axis=1) -comp4 = pd.merge(comp4, _sicff, how='left', on=['sic']) - -# keep only records with non-missing bm and ffi48 classification -comp4 = comp4[(comp4['bm'] != np.NaN) & (comp4['ffi48_desc'] !='')] -comp4 = comp4.drop(['sich','siccd','datadate'], axis=1) -comp4=comp4.sort_values(['ffi48','year']) - - -######################### -# Industry BM Average # -######################### - -# Calculate BM Industry Average Each Period -comp4_tmp = comp4[(comp4['ffi48']>0)&(comp4['bm']>=0)] -bm_ind = comp4_tmp.groupby(['ffi48','year'])['bm'].mean().reset_index().rename(columns={'bm':'bmind'}) - -# Calculate Long-Term Industry BtM Average -bm_ind['n'] = 
bm_ind.groupby(['ffi48'])['year'].cumcount() -bm_ind['sumbm']=bm_ind.groupby(['ffi48'])['bmind'].cumsum() -bm_ind['bmavg'] = bm_ind['sumbm']/(bm_ind['n']+1) -bm_ind = bm_ind.drop(['n','sumbm'], axis=1) - -# Adjust Firm-Specific BtM with Industry Averages -comp5 = pd.merge(comp4, bm_ind, how='left',on=['ffi48','year']) -comp5['bm_adj'] = comp5['bm']-comp5['bmavg'] - - -######################### -# Momentum Factor # -######################### - -# Create (12,1) Momentum Factor with at least 6 months of returns -_tmp_crsp = crsp_m[['permno','date','ret', 'me', 'exchcd']].sort_values(['permno','date']).set_index('date') -#replace missing return with 0 -_tmp_crsp['ret']=_tmp_crsp['ret'].fillna(0) -_tmp_crsp['logret']=np.log(1+_tmp_crsp['ret']) -_tmp_cumret = _tmp_crsp.groupby(['permno'])['logret'].rolling(12, min_periods=7).sum() -_tmp_cumret = _tmp_cumret.reset_index() -_tmp_cumret['cumret']=np.exp(_tmp_cumret['logret'])-1 - -sizemom = pd.merge(_tmp_crsp.reset_index(), _tmp_cumret[['permno','date','cumret']], how='left', on=['permno','date']) -sizemom['mom']=sizemom.groupby('permno')['cumret'].shift(1) -sizemom=sizemom[sizemom['date'].dt.month==6].drop(['logret','cumret'], axis=1).rename(columns={'me':'size'}) - - -######################### -# NYSE Size Breakpoint # -######################### - -# Get Size Breakpoints for NYSE firms -sizemom=sizemom.sort_values(['date','permno']).drop_duplicates() -nyse = sizemom[sizemom['exchcd']==1] -nyse_break = nyse.groupby(['date'])['size'].describe(percentiles=[.2,.4,.6,.8]).reset_index() -nyse_break = nyse_break[['date','20%','40%','60%','80%']]\ -.rename(columns={'20%':'dec20', '40%':'dec40', '60%':'dec60','80%':'dec80'}) - -sizemom = pd.merge(sizemom, nyse_break, how='left', on='date') - -# Add NYSE Size Breakpoints to the Data -def size_group(row): - if 0<=row['size'] < row['dec20']: - value = 1 - elif row['size'] < row['dec40']: - value=2 - elif row['size'] < row['dec60']: - value=3 - elif row['size'] < row['dec80']: - 
value=4 - elif row['size'] >= row['dec80']: - value=5 - else: - value=np.nan - return value - -sizemom['group']=sizemom.apply(size_group, axis=1) -sizemom['year']=sizemom['date'].dt.year-1 -sizemom=sizemom[['permno','date','year','mom','group','size','ret']] - -# Adjusted BtM from the calendar year preceding the formation date -comp6=comp5[['gvkey','permno','year','bm_adj']] -comp6=pd.merge(comp6, sizemom, how='inner', on=['permno','year']) -comp6=comp6.dropna(subset=['size','mom','bm_adj','ret'], how='any') - -######################### -# Size BM MOM Portfolio # -######################### - -# Start the Triple Sort on Size, Book-to-Market, and Momentum -port1=comp6.sort_values(['date','group','permno']).drop_duplicates() -port1['bmr']=port1.groupby(['date','group'])['bm_adj'].transform(lambda x: pd.qcut(x, 5, labels=False, duplicates='drop')) -port2 = port1.sort_values(['date','group','bmr']) -port2['momr']=port2.groupby(['date','group','bmr'])['mom'].transform(lambda x: pd.qcut(x, 5, labels=False, duplicates='drop')) - -# DGTW_PORT 1 for Bottom Quintile, 5 for Top Quintile -port3=port2 -port3['bmr']=port3['bmr']+1 -port3['momr']=port3['momr']+1 -port3[['group','bmr','momr']]=port3[['group','bmr','momr']].astype(int).astype(str) -port3['dgtw_port']=port3['group']+port3['bmr']+port3['momr'] -port4 = port3[['permno','gvkey','date','size','mom','bm_adj','dgtw_port']] -port4['date']=port4['date']+MonthEnd(0) -port4['jyear']=port4['date'].dt.year -port4=port4.sort_values(['permno','date']) -port4=port4.rename(columns={'date':'formdate', 'size':'sizew'}) -port4=port4[['permno','formdate','jyear','sizew','dgtw_port']] - -crsp_m1= crsp_m[['permno','date','ret']] -crsp_m1['date']=crsp_m1['date']+MonthEnd(0) -crsp_m1['jdate']=crsp_m1['date']+MonthEnd(-6) -crsp_m1['jyear']=crsp_m1['jdate'].dt.year - -crsp_m1 = pd.merge(crsp_m1.drop(['jdate'],axis=1), port4, how='left', on=['permno','jyear']) -crsp_m1 = crsp_m1.dropna(subset=['formdate','sizew','dgtw_port'], how='any') - 
-crsp_m1 = crsp_m1.sort_values(['date','dgtw_port','permno']) - -# function to calculate value weighted return -def wavg(group, avg_name, weight_name): - d = group[avg_name] - w = group[weight_name] - try: - return (d * w).sum() / w.sum() - except ZeroDivisionError: - return np.nan -# Calculate Weighted Average Returns -dgtw_vwret = crsp_m1.groupby(['date','dgtw_port']).apply(wavg, 'ret','sizew') -dgtw_vwret = dgtw_vwret.reset_index().rename(columns={0:'dgtw_vwret'}) - -# Calculate DGTW Excess Return -dgtw_returns = pd.merge(crsp_m1.drop(['sizew'], axis=1), dgtw_vwret, how='left', on =['dgtw_port','date']) -dgtw_returns['dgtw_xret']=dgtw_returns['ret']-dgtw_returns['dgtw_vwret'] -dgtw_returns = dgtw_returns.sort_values(['permno','date']).drop_duplicates() - -### output - -#dgtw_vwret.to_csv('dgtw_-py-vwret.csv') -dgtw_returns.to_csv('dgtw-py-xret.csv') -#crsp_m1.to_csv('dgtw-py-label.csv') diff --git a/py-ff3/ff3.py b/py-ff3/ff3.py deleted file mode 100755 index 528ce54..0000000 --- a/py-ff3/ff3.py +++ /dev/null @@ -1,280 +0,0 @@ -########################################## -# Fama French Factors -# April 2018 -# Qingyi (Freda) Song Drechsler -########################################## - -import pandas as pd -import numpy as np -import datetime as dt -import wrds -import psycopg2 -import matplotlib.pyplot as plt -from dateutil.relativedelta import * -from pandas.tseries.offsets import * -from scipy import stats - -################### -# Connect to WRDS # -################### -conn=wrds.Connection() - -################### -# Compustat Block # -################### -comp = conn.raw_sql(""" - select gvkey, datadate, at, pstkl, txditc, - pstkrv, seq, pstk - from comp.funda - where indfmt='INDL' - and datafmt='STD' - and popsrc='D' - and consol='C' - and datadate >= '01/01/1959' - """) - -comp['datadate']=pd.to_datetime(comp['datadate']) #convert datadate to date fmt -comp['year']=comp['datadate'].dt.year - -# create preferrerd stock 
-comp['ps']=np.where(comp['pstkrv'].isnull(), comp['pstkl'], comp['pstkrv']) -comp['ps']=np.where(comp['ps'].isnull(),comp['pstk'], comp['ps']) -comp['ps']=np.where(comp['ps'].isnull(),0,comp['ps']) - -comp['txditc']=comp['txditc'].fillna(0) - -# create book equity -comp['be']=comp['seq']+comp['txditc']-comp['ps'] -comp['be']=np.where(comp['be']>0, comp['be'], np.nan) - -# number of years in Compustat -comp=comp.sort_values(by=['gvkey','datadate']) -comp['count']=comp.groupby(['gvkey']).cumcount() - -comp=comp[['gvkey','datadate','year','be','count']] - -################### -# CRSP Block # -################### -# sql similar to crspmerge macro -crsp_m = conn.raw_sql(""" - select a.permno, a.permco, a.date, b.shrcd, b.exchcd, - a.ret, a.retx, a.shrout, a.prc - from crsp.msf as a - left join crsp.msenames as b - on a.permno=b.permno - and b.namedt<=a.date - and a.date<=b.nameendt - where a.date between '01/01/1959' and '12/31/2017' - and b.exchcd between 1 and 3 - """) - -# change variable format to int -crsp_m[['permco','permno','shrcd','exchcd']]=crsp_m[['permco','permno','shrcd','exchcd']].astype(int) - -# Line up date to be end of month -crsp_m['date']=pd.to_datetime(crsp_m['date']) -crsp_m['jdate']=crsp_m['date']+MonthEnd(0) - -# add delisting return -dlret = conn.raw_sql(""" - select permno, dlret, dlstdt - from crsp.msedelist - """) -dlret.permno=dlret.permno.astype(int) -dlret['dlstdt']=pd.to_datetime(dlret['dlstdt']) -dlret['jdate']=dlret['dlstdt']+MonthEnd(0) - -crsp = pd.merge(crsp_m, dlret, how='left',on=['permno','jdate']) -crsp['dlret']=crsp['dlret'].fillna(0) -crsp['ret']=crsp['ret'].fillna(0) -crsp['retadj']=(1+crsp['ret'])*(1+crsp['dlret'])-1 -crsp['me']=crsp['prc'].abs()*crsp['shrout'] # calculate market equity -crsp=crsp.drop(['dlret','dlstdt','prc','shrout'], axis=1) -crsp=crsp.sort_values(by=['jdate','permco','me']) - -### Aggregate Market Cap ### -# sum of me across different permno belonging to same permco a given date -crsp_summe = 
crsp.groupby(['jdate','permco'])['me'].sum().reset_index() -# largest mktcap within a permco/date -crsp_maxme = crsp.groupby(['jdate','permco'])['me'].max().reset_index() -# join by jdate/maxme to find the permno -crsp1=pd.merge(crsp, crsp_maxme, how='inner', on=['jdate','permco','me']) -# drop me column and replace with the sum me -crsp1=crsp1.drop(['me'], axis=1) -# join with sum of me to get the correct market cap info -crsp2=pd.merge(crsp1, crsp_summe, how='inner', on=['jdate','permco']) -# sort by permno and date and also drop duplicates -crsp2=crsp2.sort_values(by=['permno','jdate']).drop_duplicates() - -# keep December market cap -crsp2['year']=crsp2['jdate'].dt.year -crsp2['month']=crsp2['jdate'].dt.month -decme=crsp2[crsp2['month']==12] -decme=decme[['permno','date','jdate','me','year']].rename(columns={'me':'dec_me'}) - -### July to June dates -crsp2['ffdate']=crsp2['jdate']+MonthEnd(-6) -crsp2['ffyear']=crsp2['ffdate'].dt.year -crsp2['ffmonth']=crsp2['ffdate'].dt.month -crsp2['1+retx']=1+crsp2['retx'] -crsp2=crsp2.sort_values(by=['permno','date']) - -# cumret by stock -crsp2['cumretx']=crsp2.groupby(['permno','ffyear'])['1+retx'].cumprod() -# lag cumret -crsp2['lcumretx']=crsp2.groupby(['permno'])['cumretx'].shift(1) - -# lag market cap -crsp2['lme']=crsp2.groupby(['permno'])['me'].shift(1) - -# if first permno then use me/(1+retx) to replace the missing value -crsp2['count']=crsp2.groupby(['permno']).cumcount() -crsp2['lme']=np.where(crsp2['count']==0, crsp2['me']/crsp2['1+retx'], crsp2['lme']) - -# baseline me -mebase=crsp2[crsp2['ffmonth']==1][['permno','ffyear', 'lme']].rename(columns={'lme':'mebase'}) - -# merge result back together -crsp3=pd.merge(crsp2, mebase, how='left', on=['permno','ffyear']) -crsp3['wt']=np.where(crsp3['ffmonth']==1, crsp3['lme'], crsp3['mebase']*crsp3['lcumretx']) - -decme['year']=decme['year']+1 -decme=decme[['permno','year','dec_me']] - -# Info as of June -crsp3_jun = crsp3[crsp3['month']==6] - -crsp_jun = 
pd.merge(crsp3_jun, decme, how='inner', on=['permno','year']) -crsp_jun=crsp_jun[['permno','date', 'jdate', 'shrcd','exchcd','retadj','me','wt','cumretx','mebase','lme','dec_me']] -crsp_jun=crsp_jun.sort_values(by=['permno','jdate']).drop_duplicates() - -####################### -# CCM Block # -####################### -ccm=conn.raw_sql(""" - select gvkey, lpermno as permno, linktype, linkprim, - linkdt, linkenddt - from crsp.ccmxpf_linktable - where substr(linktype,1,1)='L' - and (linkprim ='C' or linkprim='P') - """) - -ccm['linkdt']=pd.to_datetime(ccm['linkdt']) -ccm['linkenddt']=pd.to_datetime(ccm['linkenddt']) -# if linkenddt is missing then set to today date -ccm['linkenddt']=ccm['linkenddt'].fillna(pd.to_datetime('today')) - -ccm1=pd.merge(comp[['gvkey','datadate','be', 'count']],ccm,how='left',on=['gvkey']) -ccm1['yearend']=ccm1['datadate']+YearEnd(0) -ccm1['jdate']=ccm1['yearend']+MonthEnd(6) - -# set link date bounds -ccm2=ccm1[(ccm1['jdate']>=ccm1['linkdt'])&(ccm1['jdate']<=ccm1['linkenddt'])] -ccm2=ccm2[['gvkey','permno','datadate','yearend', 'jdate','be', 'count']] - -# link comp and crsp -ccm_jun=pd.merge(crsp_jun, ccm2, how='inner', on=['permno', 'jdate']) -ccm_jun['beme']=ccm_jun['be']*1000/ccm_jun['dec_me'] - -# select NYSE stocks for bucket breakdown -# exchcd = 1 and positive beme and positive me and shrcd in (10,11) and at least 2 years in comp -nyse=ccm_jun[(ccm_jun['exchcd']==1) & (ccm_jun['beme']>0) & (ccm_jun['me']>0) & (ccm_jun['count']>1) & ((ccm_jun['shrcd']==10) | (ccm_jun['shrcd']==11))] -# size breakdown -nyse_sz=nyse.groupby(['jdate'])['me'].median().to_frame().reset_index().rename(columns={'me':'sizemedn'}) -# beme breakdown -nyse_bm=nyse.groupby(['jdate'])['beme'].describe(percentiles=[0.3, 0.7]).reset_index() -nyse_bm=nyse_bm[['jdate','30%','70%']].rename(columns={'30%':'bm30', '70%':'bm70'}) - -nyse_breaks = pd.merge(nyse_sz, nyse_bm, how='inner', on=['jdate']) -# join back size and beme breakdown -ccm1_jun = pd.merge(ccm_jun, 
nyse_breaks, how='left', on=['jdate']) - - -# function to assign sz and bm bucket -def sz_bucket(row): - if row['me']==np.nan: - value='' - elif row['me']<=row['sizemedn']: - value='S' - else: - value='B' - return value - -def bm_bucket(row): - if 0<=row['beme']<=row['bm30']: - value = 'L' - elif row['beme']<=row['bm70']: - value='M' - elif row['beme']>row['bm70']: - value='H' - else: - value='' - return value - -# assign size portfolio -ccm1_jun['szport']=np.where((ccm1_jun['beme']>0)&(ccm1_jun['me']>0)&(ccm1_jun['count']>=1), ccm1_jun.apply(sz_bucket, axis=1), '') -# assign book-to-market portfolio -ccm1_jun['bmport']=np.where((ccm1_jun['beme']>0)&(ccm1_jun['me']>0)&(ccm1_jun['count']>=1), ccm1_jun.apply(bm_bucket, axis=1), '') -# create positivebmeme and nonmissport variable -ccm1_jun['posbm']=np.where((ccm1_jun['beme']>0)&(ccm1_jun['me']>0)&(ccm1_jun['count']>=1), 1, 0) -ccm1_jun['nonmissport']=np.where((ccm1_jun['bmport']!=''), 1, 0) - -# store portfolio assignment as of June -june=ccm1_jun[['permno','date', 'jdate', 'bmport','szport','posbm','nonmissport']] -june['ffyear']=june['jdate'].dt.year - -# merge back with monthly records -crsp3 = crsp3[['date','permno','shrcd','exchcd','retadj','me','wt','cumretx','ffyear','jdate']] -ccm3=pd.merge(crsp3, - june[['permno','ffyear','szport','bmport','posbm','nonmissport']], how='left', on=['permno','ffyear']) - -# keeping only records that meet the criteria -ccm4=ccm3[(ccm3['wt']>0)& (ccm3['posbm']==1) & (ccm3['nonmissport']==1) & - ((ccm3['shrcd']==10) | (ccm3['shrcd']==11))] - -############################ -# Form Fama French Factors # -############################ - -# function to calculate value weighted return -def wavg(group, avg_name, weight_name): - d = group[avg_name] - w = group[weight_name] - try: - return (d * w).sum() / w.sum() - except ZeroDivisionError: - return np.nan - -# value-weigthed return -vwret=ccm4.groupby(['jdate','szport','bmport']).apply(wavg, 
'retadj','wt').to_frame().reset_index().rename(columns={0: 'vwret'}) -vwret['sbport']=vwret['szport']+vwret['bmport'] - -# firm count -vwret_n=ccm4.groupby(['jdate','szport','bmport'])['retadj'].count().reset_index().rename(columns={'retadj':'n_firms'}) -vwret_n['sbport']=vwret_n['szport']+vwret_n['bmport'] - -# tranpose -ff_factors=vwret.pivot(index='jdate', columns='sbport', values='vwret').reset_index() -ff_nfirms=vwret_n.pivot(index='jdate', columns='sbport', values='n_firms').reset_index() - -# create SMB and HML factors -ff_factors['WH']=(ff_factors['BH']+ff_factors['SH'])/2 -ff_factors['WL']=(ff_factors['BL']+ff_factors['SL'])/2 -ff_factors['WHML'] = ff_factors['WH']-ff_factors['WL'] - -ff_factors['WB']=(ff_factors['BL']+ff_factors['BM']+ff_factors['BH'])/3 -ff_factors['WS']=(ff_factors['SL']+ff_factors['SM']+ff_factors['SH'])/3 -ff_factors['WSMB'] = ff_factors['WS']-ff_factors['WB'] -ff_factors=ff_factors.rename(columns={'jdate':'date'}) - -# n firm count -ff_nfirms['H']=ff_nfirms['SH']+ff_nfirms['BH'] -ff_nfirms['L']=ff_nfirms['SL']+ff_nfirms['BL'] -ff_nfirms['HML']=ff_nfirms['H']+ff_nfirms['L'] - -ff_nfirms['B']=ff_nfirms['BL']+ff_nfirms['BM']+ff_nfirms['BH'] -ff_nfirms['S']=ff_nfirms['SL']+ff_nfirms['SM']+ff_nfirms['SH'] -ff_nfirms['SMB']=ff_nfirms['B']+ff_nfirms['S'] -ff_nfirms['TOTAL']=ff_nfirms['SMB'] -ff_nfirms=ff_nfirms.rename(columns={'jdate':'date'}) diff --git a/py-pead/pead.py b/py-pead/pead.py deleted file mode 100755 index 53ce1c7..0000000 --- a/py-pead/pead.py +++ /dev/null @@ -1,538 +0,0 @@ - -##################################### -# Post Earnings Announcement Drift # -# June 2019 # -# Qingyi (Freda) Song Drechsler # -##################################### - -import pandas as pd -import numpy as np -import wrds -import matplotlib.pyplot as plt -import pickle as pkl -from dateutil.relativedelta import * - -################### -# Connect to WRDS # -################### -conn=wrds.Connection() - -# set sample date range -begdate = '01/01/2010' 
-enddate = '12/31/2018' - -# set CRSP date range a bit wider to guarantee collecting all information -crsp_begdate = '01/01/2009' -crsp_enddate = '12/31/2019' - -################################# -# Step 0: Read in ICLINK output # -################################# - -# iclink.pkl is the output from the python program iclink -# it contains the linking between crsp and ibes -with open('iclink.pkl', 'rb') as f: - iclink = pkl.load(f) - -################################## -# Step 1. S&P 500 Index Universe # -################################## - -# All companies that were ever included in S&P 500 index as an example -# Linking Compustat GVKEY and IBES Tickers using ICLINK -# For unmatched GVKEYs, use header IBTIC link in Compustat Security file - -_sp500 = conn.raw_sql(""" select gvkey from comp.idxcst_his where gvkeyx='000003' """) - -_ccm = conn.raw_sql(""" select gvkey, lpermco as permco, lpermno as permno, linkdt, linkenddt - from crsp.ccmxpf_linktable - where usedflag=1 and linkprim in ('P', 'C')""") - -_ccm[['permco', 'permno']] = _ccm[['permco', 'permno']].astype(int) -_ccm['linkdt'] = pd.to_datetime(_ccm['linkdt']) -_ccm['linkenddt'] = pd.to_datetime(_ccm['linkenddt']) - -_sec = conn.raw_sql(""" select ibtic, gvkey from comp.security """) - - -import datetime -today = datetime.date.today() - -# Fill linkenddt missing value (.E in SAS dataset) with today's date -_ccm['linkenddt'] = _ccm.linkenddt.fillna(today) - -# Start the sequence of left join -gvkey = pd.merge(_sp500, _ccm, how='left', on=['gvkey']) -gvkey = pd.merge(gvkey, _sec.loc[_sec.ibtic.notna()], how='left', on=['gvkey']) - -# high quality links from iclink -# score = 0 or 1 -iclink_hq = iclink.loc[(iclink.score <=1)] - -gvkey = pd.merge(gvkey, iclink_hq, how='left', on=['permno']) - -# fill missing ticker with ibtic -gvkey.ticker = np.where(gvkey.ticker.notnull(), gvkey.ticker, gvkey.ibtic) - -# Keep relevant columns and drop duplicates if there is any -gvkey = gvkey[['gvkey', 'permco', 'permno', 
'linkdt', 'linkenddt','ticker']] - -gvkey = gvkey.drop_duplicates() - -# date ranges from gvkey - -# min linkdt for ticker and permno combination -gvkey_mindt = gvkey.groupby(['ticker','permno']).linkdt.min().reset_index() - -# max linkenddt for ticker and permno combination -gvkey_maxdt = gvkey.groupby(['ticker','permno']).linkenddt.max().reset_index() - -# link date range -gvkey_dt = pd.merge(gvkey_mindt, gvkey_maxdt, how='inner', on=['ticker','permno']) - -####################################### -# Step 2. Extract Estimates from IBES # -####################################### - -# Extract estimates from IBES Unadjusted file and select -# the latest estimate for a firm within broker-analyst group -# "fpi in (6,7)" selects quarterly forecast for the current -# and the next fiscal quarter - -ibes_temp = conn.raw_sql(f""" - select ticker, estimator, analys, pdf, fpi, value, fpedats, revdats, revtims, anndats, anntims - from ibes.detu_epsus - where fpedats between '{begdate}' and '{enddate}' - and (fpi='6' or fpi='7') - """, date_cols = ['revdats', 'anndats', 'fpedats']) - -# merge to get date range linkdt and linkenddt to fulfill date requirement -ibes_temp = pd.merge(ibes_temp, gvkey_dt, how='left', on=['ticker']) -ibes_temp=ibes_temp.loc[(ibes_temp.linkdt<=ibes_temp.anndats) & (ibes_temp.anndats <= ibes_temp.linkenddt)] - -# Count number of estimates reported on primary/diluted basis - -p_sub = ibes_temp[['ticker','fpedats','pdf']].loc[ibes_temp.pdf=='P'] -d_sub = ibes_temp[['ticker','fpedats','pdf']].loc[ibes_temp.pdf=='D'] - -p_count = p_sub.groupby(['ticker','fpedats']).pdf.count().reset_index().rename(columns={'pdf':'p_count'}) -d_count = d_sub.groupby(['ticker','fpedats']).pdf.count().reset_index().rename(columns={'pdf':'d_count'}) - -ibes = pd.merge(ibes_temp, d_count, how = 'left', on=['ticker', 'fpedats']) -ibes = pd.merge(ibes, p_count, how='left', on =['ticker','fpedats']) -ibes['d_count'] = ibes.d_count.fillna(0) -ibes['p_count'] = 
ibes.p_count.fillna(0) - -# Determine whether most analysts report estimates on primary/diluted basis -# following Livnat and Mendenhall (2006) - -ibes['basis']=np.where(ibes.p_count>ibes.d_count, 'P', 'D') - -ibes = ibes.sort_values(by=['ticker','fpedats','estimator','analys','anndats', 'anntims', 'revdats', 'revtims'])\ -.drop(['linkdt', 'linkenddt','p_count','d_count', 'pdf', 'fpi'], axis=1) - -# Keep the latest observation for a given analyst -# Group by company fpedats estimator analys then pick the last record in the group - -ibes_1 = ibes.groupby(['ticker','fpedats','estimator','analys']).apply(lambda x: x.index[-1]).to_frame().reset_index() - -# reset index to the old dataframe index for join in the next step -ibes_1=ibes_1.set_index(0) - -# Inner join with the last analyst record per group -ibes = pd.merge(ibes, ibes_1[['analys']], left_index=True, right_index=True) - -# drop duplicate column -ibes=ibes.drop(['analys_y'], axis=1).rename(columns={'analys_x': 'analys'}) - -####################################### -# Step 3. 
Link Estimates with Actuals # -####################################### - -# Link Unadjusted estimates with Unadjusted actuals and CRSP permnos -# Keep only the estimates issued within 90 days before the report date - -# Getting actual piece of data -ibes_act = conn.raw_sql(f""" - select ticker, anndats as repdats, value as act, pends as fpedats, pdicity - from ibes.actu_epsus - where pends between '{begdate}' and '{enddate}' - and pdicity='QTR' - """, date_cols = ['repdats', 'fpedats']) - -# Join with the estimate piece of the data - -ibes1 = pd.merge(ibes, ibes_act, how='left', on = ['ticker','fpedats']) -ibes1['dgap'] = ibes1.repdats - ibes1.anndats - -ibes1['flag'] = np.where( (ibes1.dgap>=datetime.timedelta(days=0)) & (ibes1.dgap<=datetime.timedelta(days=90)) & (ibes1.repdats.notna()) & (ibes1.anndats.notna()), 1, 0) - -ibes1 = ibes1.loc[ibes1.flag==1].drop(['flag', 'dgap', 'pdicity'], axis=1) - - -# Select all relevant combinations of Permnos and Date - -ibes1_dt1 = ibes1[['permno', 'anndats']].drop_duplicates() - -ibes1_dt2 = ibes1[['permno', 'repdats']].drop_duplicates().rename(columns={'repdats':'anndats'}) - -ibes_anndats = pd.concat([ibes1_dt1, ibes1_dt2]).drop_duplicates() - -# Adjust all estimate and earnings announcement dates to the closest -# preceding trading date in CRSP to ensure that adjustment factors won't -# be missing after the merge - -# unique anndats from ibes -uniq_anndats = ibes_anndats[['anndats']].drop_duplicates() - -# unique trade dates from crsp.dsi -crsp_dats = conn.raw_sql(""" - select date - from crsp.dsi - """, date_cols=['date']) - -# Create up to 5 days prior dates relative to anndats - -for i in range(0, 5): - uniq_anndats[i] = uniq_anndats.anndats - datetime.timedelta(days=i) - -# reshape (transpose) the df for later join with crsp trading dates - -expand_anndats = uniq_anndats.set_index('anndats').stack().reset_index().\ -rename(columns={'level_1':'prior', 0:'prior_date'}) - -# merge with crsp trading dates -tradedates = 
pd.merge(expand_anndats, crsp_dats, how='left', left_on=['prior_date'], right_on=['date']) - -# create the dgap (days gap) variable for min selection -tradedates['dgap'] = tradedates.anndats-tradedates.date - -# choosing the row with the smallest dgap for a given anndats -tradedates = tradedates.loc[tradedates.groupby('anndats')['dgap'].idxmin()] - -tradedates = tradedates[['anndats', 'date']] - - -# merge the CRSP adjustment factors for all estimate and report dates - -# extract CRSP adjustment factors -cfacshr = conn.raw_sql(f""" - select permno, date, cfacshr - from crsp.dsf - where date between '{crsp_begdate}' and '{crsp_enddate}' - """, date_cols = ['date']) - -ibes_anndats = pd.merge(ibes_anndats, tradedates, how='left', on = ['anndats']) - -ibes_anndats = pd.merge(ibes_anndats, cfacshr, how='left', on=['permno', 'date']) - - -######################################### -# Step 4. Adjust Estimates with CFACSHR # -######################################### - -# Put the estimate on the same per share basis as -# company reported EPS using CRSP Adjustment factors. -# New_value is the estimate adjusted to be on the -# same basis with reported earnings. 
- -ibes1 = pd.merge(ibes1, ibes_anndats, how='inner', on=['permno', 'anndats']) -ibes1 = ibes1.drop(['anndats','date'], axis=1).rename(columns={'cfacshr':'cfacshr_ann'}) - -ibes1 = pd.merge(ibes1, ibes_anndats, how='inner', left_on=['permno', 'repdats'], right_on=['permno','anndats']) -ibes1 = ibes1.drop(['anndats','date'], axis=1).rename(columns={'cfacshr':'cfacshr_rep'}) - -ibes1['new_value'] = (ibes1.cfacshr_rep/ibes1.cfacshr_ann)*ibes1.value - -# Sanity check: there should be one most recent estimate for -# a given firm-fiscal period end combination -ibes1 = ibes1.sort_values(by=['ticker','fpedats','estimator','analys']).drop_duplicates() - -# Compute the median forecast based on estimates in the 90 days prior to the EAD - -grp_permno = ibes1.groupby(['ticker','fpedats', 'basis','repdats', 'act']).permno.max().reset_index() - -medest = ibes1.groupby(['ticker','fpedats', 'basis','repdats', 'act']).new_value.agg(['median','count']).reset_index() -medest = pd.merge(medest, grp_permno, how='inner', on=['ticker','fpedats','basis', 'repdats', 'act']) -medest = medest.rename(columns={'median': 'medest', 'count':'numest'}) - - -###################################### -# Step 5. 
Merge with Compustat Data # -###################################### - -# get items from fundq -fundq = conn.raw_sql(f""" - select gvkey, fyearq, fqtr, conm, datadate, rdq, epsfxq, epspxq, cshoq, prccq, - ajexq, spiq, cshoq, cshprq, cshfdq, saleq, atq, fyr, datafqtr, cshoq*prccq as mcap - from comp.fundq - where consol='C' and popsrc='D' and indfmt='INDL' and datafmt='STD' - and datadate between '{crsp_begdate}' and '{crsp_enddate}' - """, date_cols = ['datadate', 'datafqtr', 'rdq']) - -fundq = fundq.loc[((fundq.atq>0) | (fundq.saleq.notna())) & (fundq.datafqtr.notna())] - -# Calculate link date ranges for givken gvkey and ticker combination - -gvkey_mindt1 = gvkey.groupby(['gvkey', 'ticker']).linkdt.min().reset_index().rename(columns={'linkdt':'mindate'}) -gvkey_maxdt1 = gvkey.groupby(['gvkey', 'ticker']).linkenddt.max().reset_index().rename(columns={'linkenddt':'maxdate'}) -gvkey_dt1 = pd.merge(gvkey_mindt1, gvkey_maxdt1, how='inner', on=['gvkey','ticker']) - - -# Use the date range to merge -comp = pd.merge(fundq, gvkey_dt1, how='left', on =['gvkey']) -comp = comp.loc[(comp.ticker.notna()) & (comp.datadate<=comp.maxdate) & (comp.datadate>=comp.mindate)] - -# Merge with the median esitmates -comp = pd.merge(comp, medest, how = 'left', left_on=['ticker','datadate'], right_on=['ticker', 'fpedats']) - -# Sort data and drop duplicates -comp = comp.sort_values(by=['gvkey','fqtr','fyearq']).drop_duplicates() - - -########################### -# Step 6. 
Calculate SUEs # -########################### - -# block handling lag eps - -sue = comp.sort_values(by=['gvkey','fqtr','fyearq']) - -sue['dif_fyearq'] = sue.groupby(['gvkey', 'fqtr']).fyearq.diff() -sue['laggvkey'] = sue.gvkey.shift(1) - -# handling same qtr previous year - -cond_year = sue.dif_fyearq==1 # year increment is 1 - -sue['lagadj'] = np.where(cond_year, sue.ajexq.shift(1), None) -sue['lageps_p'] = np.where(cond_year, sue.epspxq.shift(1), None) -sue['lageps_d'] = np.where(cond_year, sue.epsfxq.shift(1), None) -sue['lagshr_p'] = np.where(cond_year, sue.cshprq.shift(1), None) -sue['lagshr_d'] = np.where(cond_year, sue.cshfdq.shift(1), None) -sue['lagspiq'] = np.where(cond_year, sue.spiq.shift(1), None) - -# handling first gvkey - -cond_gvkey = sue.gvkey != sue.laggvkey # first.gvkey - -sue['lagadj'] = np.where(cond_gvkey, None, sue.lagadj) -sue['lageps_p'] = np.where(cond_gvkey, None, sue.lageps_p) -sue['lageps_d'] = np.where(cond_gvkey, None, sue.lageps_d) -sue['lagshr_p'] = np.where(cond_gvkey, None, sue.lagshr_p) -sue['lagshr_d'] = np.where(cond_gvkey, None, sue.lagshr_d) -sue['lagspiq'] = np.where(cond_gvkey, None, sue.lagspiq) - - -# handling reporting basis - -# Basis = P and missing are treated the same - -sue['actual1'] = np.where(sue.basis=='D', sue.epsfxq/sue.ajexq, sue.epspxq/sue.ajexq) - -sue['actual2'] = np.where(sue.basis=='D', \ - (sue.epsfxq.fillna(0)-(0.65*sue.spiq/sue.cshfdq).fillna(0))/sue.ajexq, \ - (sue.epspxq.fillna(0)-(0.65*sue.spiq/sue.cshprq).fillna(0))/sue.ajexq - ) - -sue['expected1'] = np.where(sue.basis=='D', sue.lageps_d/sue.lagadj, sue.lageps_p/sue.lagadj) -sue['expected2'] = np.where(sue.basis=='D', \ - (sue.lageps_d.fillna(0)-(0.65*sue.lagspiq/sue.lagshr_d).fillna(0))/sue.lagadj, \ - (sue.lageps_p.fillna(0)-(0.65*sue.lagspiq/sue.lagshr_p).fillna(0))/sue.lagadj - ) - -# SUE calculations -sue['sue1'] = (sue.actual1 - sue.expected1) / (sue.prccq/sue.ajexq) -sue['sue2'] = (sue.actual2 - sue.expected2) / (sue.prccq/sue.ajexq) 
-sue['sue3'] = (sue.act - sue.medest) / sue.prccq - -sue = sue[['ticker','permno','gvkey','conm','fyearq','fqtr','fyr','datadate','repdats','rdq', \ - 'sue1','sue2','sue3','basis','act','medest','numest','prccq','mcap']] - - -# Shifting the announcement date to be the next trading day -# Defining the day after the following quarterly EA as leadrdq1 - -# unique rdq -uniq_rdq = comp[['rdq']].drop_duplicates() -uniq_rdq.shape - -# Create up to 5 days post rdq relative to rdq -for i in range(0, 5): - uniq_rdq[i] = uniq_rdq.rdq + datetime.timedelta(days=i) - -# reshape (transpose) for later join with crsp trading dates -expand_rdq = uniq_rdq.set_index('rdq').stack().reset_index().\ -rename(columns={'level_1':'post', 0:'post_date'}) - -# merge with crsp trading dates -eads1 = pd.merge(expand_rdq, crsp_dats, how='left', left_on=['post_date'], right_on=['date']) - -# create the dgap (days gap) variable for min selection -eads1['dgap'] = eads1.date-eads1.rdq -eads1 = eads1.loc[eads1.groupby('rdq')['dgap'].idxmin()].rename(columns={'date':'rdq1'}) - -# create sue_final -sue_final = pd.merge(sue, eads1[['rdq','rdq1']], how='left', on=['rdq']) -sue_final = sue_final.sort_values(by=['gvkey', 'fyearq','fqtr'], ascending=[True, False, False]).drop_duplicates() - -# Filter from Livnat & Mendenhall (2006): -#- earnings announcement date is reported in Compustat -#- the price per share is available from Compustat at fiscal quarter end -#- price is greater than $1 -#- the market (book) equity at fiscal quarter end is available and is -# EADs in Compustat and in IBES (if available)should not differ by more -# than one calendar day larger than $5 mil. 
- -sue_final['leadrdq1'] = sue_final.rdq1.shift(1) # next consecutive EAD -sue_final['leadgvkey'] = sue_final.gvkey.shift(1) - -# If first gvkey then leadrdq1 = rdq1+3 months -# Else leadrdq1 = previous rdq1 - -sue_final['leadrdq1'] = np.where(sue_final.gvkey == sue_final.leadgvkey, - sue_final.rdq1.shift(1), - sue_final.rdq1 + pd.offsets.MonthOffset(3)) - -sue_final['dgap'] = (sue_final.repdats - sue_final.rdq).fillna(0) -sue_final = sue_final.loc[(sue_final.rdq1 != sue_final.leadrdq1)] - -# Various conditioning for filtering -cond1 = (sue_final.sue1.notna()) & (sue_final.sue2.notna()) & (sue_final.repdats.isna()) -cond2 = (sue_final.repdats.notna()) & (sue_final.dgap<=datetime.timedelta(days=1)) & (sue_final.dgap>=datetime.timedelta(days=-1)) -sue_final = sue_final.loc[cond1 | cond2] - -# Impose restriction on price and marketcap -sue_final = sue_final.loc[(sue_final.rdq.notna()) & (sue_final.prccq>1) & (sue_final.mcap>5)] - -# Keep relevant columns -sue_final = sue_final[['gvkey', 'ticker','permno','conm',\ - 'fyearq','fqtr','datadate','fyr','rdq','rdq1','leadrdq1','repdats',\ - 'mcap','medest','act','numest','basis','sue1','sue2','sue3']] - - -######################################### -# Step 7. 
Form Portfolios Based on SUE # -######################################### - -# Extract file of raw daily returns around and between EADs and link them -# to Standardized Earnings Surprises for forming SUE-based portfolios - -# Records from dsf and dsi to calculate exret -dsf = conn.raw_sql(f""" - select permno, date, prc, abs(prc*shrout) as mcap, ret from crsp.dsf - where date between '{crsp_begdate}' and '{crsp_enddate}' - """, date_cols = ['date']) - -dsi = conn.raw_sql(f""" - select date, vwretd from crsp.dsi where date between '{crsp_begdate}' and '{crsp_enddate}' - """, date_cols=['date']) - -ds = pd.merge(dsf, dsi, how='left', on=['date']) -ds['exret'] = ds.ret - ds.vwretd -ds = ds.rename(columns={'vwretd':'mkt'}) - -# Records from sue_final that meet the condition -sue_final_join = sue_final.loc[(sue_final.rdq.notna()) & (sue_final.leadrdq1.notna()) & (sue_final.permno.notna()) \ - & (sue_final.leadrdq1-sue_final.rdq1>datetime.timedelta(days=30))] - -sue_final_join['lb_date'] = sue_final_join.rdq1-datetime.timedelta(days=5) -sue_final_join['ub_date'] = sue_final_join.leadrdq1+datetime.timedelta(days=5) - - -# left join ds with sue_final on permno first -# filter in the second step based on date range requirement -crsprets = pd.merge(ds, sue_final_join[['permno','rdq1', 'leadrdq1','sue1','sue2','sue3', 'lb_date','ub_date']], how='left', on=['permno']) - -# keep only records that meet the date range requirement -crsprets = crsprets.loc[(crsprets.date<=crsprets.ub_date) & (crsprets.date>=crsprets.lb_date)] -crsprets = crsprets.drop(['lb_date','ub_date'], axis=1) - - -# Alternative sql version to handle the join step of crsp return and sue_final -# Warning: sql runs very slow on python - -#import sqlite3 - -#sqlconn = sqlite3.connect(':memory') - -#sue_final_join.to_sql('sue_final_join_sql', sqlconn, index=False) -#ds.to_sql('ds_sql', sqlconn, index=False) - -#qry_stmt = """ -# select a.*, b.rdq1, b.leadrdq1, b.sue1, b.sue2, b.sue3 -# from ds_sql as a -# left 
join sue_final_join_sql as b -# on a.permno=b.permno and b.lb_date<=a.date<=b.ub_date -# """ - -#crsprets = pd.read_sql_query(qry_stmt, sqlconn) - -# To estimate the drift, sum daily returns over the period from -# 1 day after the earnings announcement through the day of -# the following quarterly earnings announcement - -temp = crsprets.sort_values(by=['permno', 'rdq1', 'date']) -temp['lpermno'] = temp.permno.shift(1) - -# If first permno then lagmcap = missing -# Else lagmcap = lag(mcap) -temp['lagmcap'] = np.where(temp.permno == temp.lpermno, - temp.mcap.shift(1), - None) - -temp = temp.loc[(temp.rdq1<=temp.date) & (temp.date<=temp.leadrdq1)] - -# create count variable within the group -temp['ncount'] = temp.groupby(['permno','rdq1']).cumcount() - -# Form quintiles based on SUE -peadrets = temp.sort_values(by=['ncount','permno','rdq1']).drop_duplicates() - -peadrets['sue1r']=peadrets.groupby('ncount')['sue1'].transform(lambda x: pd.qcut(x, 5, labels=False, duplicates='drop')) -peadrets['sue2r']=peadrets.groupby('ncount')['sue2'].transform(lambda x: pd.qcut(x, 5, labels=False, duplicates='drop')) -peadrets['sue3r']=peadrets.groupby('ncount')['sue3'].transform(lambda x: pd.qcut(x, 5, labels=False, duplicates='drop')) - -# Form portfolios on Compustat-based SUEs (=sue1 or =sue2) or IBES-based SUE (=sue3) -# Code uses sue3 - -peadrets3 = peadrets.loc[peadrets.sue3r.notna()].sort_values(by=['ncount', 'sue3']) -peadrets3['sue3r'] = peadrets3['sue3r'].astype(int) - -# Form value-weighted exret -# Calculate group weight sum; -grp_lagmcap = peadrets3.groupby(['ncount','sue3r']).lagmcap.sum().reset_index().rename(columns={'lagmcap':'total_lagmcap'}) - -# join group weight sum back to the df -peadrets3 = pd.merge(peadrets3, grp_lagmcap, how='left', on=['ncount','sue3r']) - -# vw exret -peadrets3['wt_exret'] = peadrets3.exret * peadrets3.lagmcap/peadrets3.total_lagmcap -peadsue3port = peadrets3.groupby(['ncount', 'sue3r']).wt_exret.sum().reset_index() - - -# set ncount=0 
all five portfolio weighted returns to be 0 -peadsue3port['wt_exret'] = np.where(peadsue3port.ncount==0, 0, peadsue3port.wt_exret) - -# transpose table for cumulative return calculation -peadsue3port = peadsue3port.pivot_table(index=['ncount'], columns='sue3r') - -# reset column index level -peadsue3port.columns = [col[1] for col in peadsue3port.columns] -peadsue3port = peadsue3port.reset_index() - -# keep only first 50 days after EADs -peadsue3port = peadsue3port.loc[peadsue3port.ncount<=50] - -# Cumulating Excess Returns - -peadsue3port['sueport1'] = peadsue3port[0].cumsum() -peadsue3port['sueport2'] = peadsue3port[1].cumsum() -peadsue3port['sueport3'] = peadsue3port[2].cumsum() -peadsue3port['sueport4'] = peadsue3port[3].cumsum() -peadsue3port['sueport5'] = peadsue3port[4].cumsum() - - -################### -# End of Program # -################### diff --git a/pychars/.DS_Store b/pychars/.DS_Store deleted file mode 100755 index 601b96deec00e82308fcdcaa772c3d454f88bc9e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKyG{c!5FA4!$ODuHL3t$=5~8FzAt(RU^U<#N5GX-RS$XEpPfVD^6I_UHYKy0yDjdl4@5Ka~_4_JF-56yWh(NkTxVmMD{ zdlusIfVD?YhYObv7c#qWLvfm&<7XNUmwPmC3YY?46= '01/01/1959' - """) - -# convert datadate to date fmt -comp['datadate'] = pd.to_datetime(comp['datadate']) - -# sort and clean up -comp = comp.sort_values(by=['gvkey', 'datadate']).drop_duplicates() - -# clean up csho -comp['csho'] = np.where(comp['csho'] == 0, np.nan, comp['csho']) - -# calculate Compustat market equity -comp['mve_f'] = comp['csho'] * comp['prcc_f'] - -# do some clean up. 
several variables have lots of missing values -condlist = [comp['drc'].notna() & comp['drlt'].notna(), - comp['drc'].notna() & comp['drlt'].isnull(), - comp['drlt'].notna() & comp['drc'].isnull()] -choicelist = [comp['drc']+comp['drlt'], - comp['drc'], - comp['drlt']] -comp['dr'] = np.select(condlist, choicelist, default=np.nan) - -condlist = [comp['dcvt'].isnull() & comp['dcpstk'].notna() & comp['pstk'].notna() & comp['dcpstk'] > comp['pstk'], - comp['dcvt'].isnull() & comp['dcpstk'].notna() & comp['pstk'].isnull()] -choicelist = [comp['dcpstk']-comp['pstk'], - comp['dcpstk']] -comp['dc'] = np.select(condlist, choicelist, default=np.nan) -comp['dc'] = np.where(comp['dc'].isnull(), comp['dcvt'], comp['dc']) - -comp['xint0'] = np.where(comp['xint'].isnull(), 0, comp['xint']) -comp['xsga0'] = np.where(comp['xsga'].isnull, 0, 0) - -comp['ceq'] = np.where(comp['ceq'] == 0, np.nan, comp['ceq']) -comp['at'] = np.where(comp['at'] == 0, np.nan, comp['at']) -comp = comp.dropna(subset=['at']) - -####################################################################################################################### -# CRSP Block # -####################################################################################################################### -# Create a CRSP Subsample with Monthly Stock and Event Variables -# Restrictions will be applied later -# Select variables from the CRSP monthly stock and event datasets -crsp = conn.raw_sql(""" - select a.prc, a.ret, a.retx, a.shrout, a.vol, a.cfacpr, a.cfacshr, a.date, a.permno, a.permco, - b.ticker, b.ncusip, b.shrcd, b.exchcd - from crsp.msf as a - left join crsp.msenames as b - on a.permno=b.permno - and b.namedt<=a.date - and a.date<=b.nameendt - where a.date >= '01/01/1959' - and b.exchcd between 1 and 3 - """) - -# change variable format to int -crsp[['permco', 'permno', 'shrcd', 'exchcd']] = crsp[['permco', 'permno', 'shrcd', 'exchcd']].astype(int) - -# Line up date to be end of month -crsp['date'] = 
pd.to_datetime(crsp['date']) -crsp['monthend'] = crsp['date'] + MonthEnd(0) # set all the date to the standard end date of month - -crsp = crsp.dropna(subset=['prc']) -crsp['me'] = crsp['prc'].abs() * crsp['shrout'] # calculate market equity - -# if Market Equity is Nan then let return equals to 0 -crsp['ret'] = np.where(crsp['me'].isnull(), 0, crsp['ret']) -crsp['retx'] = np.where(crsp['me'].isnull(), 0, crsp['retx']) - -# impute me -crsp = crsp.sort_values(by=['permno', 'date']).drop_duplicates() -crsp['me'] = np.where(crsp['permno'] == crsp['permno'].shift(1), crsp['me'].fillna(method='ffill'), crsp['me']) - -# Aggregate Market Cap -''' -There are cases when the same firm (permco) has two or more securities (permno) at same date. -For the purpose of ME for the firm, we aggregated all ME for a given permco, date. -This aggregated ME will be assigned to the permno with the largest ME. -''' -# sum of me across different permno belonging to same permco a given date -crsp_summe = crsp.groupby(['monthend', 'permco'])['me'].sum().reset_index() -# largest mktcap within a permco/date -crsp_maxme = crsp.groupby(['monthend', 'permco'])['me'].max().reset_index() -# join by monthend/maxme to find the permno -crsp1 = pd.merge(crsp, crsp_maxme, how='inner', on=['monthend', 'permco', 'me']) -# drop me column and replace with the sum me -crsp1 = crsp1.drop(['me'], axis=1) -# join with sum of me to get the correct market cap info -crsp2 = pd.merge(crsp1, crsp_summe, how='inner', on=['monthend', 'permco']) -# sort by permno and date and also drop duplicates -crsp2 = crsp2.sort_values(by=['permno', 'monthend']).drop_duplicates() - -####################################################################################################################### -# CCM Block # -####################################################################################################################### -# merge CRSP and Compustat -# reference: 
https://wrds-www.wharton.upenn.edu/pages/support/applications/linking-databases/linking-crsp-and-compustat/ -ccm = conn.raw_sql(""" - select gvkey, lpermno as permno, linktype, linkprim, - linkdt, linkenddt - from crsp.ccmxpf_linktable - where substr(linktype,1,1)='L' - and (linkprim ='C' or linkprim='P') - """) - -ccm['linkdt'] = pd.to_datetime(ccm['linkdt']) -ccm['linkenddt'] = pd.to_datetime(ccm['linkenddt']) - -# if linkenddt is missing then set to today date -ccm['linkenddt'] = ccm['linkenddt'].fillna(pd.to_datetime('today')) - -# merge ccm and comp -ccm1 = pd.merge(comp, ccm, how='left', on=['gvkey']) - -# we can only get the accounting data after the firm public their report -# for annual data, we ues 6 months lagged data -ccm1['yearend'] = ccm1['datadate'] + YearEnd(0) -ccm1['jdate'] = ccm1['yearend'] + MonthEnd(6) - -# set link date bounds -ccm2 = ccm1[(ccm1['jdate'] >= ccm1['linkdt']) & (ccm1['jdate'] <= ccm1['linkenddt'])] - -# link comp and crsp -crsp2 = crsp2.rename(columns={'monthend': 'jdate'}) -data_rawa = pd.merge(crsp2, ccm2, how='inner', on=['permno', 'jdate']) - -# filter exchcd & shrcd -data_rawa = data_rawa[((data_rawa['exchcd'] == 1) | (data_rawa['exchcd'] == 2) | (data_rawa['exchcd'] == 3)) & - ((data_rawa['shrcd'] == 10) | (data_rawa['shrcd'] == 11))] - -# process Market Equity -''' -Note: me is CRSP market equity, mve_f is Compustat market equity. Please choose the me below. 
-''' -data_rawa['me'] = data_rawa['me']/1000 # CRSP ME -# data_rawa['me'] = data_rawa['mve_f'] # Compustat ME - -# there are some ME equal to zero since this company do not have price or shares data, we drop these observations -data_rawa['me'] = np.where(data_rawa['me'] == 0, np.nan, data_rawa['me']) -data_rawa = data_rawa.dropna(subset=['me']) - -# count single stock years -# data_rawa['count'] = data_rawa.groupby(['gvkey']).cumcount() - -# deal with the duplicates -data_rawa.loc[data_rawa.groupby(['datadate', 'permno', 'linkprim'], as_index=False).nth([0]).index, 'temp'] = 1 -data_rawa = data_rawa[data_rawa['temp'].notna()] -data_rawa.loc[data_rawa.groupby(['permno', 'yearend', 'datadate'], as_index=False).nth([-1]).index, 'temp'] = 1 -data_rawa = data_rawa[data_rawa['temp'].notna()] - -data_rawa = data_rawa.sort_values(by=['permno', 'jdate']) - -####################################################################################################################### -# Annual Variables # -####################################################################################################################### -# preferrerd stock -data_rawa['ps'] = np.where(data_rawa['pstkrv'].isnull(), data_rawa['pstkl'], data_rawa['pstkrv']) -data_rawa['ps'] = np.where(data_rawa['ps'].isnull(), data_rawa['pstk'], data_rawa['ps']) -data_rawa['ps'] = np.where(data_rawa['ps'].isnull(), 0, data_rawa['ps']) - -data_rawa['txditc'] = data_rawa['txditc'].fillna(0) - -# book equity -data_rawa['be'] = data_rawa['seq'] + data_rawa['txditc'] - data_rawa['ps'] -data_rawa['be'] = np.where(data_rawa['be'] > 0, data_rawa['be'], np.nan) - -# acc -data_rawa['act_l1'] = data_rawa.groupby(['permno'])['act'].shift(1) -data_rawa['lct_l1'] = data_rawa.groupby(['permno'])['lct'].shift(1) - -condlist = [data_rawa['np'].isnull(), - data_rawa['act'].isnull() | data_rawa['lct'].isnull()] -choicelist = [((data_rawa['act']-data_rawa['lct'])-(data_rawa['act_l1']-data_rawa['lct_l1'])/(10*data_rawa['be'])), - 
(data_rawa['ib']-data_rawa['oancf'])/(10*data_rawa['be'])] -data_rawa['acc'] = np.select(condlist, - choicelist, - default=((data_rawa['act']-data_rawa['lct']+data_rawa['np'])- - (data_rawa['act_l1']-data_rawa['lct_l1']+data_rawa['np'].shift(1)))/(10*data_rawa['be'])) - -# agr -data_rawa['at_l1'] = data_rawa.groupby(['permno'])['at'].shift(1) -data_rawa['agr'] = (data_rawa['at']-data_rawa['at_l1'])/data_rawa['at_l1'] - -# bm -data_rawa['bm'] = data_rawa['be'] / data_rawa['me'] -data_rawa['bm_n'] = data_rawa['be'] - -# cfp -condlist = [data_rawa['dp'].isnull(), - data_rawa['ib'].isnull()] -choicelist = [data_rawa['ib']/data_rawa['me'], - np.nan] -data_rawa['cfp'] = np.select(condlist, choicelist, default=(data_rawa['ib']+data_rawa['dp'])/data_rawa['me']) - -condlist = [data_rawa['dp'].isnull(), - data_rawa['ib'].isnull()] -choicelist = [data_rawa['ib'], - np.nan] -data_rawa['cfp_n'] = np.select(condlist, choicelist, default=data_rawa['ib']+data_rawa['dp']) - -# ep -data_rawa['ep'] = data_rawa['ib']/data_rawa['me'] -data_rawa['ep_n'] = data_rawa['ib'] - -# ni -data_rawa['csho_l1'] = data_rawa.groupby(['permno'])['csho'].shift(1) -data_rawa['ajex_l1'] = data_rawa.groupby(['permno'])['ajex'].shift(1) -data_rawa['ni'] = np.where(data_rawa['gvkey'] != data_rawa['gvkey'].shift(1), - np.nan, - np.log(data_rawa['csho']*data_rawa['ajex']).replace(-np.inf, 0)- - np.log(data_rawa['csho_l1']*data_rawa['ajex_l1']).replace(-np.inf, 0)) - -# op -data_rawa['cogs0'] = np.where(data_rawa['cogs'].isnull(), 0, data_rawa['cogs']) -data_rawa['xint0'] = np.where(data_rawa['xint'].isnull(), 0, data_rawa['xint']) -data_rawa['xsga0'] = np.where(data_rawa['xsga'].isnull(), 0, data_rawa['xsga']) - -condlist = [data_rawa['revt'].isnull(), data_rawa['be'].isnull()] -choicelist = [np.nan, np.nan] -data_rawa['op'] = np.select(condlist, choicelist, - default=(data_rawa['revt'] - data_rawa['cogs0'] - data_rawa['xsga0'] - data_rawa['xint0'])/data_rawa['be']) - -# rsup -data_rawa['sale_l1'] = 
data_rawa.groupby(['permno'])['sale'].shift(1) -data_rawa['rsup'] = (data_rawa['sale']-data_rawa['sale_l1'])/data_rawa['me'] - -# sue -# data_rawa['ib_l1'] = data_rawa.groupby(['permno'])['ib'].shift(1) -# data_rawa['sue'] = (data_rawa['ib']-data_rawa['ib_l1'])/data_rawa['me'] - -# cash -data_rawa['cash'] = data_rawa['che']/data_rawa['at'] - -# lev -data_rawa['lev'] = data_rawa['lt']/data_rawa['me'] - -# sp -data_rawa['sp'] = data_rawa['sale']/data_rawa['me'] -data_rawa['sp_n'] = data_rawa['sale'] - -# rd_sale -data_rawa['rd_sale'] = data_rawa['xrd']/data_rawa['sale'] - -# rdm -data_rawa['rdm'] = data_rawa['xrd']/data_rawa['me'] - -# adm hxz adm -data_rawa['adm'] = data_rawa['xad']/data_rawa['me'] - -# gma -data_rawa['gma'] = (data_rawa['revt']-data_rawa['cogs'])/data_rawa['at_l1'] - -# chcsho -data_rawa['chcsho'] = (data_rawa['csho']/data_rawa['csho_l1'])-1 - -# lgr -data_rawa['lt_l1'] = data_rawa.groupby(['permno'])['lt'].shift(1) -data_rawa['lgr'] = (data_rawa['lt']/data_rawa['lt_l1'])-1 - -# pctacc -data_rawa['che_l1'] = data_rawa.groupby(['permno'])['che'].shift(1) -data_rawa['dlc_l1'] = data_rawa.groupby(['permno'])['dlc'].shift(1) -data_rawa['txp_l1'] = data_rawa.groupby(['permno'])['txp'].shift(1) - -condlist = [data_rawa['ib']==0, - data_rawa['oancf'].isnull(), - data_rawa['oancf'].isnull() & data_rawa['ib']==0] -choicelist = [(data_rawa['ib']-data_rawa['oancf'])/0.01, - ((data_rawa['act'] - data_rawa['act_l1']) - (data_rawa['che'] - data_rawa['che_l1']))- - ((data_rawa['lct'] - data_rawa['lct_l1']) - (data_rawa['dlc']) - data_rawa['dlc_l1']- - ((data_rawa['txp'] - data_rawa['txp_l1']) - data_rawa['dp']))/data_rawa['ib'].abs(), - ((data_rawa['act'] - data_rawa['act_l1']) - (data_rawa['che'] - data_rawa['che_l1'])) - - ((data_rawa['lct'] - data_rawa['lct_l1']) - (data_rawa['dlc']) - data_rawa['dlc_l1'] - - ((data_rawa['txp'] - data_rawa['txp_l1']) - data_rawa['dp']))] -data_rawa['pctacc'] = np.select(condlist, choicelist, 
default=(data_rawa['ib']-data_rawa['oancf'])/data_rawa['ib'].abs()) - -# age -# data_rawa['age'] = data_rawa['count'] - -# sgr -data_rawa['sgr'] = (data_rawa['sale']/data_rawa['sale_l1'])-1 - -# chpm -# data_rawa['chpm'] = (data_rawa['ib']/data_rawa['sale'])-(data_rawa['ib_l1']/data_rawa['sale_l1']) - -# chato -data_rawa['at_l2'] = data_rawa.groupby(['permno'])['at'].shift(2) -data_rawa['chato'] = (data_rawa['sale']/((data_rawa['at']+data_rawa['at_l1'])/2))-\ - (data_rawa['sale_l1']/((data_rawa['at']+data_rawa['at_l2'])/2)) - -# chtx -data_rawa['txt_l1'] = data_rawa.groupby(['permno'])['txt'].shift(1) -data_rawa['chtx'] = (data_rawa['txt']-data_rawa['txt_l1'])/data_rawa['at_l1'] - -# ala -# data_rawa['ala'] = data_rawa['che']+0.75*(data_rawa['act']-data_rawa['che'])-\ -# 0.5*(data_rawa['at']-data_rawa['act']-data_rawa['gdwl']-data_rawa['intan']) - -# alm -# data_rawa['alm'] = data_rawa['ala']/(data_rawa['at']+data_rawa['prcc_f']*data_rawa['csho']-data_rawa['ceq']) - -# noa -data_rawa['noa'] = ((data_rawa['at']-data_rawa['che']-data_rawa['ivao'].fillna(0))- - (data_rawa['at']-data_rawa['dlc'].fillna(0)-data_rawa['dltt'].fillna(0)-data_rawa['mib'].fillna(0) - -data_rawa['pstk'].fillna(0)-data_rawa['ceq'])/data_rawa['at_l1']) - -# rna -data_rawa['noa_l1'] = data_rawa.groupby(['permno'])['noa'].shift(1) -data_rawa['rna'] = data_rawa['oiadp']/data_rawa['noa_l1'] - -# pm -data_rawa['pm'] = data_rawa['oiadp']/data_rawa['sale'] - -# ato -data_rawa['ato'] = data_rawa['sale']/data_rawa['noa_l1'] - -# depr -data_rawa['depr'] = data_rawa['dp']/data_rawa['ppent'] - -# invest -data_rawa['ppent_l1'] = data_rawa.groupby(['permno'])['ppent'].shift(1) -data_rawa['invt_l1'] = data_rawa.groupby(['permno'])['invt'].shift(1) - -data_rawa['invest'] = np.where(data_rawa['ppegt'].isnull(), ((data_rawa['ppent']-data_rawa['ppent_l1'])+ - (data_rawa['invt']-data_rawa['invt_l1']))/data_rawa['at_l1'], - 
((data_rawa['ppegt']-data_rawa['ppent_l1'])+(data_rawa['invt']-data_rawa['invt_l1']))/data_rawa['at_l1']) - -# egr -data_rawa['ceq_l1'] = data_rawa.groupby(['permno'])['ceq'].shift(1) -data_rawa['egr'] = ((data_rawa['ceq']-data_rawa['ceq_l1'])/data_rawa['ceq_l1']) - -# cashdebt -data_rawa['cashdebt'] = (data_rawa['ib']+data_rawa['dp'])/((data_rawa['lt']+data_rawa['lt_l1'])/2) - -# # grltnoa -# lag_a['aco'] = np.where(data_rawa['gvkey'] == data_rawa['gvkey'].shift(1), data_rawa['aco'].shift(1), np.nan) -# lag_a['intan'] = np.where(data_rawa['gvkey'] == data_rawa['gvkey'].shift(1), data_rawa['intan'].shift(1), np.nan) -# lag_a['ao'] = np.where(data_rawa['gvkey'] == data_rawa['gvkey'].shift(1), data_rawa['ao'].shift(1), np.nan) -# lag_a['ap'] = np.where(data_rawa['gvkey'] == data_rawa['gvkey'].shift(1), data_rawa['ap'].shift(1), np.nan) -# lag_a['lco'] = np.where(data_rawa['gvkey'] == data_rawa['gvkey'].shift(1), data_rawa['lco'].shift(1), np.nan) -# lag_a['lo'] = np.where(data_rawa['gvkey'] == data_rawa['gvkey'].shift(1), data_rawa['lo'].shift(1), np.nan) -# lag_a['rect'] = np.where(data_rawa['gvkey'] == data_rawa['gvkey'].shift(1), data_rawa['rect'].shift(1), np.nan) -# -# data_rawa['grltnoa'] = ((data_rawa['rect']+data_rawa['invt']+data_rawa['ppent']+data_rawa['aco']+data_rawa['intan']+ -# data_rawa['ao']-data_rawa['ap']-data_rawa['lco']-data_rawa['lo'])- -# (lag_a['rect']+lag_a['invt']+lag_a['ppent']+lag_a['aco']+lag_a['intan']+lag_a['ao']-lag_a['ap']- -# lag_a['lco']-lag_a['lo'])-\ -# (data_rawa['rect']-lag_a['rect']+data_rawa['invt']-lag_a['invt']+data_rawa['aco']-lag_a['aco']- -# (data_rawa['ap']-lag_a['ap']+data_rawa['lco']-lag_a['lco'])-data_rawa['dp']))/((data_rawa['at']+lag_a['at'])/2) - -# rd -# if ((xrd/at)-(lag(xrd/lag(at))))/(lag(xrd/lag(at))) >.05 then rd=1; else rd=0; -data_rawa['xrd/at_l1'] = data_rawa['xrd']/data_rawa['at_l1'] -data_rawa['xrd/at_l1_l1'] = data_rawa.groupby(['permno'])['xrd/at_l1'].shift(1) -data_rawa['rd'] = 
np.where(((data_rawa['xrd']/data_rawa['at'])- - (data_rawa['xrd/at_l1_l1']))/data_rawa['xrd/at_l1_l1']>0.05, 1, 0) - -# roa -data_rawa['roa'] = data_rawa['ni']/((data_rawa['at']+data_rawa['at_l1'])/2) - -# roe -data_rawa['roe'] = data_rawa['ib']/data_rawa['ceq_l1'] - -# dy -data_rawa['dy'] = data_rawa['dvt']/data_rawa['me'] - -# Annual Accounting Variables -chars_a = data_rawa[['cusip', 'ncusip', 'gvkey', 'permno', 'exchcd', 'shrcd', 'datadate', 'jdate', - 'sic', 'acc', 'agr', 'bm', 'cfp', 'ep', 'ni', 'op', 'rsup', 'cash', 'chcsho', - 'rd', 'cashdebt', 'pctacc', 'gma', 'lev', 'rdm', 'adm', 'sgr', 'sp', 'invest', 'roe', - 'rd_sale', 'lgr', 'roa', 'depr', 'egr', 'chato', 'chtx', 'noa', 'rna', 'pm', 'ato', 'dy']] -chars_a.reset_index(drop=True, inplace=True) -####################################################################################################################### -# Compustat Quarterly Raw Info # -####################################################################################################################### -comp = conn.raw_sql(""" - /*header info*/ - select c.gvkey, f.cusip, f.datadate, f.fyearq, substr(c.sic,1,2) as sic2, c.sic, f.fqtr, f.rdq, - - /*income statement*/ - f.ibq, f.saleq, f.txtq, f.revtq, f.cogsq, f.xsgaq, f.revty, f.cogsy, f.saley, - - /*balance sheet items*/ - f.atq, f.actq, f.cheq, f.lctq, f.dlcq, f.ppentq, f.ppegtq, - - /*others*/ - abs(f.prccq) as prccq, abs(f.prccq)*f.cshoq as mveq_f, f.ceqq, f.seqq, f.pstkq, f.ltq, - f.pstkrq, f.gdwlq, f.intanq, f.mibq, f.oiadpq, f.ivaoq, - - /* v3 my formula add*/ - f.ajexq, f.cshoq, f.txditcq, f.npq, f.xrdy, f.xrdq, f.dpq, f.xintq, f.invtq, f.scstkcy, f.niq, - f.oancfy, f.dlttq - - from comp.fundq as f - left join comp.company as c - on f.gvkey = c.gvkey - - /*get consolidated, standardized, industrial format statements*/ - where f.indfmt = 'INDL' - and f.datafmt = 'STD' - and f.popsrc = 'D' - and f.consol = 'C' - and f.datadate >= '01/01/1959' - """) - -# comp['cusip6'] = 
comp['cusip'].str.strip().str[0:6] -comp = comp.dropna(subset=['ibq']) - -# sort and clean up -comp = comp.sort_values(by=['gvkey', 'datadate']).drop_duplicates() -comp['cshoq'] = np.where(comp['cshoq'] == 0, np.nan, comp['cshoq']) -comp['ceqq'] = np.where(comp['ceqq'] == 0, np.nan, comp['ceqq']) -comp['atq'] = np.where(comp['atq'] == 0, np.nan, comp['atq']) -comp = comp.dropna(subset=['atq']) - -# convert datadate to date fmt -comp['datadate'] = pd.to_datetime(comp['datadate']) - -# merge ccm and comp -ccm1 = pd.merge(comp, ccm, how='left', on=['gvkey']) -ccm1['yearend'] = ccm1['datadate'] + YearEnd(0) -ccm1['jdate'] = ccm1['datadate'] + MonthEnd(3) # we change quarterly lag here -# ccm1['jdate'] = ccm1['datadate']+MonthEnd(4) - -# set link date bounds -ccm2 = ccm1[(ccm1['jdate'] >= ccm1['linkdt']) & (ccm1['jdate'] <= ccm1['linkenddt'])] - -# merge ccm2 and crsp2 -# crsp2['jdate'] = crsp2['monthend'] -data_rawq = pd.merge(crsp2, ccm2, how='inner', on=['permno', 'jdate']) - -# filter exchcd & shrcd -data_rawq = data_rawq[((data_rawq['exchcd'] == 1) | (data_rawq['exchcd'] == 2) | (data_rawq['exchcd'] == 3)) & - ((data_rawq['shrcd'] == 10) | (data_rawq['shrcd'] == 11))] - -# process Market Equity -''' -Note: me is CRSP market equity, mveq_f is Compustat market equity. Please choose the me below. 
-''' -data_rawq['me'] = data_rawq['me']/1000 # CRSP ME -# data_rawq['me'] = data_rawq['mveq_f'] # Compustat ME - -# there are some ME equal to zero since this company do not have price or shares data, we drop these observations -data_rawq['me'] = np.where(data_rawq['me'] == 0, np.nan, data_rawq['me']) -data_rawq = data_rawq.dropna(subset=['me']) - -# count single stock years -# data_rawq['count'] = data_rawq.groupby(['gvkey']).cumcount() - -# deal with the duplicates -data_rawq.loc[data_rawq.groupby(['datadate', 'permno', 'linkprim'], as_index=False).nth([0]).index, 'temp'] = 1 -data_rawq = data_rawq[data_rawq['temp'].notna()] -data_rawq.loc[data_rawq.groupby(['permno', 'yearend', 'datadate'], as_index=False).nth([-1]).index, 'temp'] = 1 -data_rawq = data_rawq[data_rawq['temp'].notna()] - -data_rawq = data_rawq.sort_values(by=['permno', 'jdate']) - -####################################################################################################################### -# Quarterly Variables # -####################################################################################################################### -# prepare be -data_rawq['beq'] = np.where(data_rawq['seqq']>0, data_rawq['seqq']+data_rawq['txditcq']-data_rawq['pstkq'], np.nan) -data_rawq['beq'] = np.where(data_rawq['beq']<=0, np.nan, data_rawq['beq']) - -# dy -data_rawq['me_l1'] = data_rawq.groupby(['permno'])['me'].shift(1) -data_rawq['retdy'] = data_rawq['ret'] - data_rawq['retx'] -data_rawq['mdivpay'] = data_rawq['retdy']*data_rawq['me_l1'] - -data_rawq['dy'] = ttm12(series='mdivpay', df=data_rawq)/data_rawq['me'] - -# # pstk -# chars_q['pstk'] = np.where(data_rawq['pstkrq'].notna(), data_rawq['pstkrq'], data_rawq['pstkq']) -# -# # scal -# condlist = [data_rawq['seqq'].isnull(), -# data_rawq['seqq'].isnull() & (data_rawq['ceqq'].isnull() | chars_q['pstk'].isnull())] -# choicelist = [data_rawq['ceqq']+chars_q['pstk'], -# data_rawq['atq']-data_rawq['ltq']] -# chars_q['scal'] = np.select(condlist, 
choicelist, default=data_rawq['seqq']) - -# chtx -data_rawq['txtq_l4'] = data_rawq.groupby(['permno'])['txtq'].shift(4) -data_rawq['atq_l4'] = data_rawq.groupby(['permno'])['atq'].shift(4) -data_rawq['chtx'] = (data_rawq['txtq']-data_rawq['txtq_l4'])/data_rawq['atq_l4'] - -# roa -data_rawq['atq_l1'] = data_rawq.groupby(['permno'])['atq'].shift(1) -data_rawq['roa'] = data_rawq['ibq']/data_rawq['atq_l1'] - -# cash -data_rawq['cash'] = data_rawq['cheq']/data_rawq['atq'] - -# acc -data_rawq['actq_l4'] = data_rawq.groupby(['permno'])['actq'].shift(4) -data_rawq['lctq_l4'] = data_rawq.groupby(['permno'])['lctq'].shift(4) -data_rawq['npq_l4'] = data_rawq.groupby(['permno'])['npq'].shift(4) -condlist = [data_rawq['npq'].isnull(), - data_rawq['actq'].isnull() | data_rawq['lctq'].isnull()] -choicelist = [((data_rawq['actq']-data_rawq['lctq'])-(data_rawq['actq_l4']-data_rawq['lctq_l4']))/(10*data_rawq['beq']), - np.nan] -data_rawq['acc'] = np.select(condlist, choicelist, - default=((data_rawq['actq']-data_rawq['lctq']+data_rawq['npq'])- - (data_rawq['actq_l4']-data_rawq['lctq_l4']+data_rawq['npq_l4']))/(10*data_rawq['beq'])) - -# bm -data_rawq['bm'] = data_rawq['beq']/data_rawq['me'] -data_rawq['bm_n'] = data_rawq['beq'] - -# cfp -data_rawq['cfp'] = np.where(data_rawq['dpq'].isnull(), - ttm4('ibq', data_rawq)/data_rawq['me'], - (ttm4('ibq', data_rawq)+ttm4('dpq', data_rawq))/data_rawq['me']) -data_rawq['cfp_n'] = data_rawq['cfp']*data_rawq['me'] - -# ep -data_rawq['ep'] = ttm4('ibq', data_rawq)/data_rawq['me'] -data_rawq['ep_n'] = data_rawq['ep']*data_rawq['me'] - -# agr -data_rawq['agr'] = (data_rawq['atq']-data_rawq['atq_l4'])/data_rawq['atq_l4'] - -# ni -data_rawq['cshoq_l4'] = data_rawq.groupby(['permno'])['cshoq'].shift(4) -data_rawq['ajexq_l4'] = data_rawq.groupby(['permno'])['ajexq'].shift(4) -data_rawq['ni'] = np.where(data_rawq['cshoq'].isnull(), np.nan, - np.log(data_rawq['cshoq']*data_rawq['ajexq']).replace(-np.inf, 
0)-np.log(data_rawq['cshoq_l4']*data_rawq['ajexq_l4'])) - -# op -data_rawq['xintq0'] = np.where(data_rawq['xintq'].isnull(), 0, data_rawq['xintq']) -data_rawq['xsgaq0'] = np.where(data_rawq['xsgaq'].isnull(), 0, data_rawq['xsgaq']) -data_rawq['beq_l4'] = data_rawq.groupby(['permno'])['beq'].shift(4) - -data_rawq['op'] = (ttm4('revtq', data_rawq)-ttm4('cogsq', data_rawq)-ttm4('xsgaq0', data_rawq)-ttm4('xintq0', data_rawq))/data_rawq['beq_l4'] - -# sue -# data_rawq['ibq_l4'] = data_rawq.groupby(['permno'])['ibq'].shift(4) -# data_rawq['sue'] = (data_rawq['ibq']-data_rawq['ibq_l4'])/data_rawq['me'].abs() - -# csho -data_rawq['chcsho'] = (data_rawq['cshoq']/data_rawq['cshoq_l4'])-1 - -# cashdebt -data_rawq['ltq_l4'] = data_rawq.groupby(['permno'])['ltq'].shift(4) -data_rawq['cashdebt'] = (ttm4('ibq', data_rawq) + ttm4('dpq', data_rawq))/((data_rawq['ltq']+data_rawq['ltq_l4'])/2) - -# rd -data_rawq['xrdq4'] = ttm4('xrdq', data_rawq) -data_rawq['xrdq4'] = np.where(data_rawq['xrdq4'].isnull(), data_rawq['xrdy'], data_rawq['xrdq4']) - -data_rawq['xrdq4/atq_l4'] = data_rawq['xrdq4']/data_rawq['atq_l4'] -data_rawq['xrdq4/atq_l4_l4'] = data_rawq.groupby(['permno'])['xrdq4/atq_l4'].shift(4) -data_rawq['rd'] = np.where(((data_rawq['xrdq4']/data_rawq['atq'])-data_rawq['xrdq4/atq_l4_l4'])/data_rawq['xrdq4/atq_l4_l4']>0.05, 1, 0) - -# pctacc -condlist = [data_rawq['npq'].isnull(), - data_rawq['actq'].isnull() | data_rawq['lctq'].isnull()] -choicelist = [((data_rawq['actq']-data_rawq['lctq'])-(data_rawq['actq_l4']-data_rawq['lctq_l4']))/abs(ttm4('ibq', data_rawq)), np.nan] -data_rawq['pctacc'] = np.select(condlist, choicelist, - default=((data_rawq['actq']-data_rawq['lctq']+data_rawq['npq'])-(data_rawq['actq_l4']-data_rawq['lctq_l4']+data_rawq['npq_l4']))/ - abs(ttm4('ibq', data_rawq))) - -# gma -data_rawq['revtq4'] = ttm4('revtq', data_rawq) -data_rawq['cogsq4'] = ttm4('cogsq', data_rawq) -data_rawq['gma'] = (data_rawq['revtq4']-data_rawq['cogsq4'])/data_rawq['atq_l4'] - -# lev 
-data_rawq['lev'] = data_rawq['ltq']/data_rawq['me'] - -# rdm -data_rawq['rdm'] = data_rawq['xrdq4']/data_rawq['me'] - -# sgr -data_rawq['saleq4'] = ttm4('saleq', data_rawq) -data_rawq['saleq4'] = np.where(data_rawq['saleq4'].isnull(), data_rawq['saley'], data_rawq['saleq4']) - -data_rawq['saleq4_l4'] = data_rawq.groupby(['permno'])['saleq4'].shift(4) -data_rawq['sgr'] = (data_rawq['saleq4']/data_rawq['saleq4_l4'])-1 - -# sp -data_rawq['sp'] = data_rawq['saleq4']/data_rawq['me'] -data_rawq['sp_n'] = data_rawq['saleq4'] - -# invest -data_rawq['ppentq_l4'] = data_rawq.groupby(['permno'])['ppentq'].shift(4) -data_rawq['invtq_l4'] = data_rawq.groupby(['permno'])['invtq'].shift(4) -data_rawq['ppegtq_l4'] = data_rawq.groupby(['permno'])['ppegtq'].shift(4) - -data_rawq['invest'] = np.where(data_rawq['ppegtq'].isnull(), ((data_rawq['ppentq']-data_rawq['ppentq_l4'])+ - (data_rawq['invtq']-data_rawq['invtq_l4']))/data_rawq['atq_l4'], - ((data_rawq['ppegtq']-data_rawq['ppegtq_l4'])+(data_rawq['invtq']-data_rawq['invtq_l4']))/data_rawq['atq_l4']) - -# rd_sale -data_rawq['rd_sale'] = data_rawq['xrdq4']/data_rawq['saleq4'] - -# lgr -data_rawq['lgr'] = (data_rawq['ltq']/data_rawq['ltq_l4'])-1 - -# depr -data_rawq['depr'] = ttm4('dpq', data_rawq)/data_rawq['ppentq'] - -# egr -data_rawq['ceqq_l4'] = data_rawq.groupby(['permno'])['ceqq'].shift(4) -data_rawq['egr'] = (data_rawq['ceqq']-data_rawq['ceqq_l4'])/data_rawq['ceqq_l4'] - -# grltnoa -# lag_q['rectq4'] = np.where(data_rawq['gvkey'] == data_rawq['gvkey'].shift(4), data_rawq['rectq'].shift(4), np.nan) -# lag_q['acoq4'] = np.where(data_rawq['gvkey'] == data_rawq['gvkey'].shift(4), data_rawq['acoq'].shift(4), np.nan) -# lag_q['apq4'] = np.where(data_rawq['gvkey'] == data_rawq['gvkey'].shift(4), data_rawq['apq'].shift(4), np.nan) -# lag_q['lcoq4'] = np.where(data_rawq['gvkey'] == data_rawq['gvkey'].shift(4), data_rawq['lcoq'].shift(4), np.nan) -# lag_q['loq4'] = np.where(data_rawq['gvkey'] == data_rawq['gvkey'].shift(4), 
data_rawq['loq'].shift(4), np.nan) -# -# chars_q['grltnoa'] = ((data_rawq['rectq']+data_rawq['invtq']+data_rawq['ppentq']+data_rawq['acoq']+data_rawq['intanq']+ -# data_rawq['aoq']-data_rawq['apq']-data_rawq['lcoq']-data_rawq['loq'])- -# (lag_q['rectq4']+lag_q['invtq4']+lag_q['ppentq4']+lag_q['acoq4']-lag_q['apq4']-lag_q['lcoq4']-lag_q['loq4'])-\ -# (data_rawq['rectq']-lag_q['rectq4']+data_rawq['invtq']-lag_q['invtq4']+data_rawq['acoq']- -# (data_rawq['apq']-lag_q['apq4']+data_rawq['lcoq']-lag_q['lcoq4'])- -# ttm4('dpq', data_rawq)))/((data_rawq['atq']+lag_q['atq4'])/2) - -# chpm -data_rawq['ibq4'] = ttm4('ibq', data_rawq) -data_rawq['ibq4_l1'] = data_rawq.groupby(['permno'])['ibq4'].shift(1) -data_rawq['saleq4_l1'] = data_rawq.groupby(['permno'])['saleq4'].shift(1) - -data_rawq['chpm'] = (data_rawq['ibq4']/data_rawq['saleq4'])-(data_rawq['ibq4_l1']/data_rawq['saleq4_l1']) - -# chato -data_rawq['atq_l8'] = data_rawq.groupby(['permno'])['atq'].shift(8) -data_rawq['chato'] = (data_rawq['saleq4']/((data_rawq['atq']+data_rawq['atq_l4'])/2))-(data_rawq['saleq4_l4']/((data_rawq['atq_l4']+data_rawq['atq_l8'])/2)) - -# ala -# data_rawq['ala'] = data_rawq['cheq'] + 0.75*(data_rawq['actq']-data_rawq['cheq'])+\ -# 0.5*(data_rawq['atq']-data_rawq['actq']-data_rawq['gdwlq']-data_rawq['intanq']) - -# alm -# data_rawq['alm'] = data_rawq['ala']/(data_rawq['atq']+data_rawq['me']-data_rawq['ceqq']) - -# noa -data_rawq['ivaoq'] = np.where(data_rawq['ivaoq'].isnull(), 0, 1) -data_rawq['dlcq'] = np.where(data_rawq['dlcq'].isnull(), 0, 1) -data_rawq['dlttq'] = np.where(data_rawq['dlttq'].isnull(), 0, 1) -data_rawq['mibq'] = np.where(data_rawq['mibq'].isnull(), 0, 1) -data_rawq['pstkq'] = np.where(data_rawq['pstkq'].isnull(), 0, 1) -data_rawq['noa'] = (data_rawq['atq']-data_rawq['cheq']-data_rawq['ivaoq'])-\ - (data_rawq['atq']-data_rawq['dlcq']-data_rawq['dlttq']-data_rawq['mibq']-data_rawq['pstkq']-data_rawq['ceqq'])/data_rawq['atq_l4'] - -# rna -data_rawq['noa_l4'] = 
data_rawq.groupby(['permno'])['noa'].shift(4) -data_rawq['rna'] = data_rawq['oiadpq']/data_rawq['noa_l4'] - -# pm -data_rawq['pm'] = data_rawq['oiadpq']/data_rawq['saleq'] - -# ato -data_rawq['ato'] = data_rawq['saleq']/data_rawq['noa_l4'] - -# roe -data_rawq['ceqq_l1'] = data_rawq.groupby(['permno'])['ceqq'].shift(1) -data_rawq['roe'] = data_rawq['ibq']/data_rawq['ceqq_l1'] - -# Quarterly Accounting Variables -chars_q = data_rawq[['gvkey', 'permno', 'datadate', 'jdate', 'sic', 'exchcd', 'shrcd', 'acc', 'bm', 'cfp', - 'ep', 'agr', 'ni', 'op', 'cash', 'chcsho', 'rd', 'cashdebt', 'pctacc', 'gma', 'lev', - 'rdm', 'sgr', 'sp', 'invest', 'rd_sale', 'lgr', 'roa', 'depr', 'egr', 'roe', - 'chato', 'chpm', 'chtx', 'noa', 'rna', 'pm', 'ato']] -chars_q.reset_index(drop=True, inplace=True) - -####################################################################################################################### -# Momentum # -####################################################################################################################### -crsp_mom = conn.raw_sql(""" - select permno, date, ret, retx, prc, shrout - from crsp.msf - where date >= '01/01/1959' - """) - -crsp_mom['permno'] = crsp_mom['permno'].astype(int) -crsp_mom['jdate'] = pd.to_datetime(crsp_mom['date']) + MonthEnd(0) -crsp_mom = crsp_mom.dropna(subset=['ret', 'retx', 'prc']) - -# add delisting return -dlret = conn.raw_sql(""" - select permno, dlret, dlstdt - from crsp.msedelist - """) - -dlret.permno = dlret.permno.astype(int) -dlret['dlstdt'] = pd.to_datetime(dlret['dlstdt']) -dlret['jdate'] = dlret['dlstdt'] + MonthEnd(0) - -# merge delisting return to crsp return -crsp_mom = pd.merge(crsp_mom, dlret, how='left', on=['permno', 'jdate']) -crsp_mom['dlret'] = crsp_mom['dlret'].fillna(0) -crsp_mom['ret'] = crsp_mom['ret'].fillna(0) -crsp_mom['retadj'] = (1 + crsp_mom['ret']) * (1 + crsp_mom['dlret']) - 1 -crsp_mom['me'] = crsp_mom['prc'].abs() * crsp_mom['shrout'] # calculate market equity -crsp_mom = 
crsp_mom.drop(['dlret', 'dlstdt', 'prc', 'shrout'], axis=1) - - -def mom(start, end, df): - """ - - :param start: Order of starting lag - :param end: Order of ending lag - :param df: Dataframe - :return: Momentum factor - """ - lag = pd.DataFrame() - result = 1 - for i in range(start, end): - lag['mom%s' % i] = df.groupby(['permno'])['ret'].shift(i) - result = result * (1+lag['mom%s' % i]) - result = result - 1 - return result - - -crsp_mom['mom60m'] = mom(12, 60, crsp_mom) -crsp_mom['mom12m'] = mom(1, 12, crsp_mom) -crsp_mom['mom1m'] = crsp_mom['ret'] -crsp_mom['mom6m'] = mom(1, 6, crsp_mom) -crsp_mom['mom36m'] = mom(1, 36, crsp_mom) -crsp_mom['seas1a'] = crsp_mom.groupby(['permno'])['ret'].shift(11) - - -# def moms(start, end, df): -# """ -# -# :param start: Order of starting lag -# :param end: Order of ending lag -# :param df: Dataframe -# :return: Momentum factor -# """ -# lag = pd.DataFrame() -# result = 1 -# for i in range(start, end): -# lag['moms%s' % i] = df.groupby['permno']['ret'].shift(i) -# result = result + lag['moms%s' % i] -# result = result/11 -# return result -# -# -# crsp_mom['moms12m'] = moms(1, 12, crsp_mom) - -# populate the chars to monthly - -# chars_a -chars_a = pd.merge(crsp_mom, chars_a, how='left', on=['permno', 'jdate']) -chars_a['datadate'] = chars_a.groupby(['permno'])['datadate'].fillna(method='ffill') -chars_a = chars_a.groupby(['permno', 'datadate'], as_index=False).fillna(method='ffill') -chars_a = chars_a[((chars_a['exchcd'] == 1) | (chars_a['exchcd'] == 2) | (chars_a['exchcd'] == 3)) & - ((chars_a['shrcd'] == 10) | (chars_a['shrcd'] == 11))] - -# chars_q -chars_q = pd.merge(crsp_mom, chars_q, how='left', on=['permno', 'jdate']) -chars_q['datadate'] = chars_q.groupby(['permno'])['datadate'].fillna(method='ffill') -chars_q = chars_q.groupby(['permno', 'datadate'], as_index=False).fillna(method='ffill') -chars_q = chars_q[((chars_q['exchcd'] == 1) | (chars_q['exchcd'] == 2) | (chars_q['exchcd'] == 3)) & - ((chars_q['shrcd'] == 10) 
| (chars_q['shrcd'] == 11))] - -with open('chars_a.pkl', 'wb') as f: - pkl.dump(chars_a, f) - -with open('chars_q.pkl', 'wb') as f: - pkl.dump(chars_q, f) \ No newline at end of file diff --git a/pychars/beta.py b/pychars/beta.py deleted file mode 100755 index 583806a..0000000 --- a/pychars/beta.py +++ /dev/null @@ -1,70 +0,0 @@ -# BETA monthly version -# Note: Please use the latest version of pandas, this version should support returning to pd.Series after rolling - -import pandas as pd -import numpy as np -import datetime as dt -import wrds -from dateutil.relativedelta import * -from pandas.tseries.offsets import * -import datetime -import pickle as pkl - -################### -# Connect to WRDS # -################### -conn = wrds.Connection() - -# CRSP Block -crsp = conn.raw_sql(""" - select a.permno, a.date, a.ret, (a.ret - b.rf) as exret, b.mktrf - from crsp.msf as a - left join ff.factors_daily as b - on a.date=b.date - where a.date > '01/01/1959' - """) - -# sort variables by permno and date -crsp = crsp.sort_values(by=['permno', 'date']) - -# change variable format to int -crsp['permno'] = crsp['permno'].astype(int) - -# line up date to be end of month -crsp['date'] = pd.to_datetime(crsp['date']) - -###################### -# Calculate the beta # -###################### -rolling_window = 60 # 60 months - - -# TODO: find a faster way to get rolling sub dataframe -def get_beta(df): - """ - The original idea of calculate beta is using formula (X'MX)^(-1)X'MY, - where M = I - 1(1'1)^{-1}1, I is a identity matrix. 
- - """ - temp = crsp.loc[df.index] # extract the rolling sub dataframe from original dataframe - X = np.mat(temp[['mktrf']]) - Y = np.mat(temp[['exret']]) - ones = np.mat(np.ones(rolling_window)).T - M = np.identity(rolling_window) - ones.dot((ones.T.dot(ones)).I).dot(ones.T) - beta = (X.T.dot(M).dot(X)).I.dot((X.T.dot(M).dot(Y))) - return beta - - -# calculate beta through rolling window -crsp_temp = crsp.groupby('permno').rolling(rolling_window).apply(get_beta, raw=False) - -# arrange final outcome -crsp_temp = crsp_temp[['mktrf']] # all columns values are beta, we drop extra columns here -crsp_temp = crsp_temp.rename(columns={'mktrf': 'beta'}) -crsp_temp = crsp_temp.reset_index() -crsp['beta'] = crsp_temp['beta'] -crsp = crsp.dropna(subset=['beta']) # drop NA due to rolling -crsp = crsp[['permno', 'date', 'beta']] - -with open('beta.pkl', 'wb') as f: - pkl.dump(crsp, f) \ No newline at end of file diff --git a/pychars/functions.py b/pychars/functions.py deleted file mode 100755 index 4ad6b40..0000000 --- a/pychars/functions.py +++ /dev/null @@ -1,445 +0,0 @@ -import pandas as pd -import pickle as pkl -import numpy as np -import re - -def ffi49(df): - condlist = [((100 <= df['sic']) & (df['sic'] <= 199)) | ((200 <= df['sic']) & (df['sic'] <= 299)) | - ((700 <= df['sic']) & (df['sic'] <= 799)) | ((910 <= df['sic']) & (df['sic'] <= 919)) | - ((2048 <= df['sic']) & (df['sic'] <= 2048)), - ((2000 <= df['sic']) & (df['sic'] <= 2009)) | ((2010 <= df['sic']) & (df['sic'] <= 2019)) | - ((2020 <= df['sic']) & (df['sic'] <= 2029)) | ((2030 <= df['sic']) & (df['sic'] <= 2039)) | - ((2040 <= df['sic']) & (df['sic'] <= 2046)) | ((2050 <= df['sic']) & (df['sic'] <= 2059)) | - ((2060 <= df['sic']) & (df['sic'] <= 2063)) | ((2070 <= df['sic']) & (df['sic'] <= 2079)) | - ((2090 <= df['sic']) & (df['sic'] <= 2092)) | ((2095 <= df['sic']) & (df['sic'] <= 2095)) | - ((2098 <= df['sic']) & (df['sic'] <= 2099)), - ((2064 <= df['sic']) & (df['sic'] <= 2068)) | ((2086 <= df['sic']) & 
(df['sic'] <= 2086)) | - ((2087 <= df['sic']) & (df['sic'] <= 2087)) | ((2096 <= df['sic']) & (df['sic'] <= 2096)) | - ((2097 <= df['sic']) & (df['sic'] <= 2097)), - ((2080 <= df['sic']) & (df['sic'] <= 2080)) | ((2082 <= df['sic']) & (df['sic'] <= 2082)) | - ((2083 <= df['sic']) & (df['sic'] <= 2083)) | ((2084 <= df['sic']) & (df['sic'] <= 2084)) | - ((2085 <= df['sic']) & (df['sic'] <= 2085)), - ((2100 <= df['sic']) & (df['sic'] <= 2199)), - ((920 <= df['sic']) & (df['sic'] <= 999)) | ((3650 <= df['sic']) & (df['sic'] <= 3651)) | - ((3652 <= df['sic']) & (df['sic'] <= 3652)) | ((3732 <= df['sic']) & (df['sic'] <= 3732)) | - ((3930 <= df['sic']) & (df['sic'] <= 3931)) | ((3940 <= df['sic']) & (df['sic'] <= 3949)), - ((7800 <= df['sic']) & (df['sic'] <= 7829)) | ((7830 <= df['sic']) & (df['sic'] <= 7833)) | - ((7840 <= df['sic']) & (df['sic'] <= 7841)) | ((7900 <= df['sic']) & (df['sic'] <= 7900)) | - ((7910 <= df['sic']) & (df['sic'] <= 7911)) | ((7920 <= df['sic']) & (df['sic'] <= 7929)) | - ((7930 <= df['sic']) & (df['sic'] <= 7933)) | ((7940 <= df['sic']) & (df['sic'] <= 7949)) | - ((7980 <= df['sic']) & (df['sic'] <= 7980)) | ((7990 <= df['sic']) & (df['sic'] <= 7999)), - ((2700 <= df['sic']) & (df['sic'] <= 2709)) | ((2710 <= df['sic']) & (df['sic'] <= 2719)) | - ((2720 <= df['sic']) & (df['sic'] <= 2729)) | ((2730 <= df['sic']) & (df['sic'] <= 2739)) | - ((2740 <= df['sic']) & (df['sic'] <= 2749)) | ((2770 <= df['sic']) & (df['sic'] <= 2771)) | - ((2780 <= df['sic']) & (df['sic'] <= 2789)) | ((2790 <= df['sic']) & (df['sic'] <= 2799)), - ((2047 <= df['sic']) & (df['sic'] <= 2047)) | ((2391 <= df['sic']) & (df['sic'] <= 2392)) | - ((2510 <= df['sic']) & (df['sic'] <= 2519)) | ((2590 <= df['sic']) & (df['sic'] <= 2599)) | - ((2840 <= df['sic']) & (df['sic'] <= 2843)) | ((2844 <= df['sic']) & (df['sic'] <= 2844)) | - ((3160 <= df['sic']) & (df['sic'] <= 3161)) | ((3170 <= df['sic']) & (df['sic'] <= 3171)) | - ((3172 <= df['sic']) & (df['sic'] <= 3172)) | ((3190 
<= df['sic']) & (df['sic'] <= 3199)) | - ((3229 <= df['sic']) & (df['sic'] <= 3229)) | ((3260 <= df['sic']) & (df['sic'] <= 3260)) | - ((3262 <= df['sic']) & (df['sic'] <= 3263)) | ((3269 <= df['sic']) & (df['sic'] <= 3269)) | - ((3230 <= df['sic']) & (df['sic'] <= 3231)) | ((3630 <= df['sic']) & (df['sic'] <= 3639)) | - ((3750 <= df['sic']) & (df['sic'] <= 3751)) | ((3800 <= df['sic']) & (df['sic'] <= 3800)) | - ((3860 <= df['sic']) & (df['sic'] <= 3861)) | ((3870 <= df['sic']) & (df['sic'] <= 3873)) | - ((3910 <= df['sic']) & (df['sic'] <= 3911)) | ((3914 <= df['sic']) & (df['sic'] <= 3914)) | - ((3915 <= df['sic']) & (df['sic'] <= 3915)) | ((3960 <= df['sic']) & (df['sic'] <= 3962)) | - ((3991 <= df['sic']) & (df['sic'] <= 3991)) | ((3995 <= df['sic']) & (df['sic'] <= 3995)), - ((2300 <= df['sic']) & (df['sic'] <= 2390)) | ((3020 <= df['sic']) & (df['sic'] <= 3021)) | - ((3100 <= df['sic']) & (df['sic'] <= 3111)) | ((3130 <= df['sic']) & (df['sic'] <= 3131)) | - ((3140 <= df['sic']) & (df['sic'] <= 3149)) | ((3150 <= df['sic']) & (df['sic'] <= 3151)) | - ((3963 <= df['sic']) & (df['sic'] <= 3965)), - ((8000 <= df['sic']) & (df['sic'] <= 8099)), - ((3693 <= df['sic']) & (df['sic'] <= 3693)) | ((3840 <= df['sic']) & (df['sic'] <= 3849)) | - ((3850 <= df['sic']) & (df['sic'] <= 3851)), - ((2830 <= df['sic']) & (df['sic'] <= 2830)) | ((2831 <= df['sic']) & (df['sic'] <= 2831)) | - ((2833 <= df['sic']) & (df['sic'] <= 2833)) | ((2834 <= df['sic']) & (df['sic'] <= 2834)) | - ((2835 <= df['sic']) & (df['sic'] <= 2835)) | ((2836 <= df['sic']) & (df['sic'] <= 2836)), - ((2800 <= df['sic']) & (df['sic'] <= 2809)) | ((2810 <= df['sic']) & (df['sic'] <= 2819)) | - ((2820 <= df['sic']) & (df['sic'] <= 2829)) | ((2850 <= df['sic']) & (df['sic'] <= 2859)) | - ((2860 <= df['sic']) & (df['sic'] <= 2869)) | ((2870 <= df['sic']) & (df['sic'] <= 2879)) | - ((2890 <= df['sic']) & (df['sic'] <= 2899)), - ((3031 <= df['sic']) & (df['sic'] <= 3031)) | ((3041 <= df['sic']) & (df['sic'] 
<= 3041)) | - ((3050 <= df['sic']) & (df['sic'] <= 3053)) | ((3060 <= df['sic']) & (df['sic'] <= 3069)) | - ((3070 <= df['sic']) & (df['sic'] <= 3079)) | ((3080 <= df['sic']) & (df['sic'] <= 3089)) | - ((3090 <= df['sic']) & (df['sic'] <= 3099)), - ((2200 <= df['sic']) & (df['sic'] <= 2269)) | ((2270 <= df['sic']) & (df['sic'] <= 2279)) | - ((2280 <= df['sic']) & (df['sic'] <= 2284)) | ((2290 <= df['sic']) & (df['sic'] <= 2295)) | - ((2297 <= df['sic']) & (df['sic'] <= 2297)) | ((2298 <= df['sic']) & (df['sic'] <= 2298)) | - ((2299 <= df['sic']) & (df['sic'] <= 2299)) | ((2393 <= df['sic']) & (df['sic'] <= 2395)) | - ((2397 <= df['sic']) & (df['sic'] <= 2399)), - ((800 <= df['sic']) & (df['sic'] <= 899)) | ((2400 <= df['sic']) & (df['sic'] <= 2439)) | - ((2450 <= df['sic']) & (df['sic'] <= 2459)) | ((2490 <= df['sic']) & (df['sic'] <= 2499)) | - ((2660 <= df['sic']) & (df['sic'] <= 2661)) | ((2950 <= df['sic']) & (df['sic'] <= 2952)) | - ((3200 <= df['sic']) & (df['sic'] <= 3200)) | ((3210 <= df['sic']) & (df['sic'] <= 3211)) | - ((3240 <= df['sic']) & (df['sic'] <= 3241)) | ((3250 <= df['sic']) & (df['sic'] <= 3259)) | - ((3261 <= df['sic']) & (df['sic'] <= 3261)) | ((3264 <= df['sic']) & (df['sic'] <= 3264)) | - ((3270 <= df['sic']) & (df['sic'] <= 3275)) | ((3280 <= df['sic']) & (df['sic'] <= 3281)) | - ((3290 <= df['sic']) & (df['sic'] <= 3293)) | ((3295 <= df['sic']) & (df['sic'] <= 3299)) | - ((3420 <= df['sic']) & (df['sic'] <= 3429)) | ((3430 <= df['sic']) & (df['sic'] <= 3433)) | - ((3440 <= df['sic']) & (df['sic'] <= 3441)) | ((3442 <= df['sic']) & (df['sic'] <= 3442)) | - ((3446 <= df['sic']) & (df['sic'] <= 3446)) | ((3448 <= df['sic']) & (df['sic'] <= 3448)) | - ((3449 <= df['sic']) & (df['sic'] <= 3449)) | ((3450 <= df['sic']) & (df['sic'] <= 3451)) | - ((3452 <= df['sic']) & (df['sic'] <= 3452)) | ((3490 <= df['sic']) & (df['sic'] <= 3499)) | - ((3996 <= df['sic']) & (df['sic'] <= 3996)), - ((1500 <= df['sic']) & (df['sic'] <= 1511)) | ((1520 <= 
df['sic']) & (df['sic'] <= 1529)) | - ((1530 <= df['sic']) & (df['sic'] <= 1539)) | ((1540 <= df['sic']) & (df['sic'] <= 1549)) | - ((1600 <= df['sic']) & (df['sic'] <= 1699)) | ((1700 <= df['sic']) & (df['sic'] <= 1799)), - ((3300 <= df['sic']) & (df['sic'] <= 3300)) | ((3310 <= df['sic']) & (df['sic'] <= 3317)) | - ((3320 <= df['sic']) & (df['sic'] <= 3325)) | ((3330 <= df['sic']) & (df['sic'] <= 3339)) | - ((3340 <= df['sic']) & (df['sic'] <= 3341)) | ((3350 <= df['sic']) & (df['sic'] <= 3357)) | - ((3360 <= df['sic']) & (df['sic'] <= 3369)) | ((3370 <= df['sic']) & (df['sic'] <= 3379)) | - ((3390 <= df['sic']) & (df['sic'] <= 3399)), - ((3400 <= df['sic']) & (df['sic'] <= 3400)) | ((3443 <= df['sic']) & (df['sic'] <= 3443)) | - ((3444 <= df['sic']) & (df['sic'] <= 3444)) | ((3460 <= df['sic']) & (df['sic'] <= 3469)) | - ((3470 <= df['sic']) & (df['sic'] <= 3479)), - ((3510 <= df['sic']) & (df['sic'] <= 3519)) | ((3520 <= df['sic']) & (df['sic'] <= 3529)) | - ((3530 <= df['sic']) & (df['sic'] <= 3530)) | ((3531 <= df['sic']) & (df['sic'] <= 3531)) | - ((3532 <= df['sic']) & (df['sic'] <= 3532)) | ((3533 <= df['sic']) & (df['sic'] <= 3533)) | - ((3534 <= df['sic']) & (df['sic'] <= 3534)) | ((3535 <= df['sic']) & (df['sic'] <= 3535)) | - ((3536 <= df['sic']) & (df['sic'] <= 3536)) | ((3538 <= df['sic']) & (df['sic'] <= 3538)) | - ((3540 <= df['sic']) & (df['sic'] <= 3549)) | ((3550 <= df['sic']) & (df['sic'] <= 3559)) | - ((3560 <= df['sic']) & (df['sic'] <= 3569)) | ((3580 <= df['sic']) & (df['sic'] <= 3580)) | - ((3581 <= df['sic']) & (df['sic'] <= 3581)) | ((3582 <= df['sic']) & (df['sic'] <= 3582)) | - ((3585 <= df['sic']) & (df['sic'] <= 3585)) | ((3586 <= df['sic']) & (df['sic'] <= 3586)) | - ((3589 <= df['sic']) & (df['sic'] <= 3589)) | ((3590 <= df['sic']) & (df['sic'] <= 3599)), - ((3600 <= df['sic']) & (df['sic'] <= 3600)) | ((3610 <= df['sic']) & (df['sic'] <= 3613)) | - ((3620 <= df['sic']) & (df['sic'] <= 3621)) | ((3623 <= df['sic']) & (df['sic'] <= 
3629)) | - ((3640 <= df['sic']) & (df['sic'] <= 3644)) | ((3645 <= df['sic']) & (df['sic'] <= 3645)) | - ((3646 <= df['sic']) & (df['sic'] <= 3646)) | ((3648 <= df['sic']) & (df['sic'] <= 3649)) | - ((3660 <= df['sic']) & (df['sic'] <= 3660)) | ((3690 <= df['sic']) & (df['sic'] <= 3690)) | - ((3691 <= df['sic']) & (df['sic'] <= 3692)) | ((3699 <= df['sic']) & (df['sic'] <= 3699)), - ((2296 <= df['sic']) & (df['sic'] <= 2296)) | ((2396 <= df['sic']) & (df['sic'] <= 2396)) | - ((3010 <= df['sic']) & (df['sic'] <= 3011)) | ((3537 <= df['sic']) & (df['sic'] <= 3537)) | - ((3647 <= df['sic']) & (df['sic'] <= 3647)) | ((3694 <= df['sic']) & (df['sic'] <= 3694)) | - ((3700 <= df['sic']) & (df['sic'] <= 3700)) | ((3710 <= df['sic']) & (df['sic'] <= 3710)) | - ((3711 <= df['sic']) & (df['sic'] <= 3711)) | ((3713 <= df['sic']) & (df['sic'] <= 3713)) | - ((3714 <= df['sic']) & (df['sic'] <= 3714)) | ((3715 <= df['sic']) & (df['sic'] <= 3715)) | - ((3716 <= df['sic']) & (df['sic'] <= 3716)) | ((3792 <= df['sic']) & (df['sic'] <= 3792)) | - ((3790 <= df['sic']) & (df['sic'] <= 3791)) | ((3799 <= df['sic']) & (df['sic'] <= 3799)), - ((3720 <= df['sic']) & (df['sic'] <= 3720)) | ((3721 <= df['sic']) & (df['sic'] <= 3721)) | - ((3723 <= df['sic']) & (df['sic'] <= 3724)) | ((3725 <= df['sic']) & (df['sic'] <= 3725)) | - ((3728 <= df['sic']) & (df['sic'] <= 3729)), - ((3730 <= df['sic']) & (df['sic'] <= 3731)) | ((3740 <= df['sic']) & (df['sic'] <= 3743)), - ((3760 <= df['sic']) & (df['sic'] <= 3769)) | ((3795 <= df['sic']) & (df['sic'] <= 3795)) | - ((3480 <= df['sic']) & (df['sic'] <= 3489)), - ((1040 <= df['sic']) & (df['sic'] <= 1049)), - ((1000 <= df['sic']) & (df['sic'] <= 1009)) | ((1010 <= df['sic']) & (df['sic'] <= 1019)) | - ((1020 <= df['sic']) & (df['sic'] <= 1029)) | ((1030 <= df['sic']) & (df['sic'] <= 1039)) | - ((1050 <= df['sic']) & (df['sic'] <= 1059)) | ((1060 <= df['sic']) & (df['sic'] <= 1069)) | - ((1070 <= df['sic']) & (df['sic'] <= 1079)) | ((1080 <= 
df['sic']) & (df['sic'] <= 1089)) | - ((1090 <= df['sic']) & (df['sic'] <= 1099)) | ((1100 <= df['sic']) & (df['sic'] <= 1119)) | - ((1400 <= df['sic']) & (df['sic'] <= 1499)), - ((1200 <= df['sic']) & (df['sic'] <= 1299)), - ((1300 <= df['sic']) & (df['sic'] <= 1300)) | ((1310 <= df['sic']) & (df['sic'] <= 1319)) | - ((1320 <= df['sic']) & (df['sic'] <= 1329)) | ((1330 <= df['sic']) & (df['sic'] <= 1339)) | - ((1370 <= df['sic']) & (df['sic'] <= 1379)) | ((1380 <= df['sic']) & (df['sic'] <= 1380)) | - ((1381 <= df['sic']) & (df['sic'] <= 1381)) | ((1382 <= df['sic']) & (df['sic'] <= 1382)) | - ((1389 <= df['sic']) & (df['sic'] <= 1389)) | ((2900 <= df['sic']) & (df['sic'] <= 2912)) | - ((2990 <= df['sic']) & (df['sic'] <= 2999)), - ((4900 <= df['sic']) & (df['sic'] <= 4900)) | ((4910 <= df['sic']) & (df['sic'] <= 4911)) | - ((4920 <= df['sic']) & (df['sic'] <= 4922)) | ((4923 <= df['sic']) & (df['sic'] <= 4923)) | - ((4924 <= df['sic']) & (df['sic'] <= 4925)) | ((4930 <= df['sic']) & (df['sic'] <= 4931)) | - ((4932 <= df['sic']) & (df['sic'] <= 4932)) | ((4939 <= df['sic']) & (df['sic'] <= 4939)) | - ((4940 <= df['sic']) & (df['sic'] <= 4942)), - ((4800 <= df['sic']) & (df['sic'] <= 4800)) | ((4810 <= df['sic']) & (df['sic'] <= 4813)) | - ((4820 <= df['sic']) & (df['sic'] <= 4822)) | ((4830 <= df['sic']) & (df['sic'] <= 4839)) | - ((4840 <= df['sic']) & (df['sic'] <= 4841)) | ((4880 <= df['sic']) & (df['sic'] <= 4889)) | - ((4890 <= df['sic']) & (df['sic'] <= 4890)) | ((4891 <= df['sic']) & (df['sic'] <= 4891)) | - ((4892 <= df['sic']) & (df['sic'] <= 4892)) | ((4899 <= df['sic']) & (df['sic'] <= 4899)), - ((7020 <= df['sic']) & (df['sic'] <= 7021)) | ((7030 <= df['sic']) & (df['sic'] <= 7033)) | - ((7200 <= df['sic']) & (df['sic'] <= 7200)) | ((7210 <= df['sic']) & (df['sic'] <= 7212)) | - ((7214 <= df['sic']) & (df['sic'] <= 7214)) | ((7215 <= df['sic']) & (df['sic'] <= 7216)) | - ((7217 <= df['sic']) & (df['sic'] <= 7217)) | ((7219 <= df['sic']) & (df['sic'] <= 
7219)) | - ((7220 <= df['sic']) & (df['sic'] <= 7221)) | ((7230 <= df['sic']) & (df['sic'] <= 7231)) | - ((7240 <= df['sic']) & (df['sic'] <= 7241)) | ((7250 <= df['sic']) & (df['sic'] <= 7251)) | - ((7260 <= df['sic']) & (df['sic'] <= 7269)) | ((7270 <= df['sic']) & (df['sic'] <= 7290)) | - ((7291 <= df['sic']) & (df['sic'] <= 7291)) | ((7292 <= df['sic']) & (df['sic'] <= 7299)) | - ((7395 <= df['sic']) & (df['sic'] <= 7395)) | ((7500 <= df['sic']) & (df['sic'] <= 7500)) | - ((7520 <= df['sic']) & (df['sic'] <= 7529)) | ((7530 <= df['sic']) & (df['sic'] <= 7539)) | - ((7540 <= df['sic']) & (df['sic'] <= 7549)) | ((7600 <= df['sic']) & (df['sic'] <= 7600)) | - ((7620 <= df['sic']) & (df['sic'] <= 7620)) | ((7622 <= df['sic']) & (df['sic'] <= 7622)) | - ((7623 <= df['sic']) & (df['sic'] <= 7623)) | ((7629 <= df['sic']) & (df['sic'] <= 7629)) | - ((7630 <= df['sic']) & (df['sic'] <= 7631)) | ((7640 <= df['sic']) & (df['sic'] <= 7641)) | - ((7690 <= df['sic']) & (df['sic'] <= 7699)) | ((8100 <= df['sic']) & (df['sic'] <= 8199)) | - ((8200 <= df['sic']) & (df['sic'] <= 8299)) | ((8300 <= df['sic']) & (df['sic'] <= 8399)) | - ((8400 <= df['sic']) & (df['sic'] <= 8499)) | ((8600 <= df['sic']) & (df['sic'] <= 8699)) | - ((8800 <= df['sic']) & (df['sic'] <= 8899)) | ((7510 <= df['sic']) & (df['sic'] <= 7515)), - ((2750 <= df['sic']) & (df['sic'] <= 2759)) | ((3993 <= df['sic']) & (df['sic'] <= 3993)) | - ((7218 <= df['sic']) & (df['sic'] <= 7218)) | ((7300 <= df['sic']) & (df['sic'] <= 7300)) | - ((7310 <= df['sic']) & (df['sic'] <= 7319)) | ((7320 <= df['sic']) & (df['sic'] <= 7329)) | - ((7330 <= df['sic']) & (df['sic'] <= 7339)) | ((7340 <= df['sic']) & (df['sic'] <= 7342)) | - ((7349 <= df['sic']) & (df['sic'] <= 7349)) | ((7350 <= df['sic']) & (df['sic'] <= 7351)) | - ((7352 <= df['sic']) & (df['sic'] <= 7352)) | ((7353 <= df['sic']) & (df['sic'] <= 7353)) | - ((7359 <= df['sic']) & (df['sic'] <= 7359)) | ((7360 <= df['sic']) & (df['sic'] <= 7369)) | - ((7374 <= 
df['sic']) & (df['sic'] <= 7374)) | ((7376 <= df['sic']) & (df['sic'] <= 7376)) | - ((7377 <= df['sic']) & (df['sic'] <= 7377)) | ((7378 <= df['sic']) & (df['sic'] <= 7378)) | - ((7379 <= df['sic']) & (df['sic'] <= 7379)) | ((7380 <= df['sic']) & (df['sic'] <= 7380)) | - ((7381 <= df['sic']) & (df['sic'] <= 7382)) | ((7383 <= df['sic']) & (df['sic'] <= 7383)) | - ((7384 <= df['sic']) & (df['sic'] <= 7384)) | ((7385 <= df['sic']) & (df['sic'] <= 7385)) | - ((7389 <= df['sic']) & (df['sic'] <= 7390)) | ((7391 <= df['sic']) & (df['sic'] <= 7391)) | - ((7392 <= df['sic']) & (df['sic'] <= 7392)) | ((7393 <= df['sic']) & (df['sic'] <= 7393)) | - ((7394 <= df['sic']) & (df['sic'] <= 7394)) | ((7396 <= df['sic']) & (df['sic'] <= 7396)) | - ((7397 <= df['sic']) & (df['sic'] <= 7397)) | ((7399 <= df['sic']) & (df['sic'] <= 7399)) | - ((7519 <= df['sic']) & (df['sic'] <= 7519)) | ((8700 <= df['sic']) & (df['sic'] <= 8700)) | - ((8710 <= df['sic']) & (df['sic'] <= 8713)) | ((8720 <= df['sic']) & (df['sic'] <= 8721)) | - ((8730 <= df['sic']) & (df['sic'] <= 8734)) | ((8740 <= df['sic']) & (df['sic'] <= 8748)) | - ((8900 <= df['sic']) & (df['sic'] <= 8910)) | ((8911 <= df['sic']) & (df['sic'] <= 8911)) | - ((8920 <= df['sic']) & (df['sic'] <= 8999)) | ((4220 <= df['sic']) & (df['sic'] <= 4229)), - ((3570 <= df['sic']) & (df['sic'] <= 3579)) | ((3680 <= df['sic']) & (df['sic'] <= 3680)) | - ((3681 <= df['sic']) & (df['sic'] <= 3681)) | ((3682 <= df['sic']) & (df['sic'] <= 3682)) | - ((3683 <= df['sic']) & (df['sic'] <= 3683)) | ((3684 <= df['sic']) & (df['sic'] <= 3684)) | - ((3685 <= df['sic']) & (df['sic'] <= 3685)) | ((3686 <= df['sic']) & (df['sic'] <= 3686)) | - ((3687 <= df['sic']) & (df['sic'] <= 3687)) | ((3688 <= df['sic']) & (df['sic'] <= 3688)) | - ((3689 <= df['sic']) & (df['sic'] <= 3689)) | ((3695 <= df['sic']) & (df['sic'] <= 3695)), - ((7370 <= df['sic']) & (df['sic'] <= 7372)) | ((7375 <= df['sic']) & (df['sic'] <= 7375)) | - ((7373 <= df['sic']) & (df['sic'] <= 
7373)), - ((3622 <= df['sic']) & (df['sic'] <= 3622)) | ((3661 <= df['sic']) & (df['sic'] <= 3661)) | - ((3662 <= df['sic']) & (df['sic'] <= 3662)) | ((3663 <= df['sic']) & (df['sic'] <= 3663)) | - ((3664 <= df['sic']) & (df['sic'] <= 3664)) | ((3665 <= df['sic']) & (df['sic'] <= 3665)) | - ((3666 <= df['sic']) & (df['sic'] <= 3666)) | ((3669 <= df['sic']) & (df['sic'] <= 3669)) | - ((3670 <= df['sic']) & (df['sic'] <= 3679)) | ((3810 <= df['sic']) & (df['sic'] <= 3810)) | - ((3812 <= df['sic']) & (df['sic'] <= 3812)), - ((3811 <= df['sic']) & (df['sic'] <= 3811)) | ((3820 <= df['sic']) & (df['sic'] <= 3820)) | - ((3821 <= df['sic']) & (df['sic'] <= 3821)) | ((3822 <= df['sic']) & (df['sic'] <= 3822)) | - ((3823 <= df['sic']) & (df['sic'] <= 3823)) | ((3824 <= df['sic']) & (df['sic'] <= 3824)) | - ((3825 <= df['sic']) & (df['sic'] <= 3825)) | ((3826 <= df['sic']) & (df['sic'] <= 3826)) | - ((3827 <= df['sic']) & (df['sic'] <= 3827)) | ((3829 <= df['sic']) & (df['sic'] <= 3829)) | - ((3830 <= df['sic']) & (df['sic'] <= 3839)), - ((2520 <= df['sic']) & (df['sic'] <= 2549)) | ((2600 <= df['sic']) & (df['sic'] <= 2639)) | - ((2670 <= df['sic']) & (df['sic'] <= 2699)) | ((2760 <= df['sic']) & (df['sic'] <= 2761)) | - ((3950 <= df['sic']) & (df['sic'] <= 3955)), - ((2440 <= df['sic']) & (df['sic'] <= 2449)) | ((2640 <= df['sic']) & (df['sic'] <= 2659)) | - ((3220 <= df['sic']) & (df['sic'] <= 3221)) | ((3410 <= df['sic']) & (df['sic'] <= 3412)), - ((4000 <= df['sic']) & (df['sic'] <= 4013)) | ((4040 <= df['sic']) & (df['sic'] <= 4049)) | - ((4100 <= df['sic']) & (df['sic'] <= 4100)) | ((4110 <= df['sic']) & (df['sic'] <= 4119)) | - ((4120 <= df['sic']) & (df['sic'] <= 4121)) | ((4130 <= df['sic']) & (df['sic'] <= 4131)) | - ((4140 <= df['sic']) & (df['sic'] <= 4142)) | ((4150 <= df['sic']) & (df['sic'] <= 4151)) | - ((4170 <= df['sic']) & (df['sic'] <= 4173)) | ((4190 <= df['sic']) & (df['sic'] <= 4199)) | - ((4200 <= df['sic']) & (df['sic'] <= 4200)) | ((4210 <= 
df['sic']) & (df['sic'] <= 4219)) | - ((4230 <= df['sic']) & (df['sic'] <= 4231)) | ((4240 <= df['sic']) & (df['sic'] <= 4249)) | - ((4400 <= df['sic']) & (df['sic'] <= 4499)) | ((4500 <= df['sic']) & (df['sic'] <= 4599)) | - ((4600 <= df['sic']) & (df['sic'] <= 4699)) | ((4700 <= df['sic']) & (df['sic'] <= 4700)) | - ((4710 <= df['sic']) & (df['sic'] <= 4712)) | ((4720 <= df['sic']) & (df['sic'] <= 4729)) | - ((4730 <= df['sic']) & (df['sic'] <= 4739)) | ((4740 <= df['sic']) & (df['sic'] <= 4749)) | - ((4780 <= df['sic']) & (df['sic'] <= 4780)) | ((4782 <= df['sic']) & (df['sic'] <= 4782)) | - ((4783 <= df['sic']) & (df['sic'] <= 4783)) | ((4784 <= df['sic']) & (df['sic'] <= 4784)) | - ((4785 <= df['sic']) & (df['sic'] <= 4785)) | ((4789 <= df['sic']) & (df['sic'] <= 4789)), - ((5000 <= df['sic']) & (df['sic'] <= 5000)) | ((5010 <= df['sic']) & (df['sic'] <= 5015)) | - ((5020 <= df['sic']) & (df['sic'] <= 5023)) | ((5030 <= df['sic']) & (df['sic'] <= 5039)) | - ((5040 <= df['sic']) & (df['sic'] <= 5042)) | ((5043 <= df['sic']) & (df['sic'] <= 5043)) | - ((5044 <= df['sic']) & (df['sic'] <= 5044)) | ((5045 <= df['sic']) & (df['sic'] <= 5045)) | - ((5046 <= df['sic']) & (df['sic'] <= 5046)) | ((5047 <= df['sic']) & (df['sic'] <= 5047)) | - ((5048 <= df['sic']) & (df['sic'] <= 5048)) | ((5049 <= df['sic']) & (df['sic'] <= 5049)) | - ((5050 <= df['sic']) & (df['sic'] <= 5059)) | ((5060 <= df['sic']) & (df['sic'] <= 5060)) | - ((5063 <= df['sic']) & (df['sic'] <= 5063)) | ((5064 <= df['sic']) & (df['sic'] <= 5064)) | - ((5065 <= df['sic']) & (df['sic'] <= 5065)) | ((5070 <= df['sic']) & (df['sic'] <= 5078)) | - ((5080 <= df['sic']) & (df['sic'] <= 5080)) | ((5081 <= df['sic']) & (df['sic'] <= 5081)) | - ((5082 <= df['sic']) & (df['sic'] <= 5082)) | ((5083 <= df['sic']) & (df['sic'] <= 5083)) | - ((5084 <= df['sic']) & (df['sic'] <= 5084)) | ((5085 <= df['sic']) & (df['sic'] <= 5085)) | - ((5086 <= df['sic']) & (df['sic'] <= 5087)) | ((5088 <= df['sic']) & (df['sic'] <= 
5088)) | - ((5090 <= df['sic']) & (df['sic'] <= 5090)) | ((5091 <= df['sic']) & (df['sic'] <= 5092)) | - ((5093 <= df['sic']) & (df['sic'] <= 5093)) | ((5094 <= df['sic']) & (df['sic'] <= 5094)) | - ((5099 <= df['sic']) & (df['sic'] <= 5099)) | ((5100 <= df['sic']) & (df['sic'] <= 5100)) | - ((5110 <= df['sic']) & (df['sic'] <= 5113)) | ((5120 <= df['sic']) & (df['sic'] <= 5122)) | - ((5130 <= df['sic']) & (df['sic'] <= 5139)) | ((5140 <= df['sic']) & (df['sic'] <= 5149)) | - ((5150 <= df['sic']) & (df['sic'] <= 5159)) | ((5160 <= df['sic']) & (df['sic'] <= 5169)) | - ((5170 <= df['sic']) & (df['sic'] <= 5172)) | ((5180 <= df['sic']) & (df['sic'] <= 5182)) | - ((5190 <= df['sic']) & (df['sic'] <= 5199)), - ((5200 <= df['sic']) & (df['sic'] <= 5200)) | ((5210 <= df['sic']) & (df['sic'] <= 5219)) | - ((5220 <= df['sic']) & (df['sic'] <= 5229)) | ((5230 <= df['sic']) & (df['sic'] <= 5231)) | - ((5250 <= df['sic']) & (df['sic'] <= 5251)) | ((5260 <= df['sic']) & (df['sic'] <= 5261)) | - ((5270 <= df['sic']) & (df['sic'] <= 5271)) | ((5300 <= df['sic']) & (df['sic'] <= 5300)) | - ((5310 <= df['sic']) & (df['sic'] <= 5311)) | ((5320 <= df['sic']) & (df['sic'] <= 5320)) | - ((5330 <= df['sic']) & (df['sic'] <= 5331)) | ((5334 <= df['sic']) & (df['sic'] <= 5334)) | - ((5340 <= df['sic']) & (df['sic'] <= 5349)) | ((5390 <= df['sic']) & (df['sic'] <= 5399)) | - ((5400 <= df['sic']) & (df['sic'] <= 5400)) | ((5410 <= df['sic']) & (df['sic'] <= 5411)) | - ((5412 <= df['sic']) & (df['sic'] <= 5412)) | ((5420 <= df['sic']) & (df['sic'] <= 5429)) | - ((5430 <= df['sic']) & (df['sic'] <= 5439)) | ((5440 <= df['sic']) & (df['sic'] <= 5449)) | - ((5450 <= df['sic']) & (df['sic'] <= 5459)) | ((5460 <= df['sic']) & (df['sic'] <= 5469)) | - ((5490 <= df['sic']) & (df['sic'] <= 5499)) | ((5500 <= df['sic']) & (df['sic'] <= 5500)) | - ((5510 <= df['sic']) & (df['sic'] <= 5529)) | ((5530 <= df['sic']) & (df['sic'] <= 5539)) | - ((5540 <= df['sic']) & (df['sic'] <= 5549)) | ((5550 <= 
df['sic']) & (df['sic'] <= 5559)) | - ((5560 <= df['sic']) & (df['sic'] <= 5569)) | ((5570 <= df['sic']) & (df['sic'] <= 5579)) | - ((5590 <= df['sic']) & (df['sic'] <= 5599)) | ((5600 <= df['sic']) & (df['sic'] <= 5699)) | - ((5700 <= df['sic']) & (df['sic'] <= 5700)) | ((5710 <= df['sic']) & (df['sic'] <= 5719)) | - ((5720 <= df['sic']) & (df['sic'] <= 5722)) | ((5730 <= df['sic']) & (df['sic'] <= 5733)) | - ((5734 <= df['sic']) & (df['sic'] <= 5734)) | ((5735 <= df['sic']) & (df['sic'] <= 5735)) | - ((5736 <= df['sic']) & (df['sic'] <= 5736)) | ((5750 <= df['sic']) & (df['sic'] <= 5799)) | - ((5900 <= df['sic']) & (df['sic'] <= 5900)) | ((5910 <= df['sic']) & (df['sic'] <= 5912)) | - ((5920 <= df['sic']) & (df['sic'] <= 5929)) | ((5930 <= df['sic']) & (df['sic'] <= 5932)) | - ((5940 <= df['sic']) & (df['sic'] <= 5940)) | ((5941 <= df['sic']) & (df['sic'] <= 5941)) | - ((5942 <= df['sic']) & (df['sic'] <= 5942)) | ((5943 <= df['sic']) & (df['sic'] <= 5943)) | - ((5944 <= df['sic']) & (df['sic'] <= 5944)) | ((5945 <= df['sic']) & (df['sic'] <= 5945)) | - ((5946 <= df['sic']) & (df['sic'] <= 5946)) | ((5947 <= df['sic']) & (df['sic'] <= 5947)) | - ((5948 <= df['sic']) & (df['sic'] <= 5948)) | ((5949 <= df['sic']) & (df['sic'] <= 5949)) | - ((5950 <= df['sic']) & (df['sic'] <= 5959)) | ((5960 <= df['sic']) & (df['sic'] <= 5969)) | - ((5970 <= df['sic']) & (df['sic'] <= 5979)) | ((5980 <= df['sic']) & (df['sic'] <= 5989)) | - ((5990 <= df['sic']) & (df['sic'] <= 5990)) | ((5992 <= df['sic']) & (df['sic'] <= 5992)) | - ((5993 <= df['sic']) & (df['sic'] <= 5993)) | ((5994 <= df['sic']) & (df['sic'] <= 5994)) | - ((5995 <= df['sic']) & (df['sic'] <= 5995)) | ((5999 <= df['sic']) & (df['sic'] <= 5999)), - ((5800 <= df['sic']) & (df['sic'] <= 5819)) | ((5820 <= df['sic']) & (df['sic'] <= 5829)) | - ((5890 <= df['sic']) & (df['sic'] <= 5899)) | ((7000 <= df['sic']) & (df['sic'] <= 7000)) | - ((7010 <= df['sic']) & (df['sic'] <= 7019)) | ((7040 <= df['sic']) & (df['sic'] <= 
7049)) | - ((7213 <= df['sic']) & (df['sic'] <= 7213)), - ((6000 <= df['sic']) & (df['sic'] <= 6000)) | ((6010 <= df['sic']) & (df['sic'] <= 6019)) | - ((6020 <= df['sic']) & (df['sic'] <= 6020)) | ((6021 <= df['sic']) & (df['sic'] <= 6021)) | - ((6022 <= df['sic']) & (df['sic'] <= 6022)) | ((6023 <= df['sic']) & (df['sic'] <= 6024)) | - ((6025 <= df['sic']) & (df['sic'] <= 6025)) | ((6026 <= df['sic']) & (df['sic'] <= 6026)) | - ((6027 <= df['sic']) & (df['sic'] <= 6027)) | ((6028 <= df['sic']) & (df['sic'] <= 6029)) | - ((6030 <= df['sic']) & (df['sic'] <= 6036)) | ((6040 <= df['sic']) & (df['sic'] <= 6059)) | - ((6060 <= df['sic']) & (df['sic'] <= 6062)) | ((6080 <= df['sic']) & (df['sic'] <= 6082)) | - ((6090 <= df['sic']) & (df['sic'] <= 6099)) | ((6100 <= df['sic']) & (df['sic'] <= 6100)) | - ((6110 <= df['sic']) & (df['sic'] <= 6111)) | ((6112 <= df['sic']) & (df['sic'] <= 6113)) | - ((6120 <= df['sic']) & (df['sic'] <= 6129)) | ((6130 <= df['sic']) & (df['sic'] <= 6139)) | - ((6140 <= df['sic']) & (df['sic'] <= 6149)) | ((6150 <= df['sic']) & (df['sic'] <= 6159)) | - ((6160 <= df['sic']) & (df['sic'] <= 6169)) | ((6170 <= df['sic']) & (df['sic'] <= 6179)) | - ((6190 <= df['sic']) & (df['sic'] <= 6199)), - ((6300 <= df['sic']) & (df['sic'] <= 6300)) | ((6310 <= df['sic']) & (df['sic'] <= 6319)) | - ((6320 <= df['sic']) & (df['sic'] <= 6329)) | ((6330 <= df['sic']) & (df['sic'] <= 6331)) | - ((6350 <= df['sic']) & (df['sic'] <= 6351)) | ((6360 <= df['sic']) & (df['sic'] <= 6361)) | - ((6370 <= df['sic']) & (df['sic'] <= 6379)) | ((6390 <= df['sic']) & (df['sic'] <= 6399)) | - ((6400 <= df['sic']) & (df['sic'] <= 6411)), - ((6500 <= df['sic']) & (df['sic'] <= 6500)) | ((6510 <= df['sic']) & (df['sic'] <= 6510)) | - ((6512 <= df['sic']) & (df['sic'] <= 6512)) | ((6513 <= df['sic']) & (df['sic'] <= 6513)) | - ((6514 <= df['sic']) & (df['sic'] <= 6514)) | ((6515 <= df['sic']) & (df['sic'] <= 6515)) | - ((6517 <= df['sic']) & (df['sic'] <= 6519)) | ((6520 <= 
df['sic']) & (df['sic'] <= 6529)) | - ((6530 <= df['sic']) & (df['sic'] <= 6531)) | ((6532 <= df['sic']) & (df['sic'] <= 6532)) | - ((6540 <= df['sic']) & (df['sic'] <= 6541)) | ((6550 <= df['sic']) & (df['sic'] <= 6553)) | - ((6590 <= df['sic']) & (df['sic'] <= 6599)) | ((6610 <= df['sic']) & (df['sic'] <= 6611)), - ((6200 <= df['sic']) & (df['sic'] <= 6299)) | ((6700 <= df['sic']) & (df['sic'] <= 6700)) | - ((6710 <= df['sic']) & (df['sic'] <= 6719)) | ((6720 <= df['sic']) & (df['sic'] <= 6722)) | - ((6723 <= df['sic']) & (df['sic'] <= 6723)) | ((6724 <= df['sic']) & (df['sic'] <= 6724)) | - ((6725 <= df['sic']) & (df['sic'] <= 6725)) | ((6726 <= df['sic']) & (df['sic'] <= 6726)) | - ((6730 <= df['sic']) & (df['sic'] <= 6733)) | ((6740 <= df['sic']) & (df['sic'] <= 6779)) | - ((6790 <= df['sic']) & (df['sic'] <= 6791)) | ((6792 <= df['sic']) & (df['sic'] <= 6792)) | - ((6793 <= df['sic']) & (df['sic'] <= 6793)) | ((6794 <= df['sic']) & (df['sic'] <= 6794)) | - ((6795 <= df['sic']) & (df['sic'] <= 6795)) | ((6798 <= df['sic']) & (df['sic'] <= 6798)) | - ((6799 <= df['sic']) & (df['sic'] <= 6799)), - ((4950 <= df['sic']) & (df['sic'] <= 4959)) | ((4960 <= df['sic']) & (df['sic'] <= 4961)) | - ((4970 <= df['sic']) & (df['sic'] <= 4971)) | ((4990 <= df['sic']) & (df['sic'] <= 4991))] - choicelist = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, - 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49] - return np.select(condlist, choicelist, default=np.nan) - - -def fillna_atq(df_q, df_a): - # fina columns are na in df_q and exist in df_a - df_q_na_list = df_q.columns[df_q.isna().any()].tolist() - df_a_columns_list = df_a.columns.values.tolist() - list_temp = list(set(df_q_na_list) & set(df_a_columns_list)) - # remove mom columns, mom chars are same in annual and quarterly - na_columns_list = [] - for i in list_temp: - if re.match(r'mom.', i) is None: - na_columns_list.append(i) - # 
get annual columns from df_a - df_temp = df_a[na_columns_list].copy() - df_temp[['permno', 'jdate']] = df_a[['permno', 'jdate']].copy() - # rename annual columns in the form of 'chars_a' - for na_column in na_columns_list: - df_temp = df_temp.rename(columns={'%s' % na_column: '%s_a' % na_column}) - df_temp = df_temp.reset_index(drop=True) - # use annual chars to fill quarterly na - df_q = pd.merge(df_q, df_temp, how='left', on=['permno', 'jdate']) - for na_column in na_columns_list: - df_q['%s' % na_column] = np.where(df_q['%s' % na_column].isnull(), df_q['%s_a' % na_column], df_q['%s' % na_column]) - df_q = df_q.drop(['%s_a' % na_column], axis=1) - return df_q - - -def fillna_ind(df, method, ffi): - df_fill = pd.DataFrame() - na_columns_list = df.columns[df.isna().any()].tolist() - for na_column in na_columns_list: - if method == 'mean': - df_temp = df.groupby(['jdate', 'ffi%s' % ffi])['%s' % na_column].mean() - elif method == 'median': - df_temp = df.groupby(['jdate', 'ffi%s' % ffi])['%s' % na_column].median() - else: - None - df_fill = pd.concat([df_fill, df_temp], axis=1) - if method == 'mean': - df_fill = df_fill.rename(columns={'%s' % na_column: '%s_mean' % na_column}) - elif method == 'median': - df_fill = df_fill.rename(columns={'%s' % na_column: '%s_median' % na_column}) - else: - None - df_fill = df_fill.reset_index() - # reset multiple index to jdate and ffi code - df_fill['index'] = df_fill['index'].astype(str) - index_temp = df_fill['index'].str.split(',', expand=True) - index_temp.columns = ['jdate', 'ffi%s' % ffi] - index_temp['jdate'] = index_temp['jdate'].str.strip('(Timestamp(\' \')') - index_temp['ffi%s' % ffi] = index_temp['ffi%s' % ffi].str.strip(')') - df_fill[['jdate', 'ffi%s' % ffi]] = index_temp[['jdate', 'ffi%s' % ffi]] - df_fill = df_fill.drop(['index'], axis=1) - df_fill['jdate'] = pd.to_datetime(df_fill['jdate']) - df_fill['ffi49'] = df_fill['ffi49'].astype(int) - # fill na - df = pd.merge(df, df_fill, how='left', on=['jdate', 'ffi%s' % 
ffi]) - for na_column in na_columns_list: - if method == 'mean': - df['%s' % na_column] = df['%s' % na_column].fillna(df['%s_mean' % na_column]) - df = df.drop(['%s_mean' % na_column], axis=1) - elif method == 'median': - df['%s' % na_column] = df['%s' % na_column].fillna(df['%s_median' % na_column]) - df = df.drop(['%s_median' % na_column], axis=1) - else: - None - return df - - -def fillna_all(df, method): - df_fill = pd.DataFrame() - na_columns_list = df.columns[df.isna().any()].tolist() - for na_column in na_columns_list: - if method == 'mean': - df_temp = df.groupby(['jdate'])['%s' % na_column].mean() - elif method == 'median': - df_temp = df.groupby(['jdate'])['%s' % na_column].median() - else: - None - df_fill = pd.concat([df_fill, df_temp], axis=1) - if method == 'mean': - df_fill = df_fill.rename(columns={'%s' % na_column: '%s_mean' % na_column}) - elif method == 'median': - df_fill = df_fill.rename(columns={'%s' % na_column: '%s_median' % na_column}) - else: - None - df_fill = df_fill.reset_index() - # reset multiple index to jdate and ffi code - df_fill['index'] = df_fill['index'].astype(str) - index_temp = df_fill['index'].str.split(',', expand=True) - index_temp.columns = ['jdate'] - index_temp['jdate'] = index_temp['jdate'].str.strip('(Timestamp(\' \')') - df_fill[['jdate']] = index_temp[['jdate']] - df_fill = df_fill.drop(['index'], axis=1) - df_fill['jdate'] = pd.to_datetime(df_fill['jdate']) - # fill na - df = pd.merge(df, df_fill, how='left', on='jdate') - for na_column in na_columns_list: - if method == 'mean': - df['%s' % na_column] = df['%s' % na_column].fillna(df['%s_mean' % na_column]) - df = df.drop(['%s_mean' % na_column], axis=1) - elif method == 'median': - df['%s' % na_column] = df['%s' % na_column].fillna(df['%s_median' % na_column]) - df = df.drop(['%s_median' % na_column], axis=1) - else: - None - return df - - -def standardize(df): - df_temp = df.groupby(['jdate'], as_index=False)['gvkey'].count() - df_temp = 
df_temp.rename(columns={'gvkey': 'count'}) - df = pd.merge(df, df_temp, how='left', on='jdate') - col_names = df.columns.values.tolist() - list_to_remove = ['permno', 'date', 'jdate', 'datadate', 'gvkey', 'sic', 'count', 'exchcd', 'shrcd'] - col_names = list(set(col_names).difference(set(list_to_remove))) - df = df.fillna(0) - for col_name in col_names: - df['%s_rank' % col_name] = df.groupby(['jdate'])['%s' % col_name].rank() - df['rank_%s' % col_name] = (df['%s_rank' % col_name]-1)/(df['count']-1)*2 - 1 - df = df.drop(['%s_rank' % col_name, '%s' % col_name], axis=1) - return df \ No newline at end of file diff --git a/pychars/hxz_abr.py b/pychars/hxz_abr.py deleted file mode 100755 index ecb5219..0000000 --- a/pychars/hxz_abr.py +++ /dev/null @@ -1,236 +0,0 @@ -# Calculate HSZ Replicating Anomalies -# ABR: Cumulative abnormal stock returns around earnings announcements - -import pandas as pd -import numpy as np -import datetime as dt -import wrds -from dateutil.relativedelta import * -from pandas.tseries.offsets import * -import pickle as pkl -import sqlite3 - -################### -# Connect to WRDS # -################### -conn = wrds.Connection() - -################### -# Compustat Block # -################### -comp = conn.raw_sql(""" - select gvkey, datadate, rdq, fyearq, fqtr - from comp.fundq - where indfmt = 'INDL' - and datafmt = 'STD' - and popsrc = 'D' - and consol = 'C' - and datadate >= '01/01/1959' - """) - -comp['datadate'] = pd.to_datetime(comp['datadate']) - -print('='*10, 'comp data is ready', '='*10) -################### -# CCM Block # -################### -ccm = conn.raw_sql(""" - select gvkey, lpermno as permno, linktype, linkprim, - linkdt, linkenddt - from crsp.ccmxpf_linktable - where linktype in ('LU', 'LC') - """) - -ccm['linkdt'] = pd.to_datetime(ccm['linkdt']) -ccm['linkenddt'] = pd.to_datetime(ccm['linkenddt']) - -# if linkenddt is missing then set to today date -ccm['linkenddt'] = ccm['linkenddt'].fillna(pd.to_datetime('today')) - -ccm1 
= pd.merge(comp, ccm, how='left', on=['gvkey']) -# extract month and year of rdq -ccm1['rdq'] = pd.to_datetime(ccm1['rdq']) - -# set link date bounds -ccm2 = ccm1[(ccm1['datadate']>=ccm1['linkdt']) & (ccm1['datadate']<=ccm1['linkenddt'])] -ccm2 = ccm2[['gvkey', 'datadate', 'rdq', 'fyearq', 'fqtr', 'permno']] - -################### -# CRSP Block # -################### - -# Report Date of Quarterly Earnings (rdq) may not be trading day, we need to get the first trading day on or after rdq -crsp_dsi = conn.raw_sql(""" - select distinct date - from crsp.dsi - where date >= '01/01/1959' - """) - -crsp_dsi['date'] = pd.to_datetime(crsp_dsi['date']) - -for i in range(6): # we only consider the condition that the day after rdq is not a trading day, which is up to 5 days - ccm2['trad_%s' % i] = ccm2['rdq'] + pd.DateOffset(days=i) # set rdq + i days to match trading day - crsp_dsi['trad_%s' % i] = crsp_dsi['date'] # set the merging key - crsp_dsi = crsp_dsi[['date', 'trad_%s' % i]] # reset trading day columns to avoid repeat merge - comp_temp = pd.merge(ccm2, crsp_dsi, how='left', on='trad_%s' % i) - comp_temp['trad_%s' % i] = comp_temp['date'] # reset rdq + i days to matched trading day - -# fill NA from rdq + 5 days to rdq + 0 days, then get trading day version of rdq -for i in range(5, 0, -1): - count = i-1 - comp_temp['trad_%s' % count] = np.where(comp_temp['trad_%s' % count].isnull(), - comp_temp['trad_%s' % i], comp_temp['trad_%s' % count]) - comp_temp['rdq_trad'] = comp_temp['trad_%s' % count] - -comp_temp = comp_temp[['gvkey', 'permno', 'datadate', 'fyearq', 'fqtr', 'rdq', 'rdq_trad']] - -print('='*10, 'crsp block is ready', '='*10) -############################# -# CRSP abnormal return # -############################# -crsp_d = conn.raw_sql(""" - select a.prc, a.ret, a.shrout, a.vol, a.cfacpr, a.cfacshr, a.permno, a.permco, a.date, - b.siccd, b.ncusip, b.shrcd, b.exchcd - from crsp.dsf as a - left join crsp.dsenames as b - on a.permno=b.permno - and b.namedt<=a.date 
- and a.date<=b.nameendt - where a.date >= '01/01/1959' - and b.exchcd between 1 and 3 - and b.shrcd in (10,11) - """) - -# change variable format to int -crsp_d[['permco', 'permno', 'shrcd', 'exchcd']] = crsp_d[['permco', 'permno', 'shrcd', 'exchcd']].astype(int) - -print('='*10, 'crsp abnormal return is ready', '='*10) - -# convert the date format -crsp_d['date'] = pd.to_datetime(crsp_d['date']) - -# add delisting return -dlret = conn.raw_sql(""" - select permno, dlret, dlstdt - from crsp.dsedelist - where dlstdt >= '01/01/1959' - """) - -dlret.permno = dlret.permno.astype(int) -dlret['dlstdt'] = pd.to_datetime(dlret['dlstdt']) - -crsp_d = pd.merge(crsp_d, dlret, how='left', left_on=['permno', 'date'], right_on=['permno', 'dlstdt']) -# return adjusted for delisting -crsp_d['retadj'] = np.where(crsp_d['dlret'].notna(), (crsp_d['ret'] + 1)*(crsp_d['dlret'] + 1) - 1, crsp_d['ret']) -crsp_d['meq'] = crsp_d['prc'].abs()*crsp_d['shrout'] # market value of equity -crsp_d = crsp_d.sort_values(by=['date', 'permno', 'meq']) - -# sprtrn -crspsp500d = conn.raw_sql(""" - select date, sprtrn - from crsp.dsi - where date >= '01/01/1959' - """) - -crspsp500d['date'] = pd.to_datetime(crspsp500d['date']) - -# abnormal return -crsp_d = pd.merge(crsp_d, crspsp500d, how='left', on='date') -crsp_d['abrd'] = crsp_d['retadj'] - crsp_d['sprtrn'] -crsp_d = crsp_d[['date', 'permno', 'ret', 'retadj', 'sprtrn', 'abrd']] - -# date count regarding to rdq -comp_temp['minus10d'] = comp_temp['rdq_trad'] - pd.Timedelta(days=10) -comp_temp['plus5d'] = comp_temp['rdq_trad'] + pd.Timedelta(days=5) - -# df = sqldf("""select a.*, b.date, b.abrd -# from comp_temp a left join crsp_d b -# on a.permno=b.permno -# and a.minus10d<=b.date -# and b.date<=a.plus5d -# order by a.permno, a.rdq_trad, b.date;""", globals()) - -sql = sqlite3.connect(':memory:') -comp_temp.to_sql('comp_temp', sql, index=False) -crsp_d.to_sql('crsp_d', sql, index=False) - -qry = """select a.*, b.date, b.abrd - from comp_temp a left 
join crsp_d b - on a.permno=b.permno - and a.minus10d<=b.date - and b.date<=a.plus5d - order by a.permno, a.rdq_trad, b.date;""" -df = pd.read_sql_query(qry, sql) -df.drop(['plus5d', 'minus10d'], axis=1, inplace=True) - -# delete missing return -df = df[df['abrd'].notna()] - -# count -df.sort_values(by=['permno', 'rdq_trad', 'date'], inplace=True) -condlist = [df['date']==df['rdq_trad'], - df['date']>df['rdq_trad'], - df['date']=0] -df_after['count'] = df_after.groupby(['permno', 'rdq_trad'])['date'].cumcount() - -df = pd.concat([df_before, df_after]) - -# calculate abr as the group sum -df = df[(df['count']>=-2) & (df['count']<=1)] - -df_temp = df.groupby(['permno', 'rdq_trad'])['abrd'].sum() -df_temp = pd.DataFrame(df_temp) -df_temp.reset_index(inplace=True) -df_temp.rename(columns={'abrd': 'abr'}, inplace=True) -df = pd.merge(df, df_temp, how='left', on=['permno', 'rdq_trad'], copy=False) # add abr back to df -df = df[df['count']==1] -df.rename(columns={'date': 'rdq_plus_1d'}, inplace=True) -df = df[['gvkey', 'permno', 'datadate', 'rdq', 'rdq_plus_1d', 'abr']] - -print('='*10, 'start populate', '='*10) - -# populate the quarterly abr to monthly -crsp_msf = conn.raw_sql(""" - select distinct date - from crsp.msf - where date >= '01/01/1959' - """) - -df['datadate'] = pd.to_datetime(df['datadate']) -df['plus12m'] = df['datadate'] + np.timedelta64(12, 'M') -df['plus12m'] = df['plus12m'] + MonthEnd(0) - -# df = sqldf("""select a.*, b.date -# from df a left join crsp_msf b -# on a.rdq_plus_1d < b.date -# and a.plus12m >= b.date -# order by a.permno, b.date, a.datadate desc;""", globals()) - -df.to_sql('df', sql, index=False) -crsp_msf.to_sql('crsp_msf', sql, index=False) - -qry = """select a.*, b.date - from df a left join crsp_msf b - on a.rdq_plus_1d < b.date - and a.plus12m >= b.date - order by a.permno, b.date, a.datadate desc;""" - -df = pd.read_sql_query(qry, sql) - -df = df.drop_duplicates(['permno', 'date']) -df['datadate'] = pd.to_datetime(df['datadate']) 
-df['rdq'] = pd.to_datetime(df['rdq']) -df['rdq_plus_1d'] = pd.to_datetime(df['rdq_plus_1d']) -df = df[['gvkey', 'permno', 'datadate', 'rdq', 'rdq_plus_1d', 'abr', 'date']] - -with open('abr.pkl', 'wb') as f: - pkl.dump(df, f) \ No newline at end of file diff --git a/pychars/hxz_re.py b/pychars/hxz_re.py deleted file mode 100755 index 7dab02f..0000000 --- a/pychars/hxz_re.py +++ /dev/null @@ -1,120 +0,0 @@ -# Calculate HSZ Replicating Anomalies -# RE: Revisions in analysts’ earnings forecasts - -import pandas as pd -import numpy as np -import datetime as dt -import wrds -from dateutil.relativedelta import * -from pandas.tseries.offsets import * -from pandasql import * -import pickle as pkl - -################### -# Connect to WRDS # -################### -conn = wrds.Connection() - -######################################################################### -# Merging IBES and CRSP by using ICLINK table. Merging last month price # -######################################################################### - -with open('iclink.pkl', 'rb')as f: - iclink = pkl.load(f) - -ibes = conn.raw_sql(""" - select - ticker, statpers, meanest, fpedats, anndats_act, curr_act, fpi, medest - from ibes.statsum_epsus - where - /* filtering IBES */ - statpers=0 - and CURCODE='USD' - and fpi in ('1','2')""") - -# filtering IBES -ibes = ibes[(ibes['medest'].notna()) & (ibes['fpedats'].notna())] -ibes = ibes[(ibes['curr_act']=='USD') | (ibes['curr_act'].isnull())] -ibes['statpers'] = pd.to_datetime(ibes['statpers']) -ibes['merge_date'] = ibes['statpers']+MonthEnd(0) - -crsp_msf = conn.raw_sql(""" - select permno, date, prc, cfacpr - from crsp.msf - """) - -crsp_msf['date'] = pd.to_datetime(crsp_msf['date']) -crsp_msf['date'] = crsp_msf['date']+MonthEnd(0) -crsp_msf['merge_date'] = crsp_msf['date']+MonthEnd(1) - -ibes_iclink = pd.merge(ibes, iclink, how='left', on='ticker') -ibes_crsp = pd.merge(ibes_iclink, crsp_msf, how='inner', on=['permno', 'merge_date']) 
-ibes_crsp.sort_values(by=['ticker', 'fpedats', 'statpers'], inplace=True) -ibes_crsp.reset_index(inplace=True, drop=True) - -############################### -# Merging last month forecast # -############################### -ibes_crsp['statpers_last_month'] = np.where((ibes_crsp['ticker'] == ibes_crsp['ticker'].shift(1)) & - (ibes_crsp['permno'] == ibes_crsp['permno'].shift(1)) & - (ibes_crsp['fpedats'] == ibes_crsp['fpedats'].shift(1)), - ibes_crsp['statpers'].shift(1).astype(str), np.nan) - -ibes_crsp['meanest_last_month'] = np.where((ibes_crsp['ticker'] == ibes_crsp['ticker'].shift(1)) & - (ibes_crsp['permno'] == ibes_crsp['permno'].shift(1)) & - (ibes_crsp['fpedats'] == ibes_crsp['fpedats'].shift(1)), - ibes_crsp['meanest'].shift(1), np.nan) - -ibes_crsp.sort_values(by=['ticker', 'permno', 'fpedats', 'statpers'], inplace=True) -ibes_crsp.reset_index(inplace=True, drop=True) - -########################### -# Drop empty "last month" # -# Calculate HXZ RE # -########################### - -ibes_crsp = ibes_crsp[ibes_crsp['statpers_last_month'].notna()] -ibes_crsp['prc_adj'] = ibes_crsp['prc']/ibes_crsp['cfacpr'] -ibes_crsp = ibes_crsp[ibes_crsp['prc_adj']>0] -ibes_crsp['monthly_revision'] = (ibes_crsp['meanest'] - ibes_crsp['meanest_last_month'])/ibes_crsp['prc_adj'] - -ibes_crsp['permno'] = ibes_crsp['permno'].astype(int) -ibes_crsp['permno'] = ibes_crsp['permno'].astype(str) -ibes_crsp['fpedats'] = ibes_crsp['fpedats'].astype(str) -ibes_crsp['permno_fpedats'] = ibes_crsp['permno'].str.cat(ibes_crsp['fpedats'], sep='-') - -ibes_crsp = ibes_crsp.drop_duplicates(['permno_fpedats', 'statpers']) -ibes_crsp['count'] = ibes_crsp.groupby('permno_fpedats').cumcount() + 1 - -######################## -# Calculate RE (CJL) # -######################## - -ibes_crsp['monthly_revision_l1'] = ibes_crsp.groupby(['permno'])['monthly_revision'].shift(1) -ibes_crsp['monthly_revision_l2'] = ibes_crsp.groupby(['permno'])['monthly_revision'].shift(2) -ibes_crsp['monthly_revision_l3'] = 
ibes_crsp.groupby(['permno'])['monthly_revision'].shift(3) -ibes_crsp['monthly_revision_l4'] = ibes_crsp.groupby(['permno'])['monthly_revision'].shift(4) -ibes_crsp['monthly_revision_l5'] = ibes_crsp.groupby(['permno'])['monthly_revision'].shift(5) -ibes_crsp['monthly_revision_l6'] = ibes_crsp.groupby(['permno'])['monthly_revision'].shift(6) - -condlist = [ibes_crsp['count']==4, - ibes_crsp['count']==5, - ibes_crsp['count']==6, - ibes_crsp['count']>=7] -choicelist = [(ibes_crsp['monthly_revision_l1'] + ibes_crsp['monthly_revision_l2'] + ibes_crsp['monthly_revision_l3'])/3, - (ibes_crsp['monthly_revision_l1'] + ibes_crsp['monthly_revision_l2'] + ibes_crsp['monthly_revision_l3'] + ibes_crsp['monthly_revision_l4'])/4, - (ibes_crsp['monthly_revision_l1'] + ibes_crsp['monthly_revision_l2'] + ibes_crsp['monthly_revision_l3'] + ibes_crsp['monthly_revision_l4'] + ibes_crsp['monthly_revision_l5'])/5, - (ibes_crsp['monthly_revision_l1'] + ibes_crsp['monthly_revision_l2'] + ibes_crsp['monthly_revision_l3'] + ibes_crsp['monthly_revision_l4'] + ibes_crsp['monthly_revision_l5'] + ibes_crsp['monthly_revision_l6'])/6] -ibes_crsp['re'] = np.select(condlist, choicelist, default=np.nan) - -ibes_crsp = ibes_crsp[ibes_crsp['count']>=4] -ibes_crsp = ibes_crsp.sort_values(by=['ticker', 'statpers', 'fpedats']) -ibes_crsp = ibes_crsp.drop_duplicates(['ticker', 'statpers']) - -ibes_crsp = ibes_crsp[['ticker', 'statpers', 'fpedats', 'anndats_act', 'curr_act', 'permno', 're']] -ibes_crsp.rename(columns={'statpers': 'date'}, inplace=True) - -with open('re.pkl', 'wb') as f: - pkl.dump(ibes_crsp, f) \ No newline at end of file diff --git a/pychars/hxz_sue.py b/pychars/hxz_sue.py deleted file mode 100755 index 8238cdb..0000000 --- a/pychars/hxz_sue.py +++ /dev/null @@ -1,106 +0,0 @@ -# Calculate HSZ Replicating Anomalies -# SUE: Standardized Unexpected Earnings (Earnings surprise) - -import pandas as pd -import numpy as np -import datetime as dt -import wrds -from dateutil.relativedelta import * 
-from pandas.tseries.offsets import * -from pandasql import * -import pickle as pkl - -################### -# Connect to WRDS # -################### -conn = wrds.Connection() - -################### -# Compustat Block # -################### -comp = conn.raw_sql(""" - select gvkey, datadate, fyearq, fqtr, epspxq, ajexq - from comp.fundq - where indfmt = 'INDL' - and datafmt = 'STD' - and popsrc = 'D' - and consol = 'C' - and datadate >= '01/01/1959' - """) - -comp['datadate'] = pd.to_datetime(comp['datadate']) - -################### -# CCM Block # -################### -ccm = conn.raw_sql(""" - select gvkey, lpermno as permno, linktype, linkprim, - linkdt, linkenddt - from crsp.ccmxpf_linktable - where linktype in ('LU', 'LC') - """) - -ccm['linkdt'] = pd.to_datetime(ccm['linkdt']) -ccm['linkenddt'] = pd.to_datetime(ccm['linkenddt']) -# if linkenddt is missing then set to today date -ccm['linkenddt'] = ccm['linkenddt'].fillna(pd.to_datetime('today')) - -ccm1 = pd.merge(comp, ccm, how='left', on=['gvkey']) - -# set link date bounds -ccm2 = ccm1[(ccm1['datadate']>=ccm1['linkdt']) & (ccm1['datadate']<=ccm1['linkenddt'])] -ccm2 = ccm2[['gvkey', 'permno', 'datadate', 'fyearq', 'fqtr', 'epspxq', 'ajexq']] - -# the time series of exspxq/ajexq -ccm2['eps'] = ccm2['epspxq']/ccm2['ajexq'] -ccm2.drop_duplicates(['permno', 'datadate'], inplace=True) - -# merge lag1 to lag9, then calculate stand deviation -ccm2 = ccm2[ccm2['eps'].notna()] -ccm2['count'] = ccm2.groupby('permno').cumcount() + 1 -ccm2.sort_values(by=['permno', 'datadate'], inplace=True) - -ccm2['e1'] = ccm2.groupby(['permno'])['eps'].shift(1) -ccm2['e2'] = ccm2.groupby(['permno'])['eps'].shift(2) -ccm2['e3'] = ccm2.groupby(['permno'])['eps'].shift(3) -ccm2['e4'] = ccm2.groupby(['permno'])['eps'].shift(4) -ccm2['e5'] = ccm2.groupby(['permno'])['eps'].shift(5) -ccm2['e6'] = ccm2.groupby(['permno'])['eps'].shift(6) -ccm2['e7'] = ccm2.groupby(['permno'])['eps'].shift(7) -ccm2['e8'] = 
ccm2.groupby(['permno'])['eps'].shift(8) - -condlist = [ccm2['count']<=6, - ccm2['count']==7, - ccm2['count']==8, - ccm2['count']>=9] -choicelist = [np.nan, - ccm2[['e8', 'e7', 'e6', 'e5', 'e4', 'e3']].std(axis=1), - ccm2[['e8', 'e7', 'e6', 'e5', 'e4', 'e3', 'e2']].std(axis=1), - ccm2[['e8', 'e7', 'e6', 'e5', 'e4', 'e3', 'e2', 'e1']].std(axis=1)] -ccm2['sue_std'] = np.select(condlist, choicelist, default=np.nan) - -ccm2['sue'] = (ccm2['eps'] - ccm2['e4'])/ccm2['sue_std'] - -# populate the quarterly sue to monthly -crsp_msf = conn.raw_sql(""" - select distinct date - from crsp.msf - where date >= '01/01/1959' - """) - -ccm2['datadate'] = pd.to_datetime(ccm2['datadate']) -ccm2['plus12m'] = ccm2['datadate'] + np.timedelta64(12, 'M') -ccm2['plus12m'] = ccm2['plus12m'] + MonthEnd(0) - -df = sqldf("""select a.*, b.date - from ccm2 a left join crsp_msf b - on a.datadate <= b.date - and a.plus12m >= b.date - order by a.permno, b.date, a.datadate desc;""", globals()) - -df = df.drop_duplicates(['permno', 'date']) -df['datadate'] = pd.to_datetime(df['datadate']) -df = df[['gvkey', 'permno', 'datadate', 'date', 'sue']] - -with open('sue.pkl', 'wb') as f: - pkl.dump(df, f) \ No newline at end of file diff --git a/pychars/iclink.py b/pychars/iclink.py deleted file mode 100755 index c630697..0000000 --- a/pychars/iclink.py +++ /dev/null @@ -1,241 +0,0 @@ -import pandas as pd -import numpy as np -import datetime as dt -import wrds -from dateutil.relativedelta import * -from pandas.tseries.offsets import * -from pandasql import * -from fuzzywuzzy import fuzz - -# reference: https://wrds-www.wharton.upenn.edu/pages/support/applications/python-replications/linking-ibes-and-crsp-data-python/ -##################################### -# ICLINK: Link CRSP and IBES # -# June 2019 # -# Qingyi (Freda) Song Drechsler # -##################################### - -# This program replicates the SAS macro ICLINK -# to create a linking table between CRSP and IBES -# Output is a score reflecting the 
quality of the link -# Score = 0 (best link) to Score = 6 (worst link) -# -# More explanation on score system: -# - 0: BEST match: using (cusip, cusip dates and company names) -# or (exchange ticker, company names and 6-digit cusip) -# - 1: Cusips and cusip dates match but company names do not match -# - 2: Cusips and company names match but cusip dates do not match -# - 3: Cusips match but cusip dates and company names do not match -# - 4: tickers and 6-digit cusips match but company names do not match -# - 5: tickers and company names match but 6-digit cusips do not match -# - 6: tickers match but company names and 6-digit cusips do not match - -################### -# Connect to WRDS # -################### -conn = wrds.Connection() - -######################### -# Step 1: Link by CUSIP # -######################### - -# 1.1 IBES: Get the list of IBES Tickers for US firms in IBES -_ibes1 = conn.raw_sql(""" - select ticker, cusip, cname, sdates from ibes.id - where usfirm=1 and cusip != '' - """) - -# Create first and last 'start dates' for a given cusip -# Use agg min and max to find the first and last date per group -# then rename to fdate and ldate respectively - -_ibes1_date = _ibes1.groupby(['ticker','cusip']).sdates.agg(['min', 'max'])\ -.reset_index().rename(columns={'min':'fdate', 'max':'ldate'}) - -# merge fdate ldate back to _ibes1 data -_ibes2 = pd.merge(_ibes1, _ibes1_date,how='left', on =['ticker','cusip']) -_ibes2 = _ibes2.sort_values(by=['ticker','cusip','sdates']) - -# keep only the most recent company name -# determined by having sdates = ldate -_ibes2 = _ibes2.loc[_ibes2.sdates == _ibes2.ldate].drop(['sdates'], axis=1) - -# 1.2 CRSP: Get all permno-ncusip combinations -_crsp1 = conn.raw_sql(""" - select permno, ncusip, comnam, namedt, nameenddt - from crsp.stocknames - where ncusip != '' - """) - -# first namedt -_crsp1_fnamedt = _crsp1.groupby(['permno','ncusip']).namedt.min().reset_index() - -# last nameenddt -_crsp1_lnameenddt = 
_crsp1.groupby(['permno','ncusip']).nameenddt.max().reset_index() - -# merge both -_crsp1_dtrange = pd.merge(_crsp1_fnamedt, _crsp1_lnameenddt, \ - on = ['permno','ncusip'], how='inner') - -# replace namedt and nameenddt with the version from the dtrange -_crsp1 = _crsp1.drop(['namedt'],axis=1).rename(columns={'nameenddt':'enddt'}) -_crsp2 = pd.merge(_crsp1, _crsp1_dtrange, on =['permno','ncusip'], how='inner') - -# keep only most recent company name -_crsp2 = _crsp2.loc[_crsp2.enddt ==_crsp2.nameenddt].drop(['enddt'], axis=1) - -# 1.3 Create CUSIP Link Table - -# Link by full cusip, company names and dates -_link1_1 = pd.merge(_ibes2, _crsp2, how='inner', left_on='cusip', right_on='ncusip')\ -.sort_values(['ticker','permno','ldate']) - -# Keep link with most recent company name -_link1_1_tmp = _link1_1.groupby(['ticker','permno']).ldate.max().reset_index() -_link1_2 = pd.merge(_link1_1, _link1_1_tmp, how='inner', on =['ticker', 'permno', 'ldate']) - - -# Calculate name matching ratio using FuzzyWuzzy - -# Note: fuzz ratio = 100 -> match perfectly -# fuzz ratio = 0 -> do not match at all - -# Comment: token_set_ratio is more flexible in matching the strings: -# fuzz.token_set_ratio('AMAZON.COM INC', 'AMAZON COM INC') -# returns value of 100 - -# fuzz.ratio('AMAZON.COM INC', 'AMAZON COM INC') -# returns value of 93 - -_link1_2['name_ratio'] = _link1_2.apply(lambda x: fuzz.token_set_ratio(x.comnam, x.cname), axis=1) - -# Note on parameters: -# The following parameters are chosen to mimic the SAS macro %iclink -# In %iclink, name_dist < 30 is assigned score = 0 -# where name_dist=30 is roughly 90% percentile in total distribution -# and higher name_dist means more different names. 
-# In name_ratio, I mimic this by choosing 10% percentile as cutoff to assign -# score = 0 - -# 10% percentile of the company name distance -name_ratio_p10 = _link1_2.name_ratio.quantile(0.10) - -# Function to assign score for companies matched by: -# full cusip and passing name_ratio -# or meeting date range requirement - -def score1(row): - if (row['fdate']<=row['nameenddt']) & (row['ldate']>=row['namedt']) & (row['name_ratio'] >= name_ratio_p10): - score = 0 - elif (row['fdate']<=row['nameenddt']) & (row['ldate']>=row['namedt']): - score = 1 - elif row['name_ratio'] >= name_ratio_p10: - score = 2 - else: - score = 3 - return score - -# assign size portfolio -_link1_2['score']=_link1_2.apply(score1, axis=1) -_link1_2 = _link1_2[['ticker','permno','cname','comnam','name_ratio','score']] -_link1_2 = _link1_2.drop_duplicates() - -########################## -# Step 2: Link by TICKER # -########################## - -# Find links for the remaining unmatched cases using Exchange Ticker - -# Identify remaining unmatched cases -_nomatch1 = pd.merge(_ibes2[['ticker']], _link1_2[['permno','ticker']], on='ticker', how='left') -_nomatch1 = _nomatch1.loc[_nomatch1.permno.isnull()].drop(['permno'], axis=1).drop_duplicates() - -# Add IBES identifying information - -ibesid = conn.raw_sql(""" select ticker, cname, oftic, sdates, cusip from ibes.id """) -ibesid = ibesid.loc[ibesid.oftic.notna()] - -_nomatch2 = pd.merge(_nomatch1, ibesid, how='inner', on=['ticker']) - -# Create first and last 'start dates' for Exchange Tickers -# Label date range variables and keep only most recent company name - -_nomatch3 = _nomatch2.groupby(['ticker', 'oftic']).sdates.agg(['min', 'max'])\ -.reset_index().rename(columns={'min':'fdate', 'max':'ldate'}) - -_nomatch3 = pd.merge(_nomatch2, _nomatch3, how='left', on=['ticker','oftic']) - -_nomatch3 = _nomatch3.loc[_nomatch3.sdates == _nomatch3.ldate] - -# Get entire list of CRSP stocks with Exchange Ticker information - -_crsp_n1 = conn.raw_sql(""" 
select ticker, comnam, permno, ncusip, namedt, nameenddt - from crsp.stocknames """) - -_crsp_n1 = _crsp_n1.loc[_crsp_n1.ticker.notna()].sort_values(by=['permno','ticker','namedt']) - -# Arrange effective dates for link by Exchange Ticker - -_crsp_n1_namedt = _crsp_n1.groupby(['permno','ticker']).namedt.min().reset_index().rename(columns={'min':'namedt'}) -_crsp_n1_nameenddt = _crsp_n1.groupby(['permno','ticker']).nameenddt.max().reset_index().rename(columns={'max':'nameenddt'}) - -_crsp_n1_dt = pd.merge(_crsp_n1_namedt, _crsp_n1_nameenddt, how = 'inner', on=['permno','ticker']) - -_crsp_n1 = _crsp_n1.rename(columns={'namedt': 'namedt_ind', 'nameenddt':'nameenddt_ind'}) - -_crsp_n2 = pd.merge(_crsp_n1, _crsp_n1_dt, how ='left', on = ['permno','ticker']) - -_crsp_n2 = _crsp_n2.rename(columns={'ticker':'crsp_ticker'}) -_crsp_n2 = _crsp_n2.loc[_crsp_n2.nameenddt_ind == _crsp_n2.nameenddt].drop(['namedt_ind', 'nameenddt_ind'], axis=1) - -# Merge remaining unmatched cases using Exchange Ticker -# Note: Use ticker date ranges as exchange tickers are reused overtime - -_link2_1 = pd.merge(_nomatch3, _crsp_n2, how='inner', left_on=['oftic'], right_on=['crsp_ticker']) -_link2_1 = _link2_1.loc[(_link2_1.ldate>=_link2_1.namedt) & (_link2_1.fdate<=_link2_1.nameenddt)] - - -# Score using company name using 6-digit CUSIP and company name spelling distance -_link2_1['name_ratio'] = _link2_1.apply(lambda x: fuzz.token_set_ratio(x.comnam, x.cname), axis=1) - -_link2_2 = _link2_1 -_link2_2['cusip6'] = _link2_2.apply(lambda x: x.cusip[:6], axis=1) -_link2_2['ncusip6'] = _link2_2.apply(lambda x: x.ncusip[:6], axis=1) - -# Score using company name using 6-digit CUSIP and company name spelling distance - -def score2(row): - if (row['cusip6']==row['ncusip6']) & (row['name_ratio'] >= name_ratio_p10): - score = 0 - elif (row['cusip6']==row['ncusip6']): - score = 4 - elif row['name_ratio'] >= name_ratio_p10: - score = 5 - else: - score = 6 - return score - -# assign size portfolio 
-_link2_2['score']=_link2_2.apply(score2, axis=1) - -# Some companies may have more than one TICKER-PERMNO link -# so re-sort and keep the case (PERMNO & Company name from CRSP) -# that gives the lowest score for each IBES TICKER - -_link2_2 = _link2_2[['ticker','permno','cname','comnam', 'name_ratio', 'score']].sort_values(by=['ticker','score']) -_link2_2_score = _link2_2.groupby(['ticker']).score.min().reset_index() - -_link2_3 = pd.merge(_link2_2, _link2_2_score, how='inner', on=['ticker', 'score']) -_link2_3 = _link2_3[['ticker','permno','cname','comnam','score']].drop_duplicates() - -##################################### -# Step 3: Finalize LInks and Scores # -##################################### -# Combine the output from both linking procedures. Store the output data for future usage - -iclink = _link1_2.append(_link2_3) - -# Storing iclink for other program usage -import pickle as pkl - -with open('iclink.pkl', 'wb') as f: - pkl.dump(iclink, f) \ No newline at end of file diff --git a/pychars/impute_rank_output.py b/pychars/impute_rank_output.py deleted file mode 100755 index 5940b64..0000000 --- a/pychars/impute_rank_output.py +++ /dev/null @@ -1,114 +0,0 @@ -import pandas as pd -import pickle as pkl -import numpy as np -import wrds -from functions import * - -#################### -# All Stocks # -#################### - -with open('chars_a.pkl', 'rb') as f: - chars_a = pkl.load(f) - -chars_a = chars_a.dropna(subset=['permno']) -chars_a[['permno', 'gvkey']] = chars_a[['permno', 'gvkey']].astype(int) -chars_a['jdate'] = pd.to_datetime(chars_a['jdate']) -chars_a = chars_a.drop_duplicates(['permno', 'jdate']) - -with open('chars_q_raw.pkl', 'rb') as f: - chars_q = pkl.load(f) - -# use annual variables to fill na of quarterly variables -chars_q = fillna_atq(df_q=chars_q, df_a=chars_a) - -# adm is annual variable -adm = chars_a[['permno', 'jdate', 'adm']] -chars_q = pd.merge(chars_q, adm, how='left', on=['permno', 'jdate']) - -# impute missing values, you can 
choose different func form functions, such as ffi49/ffi10 -chars_q_impute = chars_q.copy() -chars_q_impute['sic'] = chars_q_impute['sic'].astype(int) -chars_q_impute['jdate'] = pd.to_datetime(chars_q_impute['jdate']) - -chars_q_impute['ffi49'] = ffi49(chars_q_impute) -chars_q_impute['ffi49'] = chars_q_impute['ffi49'].fillna(49) # we treat na in ffi49 as 'other' -chars_q_impute['ffi49'] = chars_q_impute['ffi49'].astype(int) - -# there are two ways to impute: industrial median or mean -chars_q_impute = fillna_ind(chars_q_impute, method='median', ffi=49) -# we use all stocks' mean or median to fill na that are not filled by value of ffi -chars_q_impute = fillna_all(chars_q_impute, method='median') -chars_q_impute['re'] = chars_q_impute['re'].fillna(0) # re use IBES database, there are lots of missing data - -chars_q_impute['year'] = chars_q_impute['jdate'].dt.year -chars_q_impute = chars_q_impute[chars_q_impute['year'] >= 1972] -chars_q_impute = chars_q_impute.drop(['year'], axis=1) - -with open('chars_q_impute.pkl', 'wb') as f: - pkl.dump(chars_impute, f, protocol=4) - -# standardize characteristics -chars_q_rank = standardize(chars_q) -chars_q_rank['year'] = chars_q_rank['jdate'].dt.year -chars_q_rank = chars_q_rank[chars_q_rank['year'] >= 1972] -chars_q_rank = chars_q_rank.drop(['year'], axis=1) - -with open('chars_q_rank.pkl', 'wb') as f: - pkl.dump(chars_rank, f, protocol=4) - -#################### -# SP1500 # -#################### -conn = wrds.Connection() - -# prepare S&P 1500 version, gvkeyx for sp600: 030824,for sp400: 024248,for sp500: 000003 -sp1500_index = conn.raw_sql('select * from comp.idxcst_his') -sp1500_index = sp1500_index[(sp1500_index['gvkeyx'] == '000003') | (sp1500_index['gvkeyx'] == '024248') - | (sp1500_index['gvkeyx'] == '030824')] - -sp1500_index = sp1500_index[['gvkey', 'from', 'thru']] -sp1500_index['gvkey'] = sp1500_index['gvkey'].astype(int) -sp1500_index['from'] = pd.to_datetime(sp1500_index['from']) -sp1500_index['thru'] = 
pd.to_datetime(sp1500_index['thru']) -sp1500_index['thru'] = sp1500_index['thru'].fillna(pd.to_datetime('today')) - -chars_q = pd.merge(chars_q, sp1500_index, how='left', on=['gvkey']) -sp1500 = chars_q.dropna(subset=['from'], axis=0) -sp1500 = sp1500[(sp1500['jdate'] >= sp1500['from']) & (sp1500['jdate'] <= sp1500['thru'])] -sp1500 = sp1500.drop(['from', 'thru'], axis=1) -sp1500 = sp1500.drop_duplicates(['gvkey', 'jdate']) - -# for test -# test = sp1500.groupby(['jdate'])['gvkey'].nunique() - -# impute missing values, you can choose different func form functions, such as ffi49/ffi10 -sp1500_impute = sp1500.copy() -sp1500_impute['sic'] = sp1500_impute['sic'].astype(int) -sp1500_impute['jdate'] = pd.to_datetime(sp1500_impute['jdate']) - -sp1500_impute['ffi49'] = ffi49(sp1500_impute) -sp1500_impute['ffi49'] = sp1500_impute['ffi49'].fillna(49) # we treat na in ffi49 as 'other' -sp1500_impute['ffi49'] = sp1500_impute['ffi49'].astype(int) - -# there are two ways to impute: industrial median or mean -sp1500_impute = fillna_ind(sp1500_impute, method='median', ffi=49) -# we use all stocks' mean or median to fill na that are not filled by value of ffi -sp1500_impute = fillna_all(sp1500_impute, method='median') -sp1500_impute['re'] = sp1500_impute['re'].fillna(0) # re use IBES database, there are lots of missing data - -sp1500_impute['year'] = sp1500_impute['jdate'].dt.year -sp1500_impute = sp1500_impute[sp1500_impute['year'] >= 1972] -sp1500_impute = sp1500_impute.drop(['year'], axis=1) - -with open('sp1500_impute.pkl', 'wb') as f: - pkl.dump(sp1500_impute, f, protocol=4) - -# standardize characteristics -sp1500_rank = standardize(sp1500) -sp1500_rank['year'] = sp1500_rank['jdate'].dt.year -sp1500_rank = sp1500_rank[sp1500_rank['year'] >= 1972] -sp1500_rank = sp1500_rank.drop(['year'], axis=1) - -with open('sp1500_rank.pkl', 'wb') as f: - pkl.dump(sp1500_rank, f, protocol=4) \ No newline at end of file diff --git a/pychars/merge_chars.py b/pychars/merge_chars.py deleted 
file mode 100755 index e042a3d..0000000 --- a/pychars/merge_chars.py +++ /dev/null @@ -1,86 +0,0 @@ -import pandas as pd -import pickle as pkl -from pandas.tseries.offsets import * -import wrds - -with open('chars_q.pkl', 'rb') as f: - chars_q = pkl.load(f) - -chars_q = chars_q.dropna(subset=['permno']) -chars_q[['permno', 'gvkey']] = chars_q[['permno', 'gvkey']].astype(int) -chars_q['jdate'] = pd.to_datetime(chars_q['jdate']) -chars_q = chars_q.drop_duplicates(['permno', 'jdate']) - -with open('beta.pkl', 'rb') as f: - beta = pkl.load(f) - -beta['permno'] = beta['permno'].astype(int) -beta['jdate'] = pd.to_datetime(beta['date']) + MonthEnd(0) -beta = beta[['permno', 'jdate', 'beta']] -beta = beta.drop_duplicates(['permno', 'jdate']) - -chars_q = pd.merge(chars_q, beta, how='left', on=['permno', 'jdate']) - -with open('rvar_capm.pkl', 'rb') as f: - rvar_capm = pkl.load(f) - -rvar_capm['permno'] = rvar_capm['permno'].astype(int) -rvar_capm['jdate'] = pd.to_datetime(rvar_capm['date']) + MonthEnd(0) -rvar_capm = rvar_capm[['permno', 'jdate', 'rvar_capm']] -rvar_capm = rvar_capm.drop_duplicates(['permno', 'jdate']) - -chars_q = pd.merge(chars_q, rvar_capm, how='left', on=['permno', 'jdate']) - -with open('rvar_mean.pkl', 'rb') as f: - rvar_mean = pkl.load(f) - -rvar_mean['permno'] = rvar_mean['permno'].astype(int) -rvar_mean['jdate'] = pd.to_datetime(rvar_mean['date']) + MonthEnd(0) -rvar_mean = rvar_mean[['permno', 'jdate', 'rvar_mean']] -rvar_mean = rvar_mean.drop_duplicates(['permno', 'jdate']) - -chars_q = pd.merge(chars_q, rvar_mean, how='left', on=['permno', 'jdate']) - -with open('rvar_ff3.pkl', 'rb') as f: - rvar_ff3 = pkl.load(f) - -rvar_ff3['permno'] = rvar_ff3['permno'].astype(int) -rvar_ff3['jdate'] = pd.to_datetime(rvar_ff3['date']) + MonthEnd(0) -rvar_ff3 = rvar_ff3[['permno', 'jdate', 'rvar_ff3']] -rvar_ff3 = rvar_ff3.drop_duplicates(['permno', 'jdate']) - -chars_q = pd.merge(chars_q, rvar_ff3, how='left', on=['permno', 'jdate']) - -with open('sue.pkl', 
'rb') as f: - sue = pkl.load(f) - -sue['permno'] = sue['permno'].astype(int) -sue['jdate'] = pd.to_datetime(sue['date']) + MonthEnd(0) -sue = sue[['permno', 'jdate', 'sue']] -sue = sue.drop_duplicates(['permno', 'jdate']) - -chars_q = pd.merge(chars_q, sue, how='left', on=['permno', 'jdate']) - -with open('re.pkl', 'rb') as f: - re = pkl.load(f) - -re['permno'] = re['permno'].astype(int) -re['jdate'] = pd.to_datetime(re['date']) + MonthEnd(0) -re = re[['permno', 'jdate', 're']] -re = re.drop_duplicates(['permno', 'jdate']) - -chars_q = pd.merge(chars_q, re, how='left', on=['permno', 'jdate']) - -with open('abr.pkl', 'rb') as f: - abr = pkl.load(f) - -abr['permno'] = abr['permno'].astype(int) -abr['jdate'] = pd.to_datetime(abr['date']) + MonthEnd(0) -abr = abr[['permno', 'jdate', 'abr']] -abr = abr.drop_duplicates(['permno', 'jdate']) - -chars_q = pd.merge(chars_q, abr, how='left', on=['permno', 'jdate']) - -# save data -with open('chars_q_raw.pkl', 'wb') as f: - pkl.dump(chars_q, f, protocol=4) \ No newline at end of file diff --git a/pychars/rvar_capm.py b/pychars/rvar_capm.py deleted file mode 100755 index fa3a01c..0000000 --- a/pychars/rvar_capm.py +++ /dev/null @@ -1,168 +0,0 @@ -# CAPM residual variance -# Note: Please use the latest version of pandas, this version should support returning to pd.Series after rolling -# To get a faster speed, we split the big dataframe into small ones -# Then using different process to calculate the variance -# We use 20 process to calculate variance, you can change the number of process according to your CPU situation -# You can use the following code to check your CPU situation -# import multiprocessing -# multiprocessing.cpu_count() - -import pandas as pd -import numpy as np -import datetime as dt -import wrds -from dateutil.relativedelta import * -from pandas.tseries.offsets import * -import datetime -import pickle as pkl -import multiprocessing as mp - -################### -# Connect to WRDS # -################### -conn = 
wrds.Connection() - -# CRSP Block -crsp = conn.raw_sql(""" - select a.permno, a.date, a.ret, (a.ret - b.rf) as exret, b.mktrf - from crsp.dsf as a - left join ff.factors_daily as b - on a.date=b.date - where a.date >= '01/01/1959' - """) - -# sort variables by permno and date -crsp = crsp.sort_values(by=['permno', 'date']) - -# change variable format to int -crsp['permno'] = crsp['permno'].astype(int) - -# Line up date to be end of month -crsp['date'] = pd.to_datetime(crsp['date']) - -# find the closest trading day to the end of the month -crsp['monthend'] = crsp['date'] + MonthEnd(0) -crsp['date_diff'] = crsp['monthend'] - crsp['date'] -date_temp = crsp.groupby(['permno', 'monthend'])['date_diff'].min() -date_temp = pd.DataFrame(date_temp) # convert Series to DataFrame -date_temp.reset_index(inplace=True) -date_temp.rename(columns={'date_diff': 'min_diff'}, inplace=True) -crsp = pd.merge(crsp, date_temp, how='left', on=['permno', 'monthend']) -crsp['sig'] = np.where(crsp['date_diff'] == crsp['min_diff'], 1, np.nan) - -# label every date of month end -crsp['month_count'] = crsp[crsp['sig'] == 1].groupby(['permno']).cumcount() - -# label numbers of months for a firm -month_num = crsp[crsp['sig'] == 1].groupby(['permno'])['month_count'].tail(1) -month_num = month_num.astype(int) -month_num = month_num.reset_index(drop=True) - -# mark the number of each month to each day of this month -crsp['month_count'] = crsp.groupby(['permno'])['month_count'].fillna(method='bfill') - -# crate a firm list -df_firm = crsp.drop_duplicates(['permno']) -df_firm = df_firm[['permno']] -df_firm['permno'] = df_firm['permno'].astype(int) -df_firm = df_firm.reset_index(drop=True) -df_firm = df_firm.reset_index() -df_firm = df_firm.rename(columns={'index': 'count'}) -df_firm['month_num'] = month_num - -###################### -# Calculate residual # -###################### - - -def get_res_var(df, firm_list): - """ - - :param df: stock dataframe - :param firm_list: list of firms matching stock 
dataframe - :return: dataframe with variance of residual - """ - for firm, count, prog in zip(firm_list['permno'], firm_list['month_num'], range(firm_list['permno'].count()+1)): - prog = prog + 1 - print('processing permno %s' % firm, '/', 'finished', '%.2f%%' % ((prog/firm_list['permno'].count())*100)) - for i in range(count + 1): - # if you want to change the rolling window, please change here: i - 2 means 3 months is a window. - temp = df[(df['permno'] == firm) & (i - 2 <= df['month_count']) & (df['month_count'] <= i)] - # if observations in last 3 months are less 21, we drop the rvar of this month - if temp['permno'].count() < 21: - pass - else: - rolling_window = temp['permno'].count() - index = temp.tail(1).index - X = pd.DataFrame() - X[['mktrf']] = temp[['mktrf']] - X['intercept'] = 1 - X = X[['intercept', 'mktrf']] - X = np.mat(X) - Y = np.mat(temp[['exret']]) - res = (np.identity(rolling_window) - X.dot(X.T.dot(X).I).dot(X.T)).dot(Y) - res_var = res.var(ddof=1) - df.loc[index, 'rvar'] = res_var - return df - - -def sub_df(start, end, step): - """ - - :param start: the quantile to start cutting, usually it should be 0 - :param end: the quantile to end cutting, usually it should be 1 - :param step: quantile step - :return: a dictionary including all the 'firm_list' dataframe and 'stock data' dataframe - """ - # we use dict to store different sub dataframe - temp = {} - for i, h in zip(np.arange(start, end, step), range(int((end-start)/step))): - print('processing splitting dataframe:', round(i, 2), 'to', round(i + step, 2)) - if i == 0: # to get the left point - temp['firm' + str(h)] = df_firm[df_firm['count'] <= df_firm['count'].quantile(i + step)] - temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', - on='permno').dropna(subset=['count']) - else: - temp['firm' + str(h)] = df_firm[(df_firm['count'].quantile(i) < df_firm['count']) & ( - df_firm['count'] <= df_firm['count'].quantile(i + step))] - temp['crsp' + str(h)] = pd.merge(crsp, 
temp['firm' + str(h)], how='left', - on='permno').dropna(subset=['count']) - return temp - - -def main(start, end, step): - """ - - :param start: the quantile to start cutting, usually it should be 0 - :param end: the quantile to end cutting, usually it should be 1 - :param step: quantile step - :return: a dataframe with calculated variance of residual - """ - df = sub_df(start, end, step) - pool = mp.Pool() - p_dict = {} - for i in range(int((end-start)/step)): - p_dict['p' + str(i)] = pool.apply_async(get_res_var, (df['crsp%s' % i], df['firm%s' % i],)) - pool.close() - pool.join() - result = pd.DataFrame() - print('processing pd.concat') - for h in range(int((end-start)/step)): - result = pd.concat([result, p_dict['p%s' % h].get()]) - return result - - -# calculate variance of residual through rolling window -# Note: please split dataframe according to your CPU situation. For example, we split dataframe to (1-0)/0.05 = 20 sub -# dataframes here, so the function will use 20 cores to calculate variance of residual. 
-if __name__ == '__main__': - crsp = main(0, 1, 0.05) - -# process dataframe -crsp = crsp.dropna(subset=['rvar']) # drop NA due to rolling -crsp = crsp.rename(columns={'rvar': 'rvar_capm'}) -crsp = crsp.reset_index(drop=True) -crsp = crsp[['permno', 'date', 'rvar_capm']] - -with open('rvar_capm.pkl', 'wb') as f: - pkl.dump(crsp, f) \ No newline at end of file diff --git a/pychars/rvar_ff3.py b/pychars/rvar_ff3.py deleted file mode 100755 index 36561a0..0000000 --- a/pychars/rvar_ff3.py +++ /dev/null @@ -1,201 +0,0 @@ -# Fama & French 3 factors residual variance -# Note: Please use the latest version of pandas, this version should support returning to pd.Series after rolling -# To get a faster speed, we split the big dataframe into small ones -# Then using different process to calculate the variance -# We use 20 process to calculate variance, you can change the number of process according to your CPU situation -# You can use the following code to check your CPU situation -# import multiprocessing -# multiprocessing.cpu_count() - -import pandas as pd -import numpy as np -import datetime as dt -import wrds -from dateutil.relativedelta import * -from pandas.tseries.offsets import * -import datetime -import pickle as pkl -import multiprocessing as mp - -################### -# Connect to WRDS # -################### -conn = wrds.Connection() - -# CRSP Block -crsp = conn.raw_sql(""" - select a.permno, a.date, a.ret, (a.ret - b.rf) as exret, b.mktrf, b.smb, b.hml - from crsp.dsf as a - left join ff.factors_daily as b - on a.date=b.date - where a.date > '01/01/1959' - """) - -# sort variables by permno and date -crsp = crsp.sort_values(by=['permno', 'date']) - -# change variable format to int -crsp['permno'] = crsp['permno'].astype(int) - -# Line up date to be end of month -crsp['date'] = pd.to_datetime(crsp['date']) - -# find the closest trading day to the end of the month -crsp['monthend'] = crsp['date'] + MonthEnd(0) -crsp['date_diff'] = crsp['monthend'] - crsp['date'] 
-date_temp = crsp.groupby(['permno', 'monthend'])['date_diff'].min() -date_temp = pd.DataFrame(date_temp) # convert Series to DataFrame -date_temp.reset_index(inplace=True) -date_temp.rename(columns={'date_diff': 'min_diff'}, inplace=True) -crsp = pd.merge(crsp, date_temp, how='left', on=['permno', 'monthend']) -crsp['sig'] = np.where(crsp['date_diff'] == crsp['min_diff'], 1, np.nan) - -# label every date of month end -crsp['month_count'] = crsp[crsp['sig'] == 1].groupby(['permno']).cumcount() - -# label numbers of months for a firm -month_num = crsp[crsp['sig'] == 1].groupby(['permno'])['month_count'].tail(1) -month_num = month_num.astype(int) -month_num = month_num.reset_index(drop=True) - -# mark the number of each month to each day of this month -crsp['month_count'] = crsp.groupby(['permno'])['month_count'].fillna(method='bfill') - -# crate a firm list -df_firm = crsp.drop_duplicates(['permno']) -df_firm = df_firm[['permno']] -df_firm['permno'] = df_firm['permno'].astype(int) -df_firm = df_firm.reset_index(drop=True) -df_firm = df_firm.reset_index() -df_firm = df_firm.rename(columns={'index': 'count'}) -df_firm['month_num'] = month_num - -###################### -# Calculate the beta # -###################### -# function that get multiple beta -'''' -rolling_window = 60 # 60 trading days -crsp['beta_mktrf'] = np.nan -crsp['beta_smb'] = np.nan -crsp['beta_hml'] = np.nan - - -def get_beta(df): - """ - The original idea of calculate beta is using formula (X'MX)^(-1)X'MY, - where M = I - 1(1'1)^{-1}1, I is a identity matrix. 
- - """ - temp = crsp.loc[df.index] # extract the rolling sub dataframe from original dataframe - X = np.mat(temp[['mktrf', 'smb', 'hml']]) - Y = np.mat(temp[['exret']]) - ones = np.mat(np.ones(rolling_window)).T - M = np.identity(rolling_window) - ones.dot((ones.T.dot(ones)).I).dot(ones.T) - beta = (X.T.dot(M).dot(X)).I.dot((X.T.dot(M).dot(Y))) - crsp['beta_mktrf'].loc[df.index[-1:]] = beta[0] - crsp['beta_smb'].loc[df.index[-1:]] = beta[1] - crsp['beta_hml'].loc[df.index[-1:]] = beta[2] - return 0 # we do not need the rolling outcome since rolling cannot return different values in different columns - - -# calculate beta through rolling window -crsp_temp = crsp.groupby('permno').rolling(rolling_window).apply(get_beta, raw=False) -''' - -###################### -# Calculate residual # -###################### - - -def get_res_var(df, firm_list): - """ - - :param df: stock dataframe - :param firm_list: list of firms matching stock dataframe - :return: dataframe with variance of residual - """ - for firm, count, prog in zip(firm_list['permno'], firm_list['month_num'], range(firm_list['permno'].count()+1)): - prog = prog + 1 - print('processing permno %s' % firm, '/', 'finished', '%.2f%%' % ((prog/firm_list['permno'].count())*100)) - for i in range(count + 1): - # if you want to change the rolling window, please change here: i - 2 means 3 months is a window. 
- temp = df[(df['permno'] == firm) & (i - 2 <= df['month_count']) & (df['month_count'] <= i)] - # if observations in last 3 months are less 21, we drop the rvar of this month - if temp['permno'].count() < 21: - pass - else: - rolling_window = temp['permno'].count() - index = temp.tail(1).index - X = pd.DataFrame() - X[['mktrf', 'smb', 'hml']] = temp[['mktrf', 'smb', 'hml']] - X['intercept'] = 1 - X = X[['intercept', 'mktrf', 'smb', 'hml']] - X = np.mat(X) - Y = np.mat(temp[['exret']]) - res = (np.identity(rolling_window) - X.dot(X.T.dot(X).I).dot(X.T)).dot(Y) - res_var = res.var(ddof=1) - df.loc[index, 'rvar'] = res_var - return df - - -def sub_df(start, end, step): - """ - - :param start: the quantile to start cutting, usually it should be 0 - :param end: the quantile to end cutting, usually it should be 1 - :param step: quantile step - :return: a dictionary including all the 'firm_list' dataframe and 'stock data' dataframe - """ - # we use dict to store different sub dataframe - temp = {} - for i, h in zip(np.arange(start, end, step), range(int((end-start)/step))): - print('processing splitting dataframe:', round(i, 2), 'to', round(i + step, 2)) - if i == 0: # to get the left point - temp['firm' + str(h)] = df_firm[df_firm['count'] <= df_firm['count'].quantile(i + step)] - temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', - on='permno').dropna(subset=['count']) - else: - temp['firm' + str(h)] = df_firm[(df_firm['count'].quantile(i) < df_firm['count']) & ( - df_firm['count'] <= df_firm['count'].quantile(i + step))] - temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', - on='permno').dropna(subset=['count']) - return temp - - -def main(start, end, step): - """ - - :param start: the quantile to start cutting, usually it should be 0 - :param end: the quantile to end cutting, usually it should be 1 - :param step: quantile step - :return: a dataframe with calculated variance of residual - """ - df = sub_df(start, end, step) 
- pool = mp.Pool() - p_dict = {} - for i in range(int((end-start)/step)): - p_dict['p' + str(i)] = pool.apply_async(get_res_var, (df['crsp%s' % i], df['firm%s' % i],)) - pool.close() - pool.join() - result = pd.DataFrame() - print('processing pd.concat') - for h in range(int((end-start)/step)): - result = pd.concat([result, p_dict['p%s' % h].get()]) - return result - - -# calculate variance of residual through rolling window -# Note: please split dataframe according to your CPU situation. For example, we split dataframe to (1-0)/0.05 = 20 sub -# dataframes here, so the function will use 20 cores to calculate variance of residual. -if __name__ == '__main__': - crsp = main(0, 1, 0.05) - -# process dataframe -crsp = crsp.dropna(subset=['rvar']) # drop NA due to rolling -crsp = crsp.rename(columns={'rvar': 'rvar_ff3'}) -crsp = crsp.reset_index(drop=True) -crsp = crsp[['permno', 'date', 'rvar_ff3']] - -with open('rvar_ff3.pkl', 'wb') as f: - pkl.dump(crsp, f) \ No newline at end of file diff --git a/pychars/rvar_mean.py b/pychars/rvar_mean.py deleted file mode 100755 index 42297f4..0000000 --- a/pychars/rvar_mean.py +++ /dev/null @@ -1,150 +0,0 @@ -# RVAR mean - -import pandas as pd -import numpy as np -import datetime as dt -import wrds -from dateutil.relativedelta import * -from pandas.tseries.offsets import * -import datetime -import pickle as pkl -import multiprocessing as mp - -################### -# Connect to WRDS # -################### -conn = wrds.Connection() - -# CRSP Block -crsp = conn.raw_sql(""" - select permno, date, ret - from crsp.dsf - where date >= '01/01/1959' - """) - -# sort variables by permno and date -crsp = crsp.sort_values(by=['permno', 'date']) - -# change variable format to int -crsp['permno'] = crsp['permno'].astype(int) - -# Line up date to be end of month -crsp['date'] = pd.to_datetime(crsp['date']) - -# find the closest trading day to the end of the month -crsp['monthend'] = crsp['date'] + MonthEnd(0) -crsp['date_diff'] = 
crsp['monthend'] - crsp['date'] -date_temp = crsp.groupby(['permno', 'monthend'])['date_diff'].min() -date_temp = pd.DataFrame(date_temp) # convert Series to DataFrame -date_temp.reset_index(inplace=True) -date_temp.rename(columns={'date_diff': 'min_diff'}, inplace=True) -crsp = pd.merge(crsp, date_temp, how='left', on=['permno', 'monthend']) -crsp['sig'] = np.where(crsp['date_diff'] == crsp['min_diff'], 1, np.nan) - -# label every date of month end -crsp['month_count'] = crsp[crsp['sig'] == 1].groupby(['permno']).cumcount() - -# label numbers of months for a firm -month_num = crsp[crsp['sig'] == 1].groupby(['permno'])['month_count'].tail(1) -month_num = month_num.astype(int) -month_num = month_num.reset_index(drop=True) - -# mark the number of each month to each day of this month -crsp['month_count'] = crsp.groupby(['permno'])['month_count'].fillna(method='bfill') - -# crate a firm list -df_firm = crsp.drop_duplicates(['permno']) -df_firm = df_firm[['permno']] -df_firm['permno'] = df_firm['permno'].astype(int) -df_firm = df_firm.reset_index(drop=True) -df_firm = df_firm.reset_index() -df_firm = df_firm.rename(columns={'index': 'count'}) -df_firm['month_num'] = month_num - -###################### -# Calculate variance # -###################### - - -def get_ret_var(df, firm_list): - """ - - :param df: stock dataframe - :param firm_list: list of firms matching stock dataframe - :return: dataframe with variance of residual - """ - for firm, count, prog in zip(firm_list['permno'], firm_list['month_num'], range(firm_list['permno'].count()+1)): - prog = prog + 1 - print('processing permno %s' % firm, '/', 'finished', '%.2f%%' % ((prog/firm_list['permno'].count())*100)) - for i in range(count + 1): - # if you want to change the rolling window, please change here: i - 2 means 3 months is a window. 
- temp = df[(df['permno'] == firm) & (i - 2 <= df['month_count']) & (df['month_count'] <= i)] - # if observations in last 3 months are less 21, we drop the rvar of this month - if temp['permno'].count() < 21: - pass - else: - index = temp.tail(1).index - ret_var = temp['ret'].var() - df.loc[index, 'rvar'] = ret_var - return df - -def sub_df(start, end, step): - """ - - :param start: the quantile to start cutting, usually it should be 0 - :param end: the quantile to end cutting, usually it should be 1 - :param step: quantile step - :return: a dictionary including all the 'firm_list' dataframe and 'stock data' dataframe - """ - # we use dict to store different sub dataframe - temp = {} - for i, h in zip(np.arange(start, end, step), range(int((end-start)/step))): - print('processing splitting dataframe:', round(i, 2), 'to', round(i + step, 2)) - if i == 0: # to get the left point - temp['firm' + str(h)] = df_firm[df_firm['count'] <= df_firm['count'].quantile(i + step)] - temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', - on='permno').dropna(subset=['count']) - else: - temp['firm' + str(h)] = df_firm[(df_firm['count'].quantile(i) < df_firm['count']) & ( - df_firm['count'] <= df_firm['count'].quantile(i + step))] - temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', - on='permno').dropna(subset=['count']) - return temp - - -def main(start, end, step): - """ - - :param start: the quantile to start cutting, usually it should be 0 - :param end: the quantile to end cutting, usually it should be 1 - :param step: quantile step - :return: a dataframe with calculated variance of residual - """ - df = sub_df(start, end, step) - pool = mp.Pool() - p_dict = {} - for i in range(int((end-start)/step)): - p_dict['p' + str(i)] = pool.apply_async(get_ret_var, (df['crsp%s' % i], df['firm%s' % i],)) - pool.close() - pool.join() - result = pd.DataFrame() - print('processing pd.concat') - for h in range(int((end-start)/step)): - result = 
pd.concat([result, p_dict['p%s' % h].get()]) - return result - - -# calculate variance of residual through rolling window -# Note: please split dataframe according to your CPU situation. For example, we split dataframe to (1-0)/0.05 = 20 sub -# dataframes here, so the function will use 20 cores to calculate variance of residual. -if __name__ == '__main__': - crsp = main(0, 1, 0.05) - -# process dataframe -crsp = crsp.dropna(subset=['rvar']) # drop NA due to rolling -crsp = crsp.rename(columns={'rvar': 'rvar_mean'}) -crsp = crsp.reset_index(drop=True) -crsp = crsp[['permno', 'date', 'rvar_mean']] - -with open('rvar_mean.pkl', 'wb') as f: - pkl.dump(crsp, f) \ No newline at end of file diff --git a/qsub/.DS_Store b/qsub/.DS_Store deleted file mode 100755 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 PyProgram.out -sas check_crsp.sas - -## if you need to add cpu and memory -##$ -pe onenode 8 -##$ -l m_mem_free=6G diff --git a/setup-wrds.py b/setup-wrds.py deleted file mode 100755 index 70d48d6..0000000 --- a/setup-wrds.py +++ /dev/null @@ -1,11 +0,0 @@ -# set up the .pgpass file -# then you don't need to type in password -import wrds -db = wrds.Connection(wrds_username='xinhe97') -# Enter your WRDS username [joe]: -# Enter your password: -db.create_pgpass_file() -db.close() -# check again -db = wrds.Connection(wrds_username='xinhe97') -db.close() From 0e902c6152d8d21875d5dda4967f719c768a202e Mon Sep 17 00:00:00 2001 From: velonisa Date: Mon, 1 Mar 2021 22:24:06 +0800 Subject: [PATCH 14/15] add --- Chars60_description.csv | 507 ++++++++ README.md | 109 ++ char60/abr.py | 236 ++++ char60/accounting_100.py | 1643 +++++++++++++++++++++++++ char60/accounting_60.py | 1215 ++++++++++++++++++ char60/beta.py | 164 +++ 
char60/bid_ask_spread.py | 160 +++ char60/functions.py | 452 +++++++ char60/iclink.py | 241 ++++ char60/ill.py | 158 +++ char60/impute_rank_output_bchmk_60.py | 164 +++ char60/maxret_d.py | 158 +++ char60/merge_chars_60.py | 294 +++++ char60/pkl_to_csv.py | 29 + char60/re.py | 120 ++ char60/rvar_capm.py | 168 +++ char60/rvar_ff3.py | 201 +++ char60/rvar_mean.py | 150 +++ char60/std_dolvol.py | 158 +++ char60/std_turn.py | 158 +++ char60/sue.py | 106 ++ char60/zerotrade.py | 161 +++ py-dgtw/dgtw.py | 479 +++++++ py-ff3/ff3.py | 280 +++++ py-pead/pead.py | 538 ++++++++ pychars/accounting.py | 851 +++++++++++++ pychars/beta.py | 70 ++ pychars/functions.py | 445 +++++++ pychars/hxz_abr.py | 236 ++++ pychars/hxz_re.py | 120 ++ pychars/hxz_sue.py | 106 ++ pychars/iclink.py | 241 ++++ pychars/impute_rank_output.py | 114 ++ pychars/merge_chars.py | 86 ++ pychars/rvar_capm.py | 168 +++ pychars/rvar_ff3.py | 201 +++ pychars/rvar_mean.py | 150 +++ qsub/check_crsp.sas | 2 + qsub/submit.sh | 11 + setup-wrds.py | 11 + 40 files changed, 10861 insertions(+) create mode 100644 Chars60_description.csv create mode 100755 README.md create mode 100755 char60/abr.py create mode 100644 char60/accounting_100.py create mode 100755 char60/accounting_60.py create mode 100755 char60/beta.py create mode 100755 char60/bid_ask_spread.py create mode 100755 char60/functions.py create mode 100755 char60/iclink.py create mode 100755 char60/ill.py create mode 100755 char60/impute_rank_output_bchmk_60.py create mode 100755 char60/maxret_d.py create mode 100755 char60/merge_chars_60.py create mode 100755 char60/pkl_to_csv.py create mode 100755 char60/re.py create mode 100755 char60/rvar_capm.py create mode 100755 char60/rvar_ff3.py create mode 100755 char60/rvar_mean.py create mode 100755 char60/std_dolvol.py create mode 100755 char60/std_turn.py create mode 100755 char60/sue.py create mode 100755 char60/zerotrade.py create mode 100755 py-dgtw/dgtw.py create mode 100755 py-ff3/ff3.py create mode 100755 
py-pead/pead.py create mode 100755 pychars/accounting.py create mode 100755 pychars/beta.py create mode 100755 pychars/functions.py create mode 100755 pychars/hxz_abr.py create mode 100755 pychars/hxz_re.py create mode 100755 pychars/hxz_sue.py create mode 100755 pychars/iclink.py create mode 100755 pychars/impute_rank_output.py create mode 100755 pychars/merge_chars.py create mode 100755 pychars/rvar_capm.py create mode 100755 pychars/rvar_ff3.py create mode 100755 pychars/rvar_mean.py create mode 100755 qsub/check_crsp.sas create mode 100755 qsub/submit.sh create mode 100755 setup-wrds.py diff --git a/Chars60_description.csv b/Chars60_description.csv new file mode 100644 index 0000000..835b03f --- /dev/null +++ b/Chars60_description.csv @@ -0,0 +1,507 @@ +Num,Acronym,Description,Author,Pub Year,Category,Main Formula,Other Formula,CRSP,Compustat(annual),Compustat(quarterly),IBES,description +A.1.2,Abr1,"cumulative abnormal returns around earnings announcement dates, 1-month holding period","Chan, Jegadeesh, and Lakonishok",1996,Momentum,,,,,,,p63 +A.1.2,Abr12,"cumulative abnormal returns around earnings announcement dates, 6-month holding period","Chan, Jegadeesh, and Lakonishok",1996,Momentum,,,,,,, +A.1.2,Abr6,"cumulative abnormal returns around earnings announcement dates, 12-month holding period","Chan, Jegadeesh, and Lakonishok",1996,Momentum,,,,,,, +A.2.1,Bm,Book-to-market equity,"Rosenberg, Reid, and Lanstein ",1985,Value-versus-growth,data_rawq['bm'] = data_rawq['beq']/data_rawq['me'],"data_rawq['beq'] = np.where(data_rawq['seqq']>0, data_rawq['seqq']+data_rawq['txditcq']-data_rawq['pstkq'], np.nan) +data_rawq['beq'] = np.where(data_rawq['beq']<=0, np.nan, data_rawq['beq']) +* 'me' from rawq",1,0,1,0,"At the end of June of each year t, we split stocks into deciles based on Bm, which is the book equity for the fiscal year ending in calendar year t - 1 divided by the market equity (from CRSP) at the end of December of t - 1. 
For firms with more than one share class, we merge the market equity for all share classes before computing Bm. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1. Following Davis, Fama, and French (2000), we measure book equity as stockholders' book equity, plus balance sheet deferred taxes and investment tax credit (Compustat annual item TXDITC) if available, minus the book value of preferred stock. Stockholders' equity is the value reported by Compustat (item SEQ), if it is available. If not, we measure stockholders' equity as the book value of common equity (item CEQ) plus the par value of preferred stock (item PSTK), or the book value of assets (item AT) minus total liabilities (item LT). Depending on availability, we use redemption (item PSTKRV), liquidating (item PSTKL), or par value (item PSTK) for the book value of preferred stock." +A.2.10,"Ep q 1, Ep q 6, and Ep q 12","Q Quarterly Earnings-to-price(1-month holding period), uarterly Earnings-to-price(6-month holding period), Quarterly Earnings-to-price(12-month holding period), ",Basu,1983,Value-versus-growth,,,,,,,"At the beginning of each month t, we split stocks into deciles based on quarterly earnings-to-price, Epq, which is income before extraordinary items (Compustat quarterly item IBQ) divided by the market equity (from CRSP) at the end of month t - 1. Before 1972, we use quarterly earnings from fiscal quarters ending at least four months prior to the portfolio formation. Starting from 1972, we use quarterly earnings from the most recent quarterly earnings announcement dates (item RDQ). For a firm to enter the portfolio formation, we require the end of the fiscal quarter that corresponds to its most recent quarterly earnings to be within six months prior to the portfolio formation. This restriction is imposed to exclude stale earnings information. 
To avoid potentially erroneous records, we also require the earnings announcement date to be after the corresponding fiscal quarter end. Firms with non-positive earnings are excluded. For firms with more than one share class, we merge the market equity for all share classes before computing Epq. We calculate decile returns for the current month t (Epq1), from month t to t + 5 (Epq6), and from month t to t + 11 (Epq12), and the deciles are rebalanced at the beginning of month t + 1. The holding period longer than one month as in, for instance, Epq6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Epq6 decile." +A.2.12,Cp,Cash flow-to-price,"Lakonishok, Shleifer, and Vishny ",1994,Value-versus-growth,data_rawa['cp'] = data_rawa['cf'] / data_rawa['me'],"data_rawa['cf'] = data_rawa['ib'] + data_rawa['dp'] +* 'me' from rawa +",1,1,0,0,"At the end of June of each year t, we split stocks into deciles based on cash flow-to-price, Cf, which is cash flows for the fiscal year ending in calendar year t - 1 divided by the market equity (from CRSP) at the end of December of t - 1. Cash flows are income before extraordinary items (Com- pustat annual item IB) plus depreciation (item DP)). For firms with more than one share class, we merge the market equity for all share classes before computing Cp. Firms with non-positive cash flows are excluded. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." 
+A.2.13,"Cpq1, Cpq6, Cpq12","Quarterly Cash Flow-to-price (1-month holding period), Quarterly Cash Flow-to-price (6-month holding period), Quarterly Cash Flow-to-price (12-month holding period)","Lakonishok, Shleifer, and Vishny",1994,Value-versus-growth,,,,,,,"At the beginning of each month t, we split stocks into deciles based on quarterly cash flow-to-price, +Cpq, which is cash flows for the latest fiscal quarter ending at least four months ago divided by the market equity (from CRSP) at the end of month t - 1. Quarterly cash flows are income before extraordinary items (Compustat quarterly item IBQ) plus depreciation (item DPQ). For firms with more than one share class, we merge the market equity for all share classes before computing Cpq. Firms with non-positive cash flows are excluded. We calculate decile returns for the current month t (Cpq1), from month t to t + 5 (Cpq6), and from month t to t + 11 (Cpq12), and the deciles are rebalanced at the beginning of month t + 1. The holding period longer than one month as in, for instance, Cpq6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Cpq6 decile." 
+A.2.14,Dp(dy),Dividend yield,Litzenberger and Ramaswamy,1979,Value-versus-growth,"crsp_mom['dy'] = ttm12(series='mdivpay', df=crsp_mom)/crsp_mom['me']","crsp_mom['permno'] = crsp_mom['permno'].astype(int) +crsp_mom['ret'] = crsp_mom['ret'].fillna(0) +crsp_mom['me'] = crsp_mom['prc'].abs() * crsp_mom['shrout'] # calculate market equity +crsp_mom['retx'] = np.where(crsp_mom['me'].isnull(), 0, crsp_mom['retx']) +# dy +crsp_mom['me_l1'] = crsp_mom.groupby(['permno'])['me'].shift(1) +crsp_mom['retdy'] = crsp_mom['ret'] - crsp_mom['retx'] +crsp_mom['mdivpay'] = crsp_mom['retdy']*crsp_mom['me_l1'] +",1,0,0,0,"At the end of June of each year t, we sort stocks into deciles based on dividend yield, Dp, which is the total dividends paid out from July of year t - 1 to June of t divided by the market equity (from CRSP) at the end of June of t. We calculate monthly dividends as the begin-of-month market equity times the difference between returns with and without dividends. Monthly dividends are then accumulated from July of t - 1 to June of t. We exclude firms that do not pay dividends. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." 
+A.2.16,Op and Nop, (Net) Payout Yield,"Richardson, and Roberts",2007,Value-versus-growth,"data_rawq['op'] = (ttm4('revtq', data_rawq)-ttm4('cogsq', data_rawq)-ttm4('xsgaq0', data_rawq)-ttm4('xintq0', data_rawq))/data_rawq['beq_l4'] +data_rawa['nop'] = np.where(data_rawa['nop']<=0, np.nan, data_rawa['nop'] )","# op +data_rawq['xintq0'] = np.where(data_rawq['xintq'].isnull(), 0, data_rawq['xintq']) +data_rawq['xsgaq0'] = np.where(data_rawq['xsgaq'].isnull(), 0, data_rawq['xsgaq']) +data_rawq['beq'] = np.where(data_rawq['seqq']>0, data_rawq['seqq']+data_rawq['txditcq']-data_rawq['pstkq'], np.nan) +data_rawq['beq'] = np.where(data_rawq['beq']<=0, np.nan, data_rawq['beq']) +data_rawq['beq_l4'] = data_rawq.groupby(['permno'])['beq'].shift(4) + +#nop +data_rawa['net_p'] = data_rawa['dvc'] + data_rawa['prstkc'] + 2*data_rawa['pstkrv'] - data_rawa['sstk'] +* 'me' from rawa +data_rawa['nop'] = data_rawa['net_p'] / data_rawa['me'] +",1,1,1,0,"Per Boudoukh, Michaely, Richardson, and Roberts (2007), total payouts are dividends on common stock (Compustat annual item DVC) plus repurchases. Repurchases are the total expenditure on the purchase of common and preferred stocks (item PRSTKC) plus any reduction (negative change over the prior year) in the value of the net number of preferred stocks outstanding (item PSTKRV). Net payouts equal total payouts minus equity issuances, which are the sale of common and preferred stock (item SSTK) minus any increase (positive change over the prior year) in the value of the net number of preferred stocks outstanding (item PSTKRV). At the end of June of each year t, we sort stocks into deciles based on total payouts (net payouts) for the fiscal year ending in calendar year t - 1 divided by the market equity (from CRSP) at the end of December of t - 1 (Op and Nop, respectively). For firms with more than one share class, we merge the market equity for all share classes before computing Op and Nop. 
Firms with non-positive total payouts (zero net payouts) are excluded. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1. Because the data on total expenditure and the sale of common and preferred stocks start in 1971, the Op and Nop portfolios start in July 1972." +A.2.2,Bmj,Book-to-June-end market equity, Asness and Frazzini ,2013,Value-versus-growth,data_rawa['bmj'] = data_rawa['be_per'] / data_rawa['prc'] ,"# clean up csho +comp['csho'] = np.where(comp['csho'] == 0, np.nan, comp['csho']) +data_rawa['txditc'] = data_rawa['txditc'].fillna(0) +* 'ps' +data_rawa['be'] = data_rawa['seq'] + data_rawa['txditc'] - data_rawa['ps'] +data_rawa['be'] = np.where(data_rawa['be'] > 0, data_rawa['be'], np.nan) +data_rawa['be_per'] = data_rawa['be'] / data_rawa['csho'] +",0,1,0,0,"Following Asness and Frazzini (2013), at the end of June of each year t, we sort stocks into deciles based on Bmj, which is book equity per share for the fiscal year ending in calendar year t - 1 divided by share price (from CRSP) at the end of June of t. We adjust for any stock splits between the fiscal year end and the end of June. Book equity per share is book equity divided by the num- ber of shares outstanding (Compustat annual item CSHO). Following Davis, Fama, and French (2000), we measure book equity as stockholders' book equity, plus balance sheet deferred taxes and investment tax credit (item TXDITC) if available, minus the book value of preferred stock. Stockholders' equity is the value reported by Compustat (item SEQ), if it is available. If not, we measure stockholders' equity as the book value of common equity (item CEQ) plus the par value of preferred stock (item PSTK), or the book value of assets (item AT) minus total liabilities (item LT). Depending on availability, we use redemption (item PSTKRV), liquidating (item PSTKL), or par value (item PSTK) for the book value of preferred stock. 
Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.2.20,Em,Enterprise multiple,Loughran and Wellman,2011,Value-versus-growth,data_rawa['em'] = data_rawa['enteprs_v'] / data_rawa['oibdp'],"* 'me' from rawa +data_rawa['enteprs_v'] = data_rawa['me'] + data_rawa['dlc'] + data_rawa['dltt'] + data_rawa['pstkrv'] - data_rawa['che']",1,1,0,0,"Enterprise multiple, Em, is enterprise value divided by operating income before depreciation (Com- pustat annual item OIBDP). Enterprise value is the market equity plus the total debt (item DLC plus item DLTT) plus the book value of preferred stocks (item PSTKRV) minus cash and short- term investments (item CHE). At the end of June of each year t, we split stocks into deciles based on Em for the fiscal year ending in calendar year t-1. The Market equity (from CRSP) is measured at the end of December of t - 1. For firms with more than one share class, we merge the market equity for all share classes before computing Em. Firms with negative enterprise value or operating income before depreciation are excluded. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.2.21,"Emq1, Emq6","Quarterly Enterprise multiple (1-month holding period), Quarterly Enterprise multiple (6-month holding period)",Loughran and Wellman,2011,Value-versus-growth,,,,,,,"Emq, is enterprise value scaled by operating income before depreciation (Compustat quarterly item OIBDPQ). Enterprise value is the market equity plus total debt (item DLCQ plus item DLTTQ) plus the book value of preferred stocks (item PSTKQ) minus cash and short-term investments (item CHEQ). At the beginning of each month t, we split stocks into deciles on Emq for the latest fiscal quarter ending at least four months ago. The Market equity (from CRSP) is measured at the end of month t - 1. 
For firms with more than one share class, we merge the market equity for all share classes before computing Emq. Firms with negative enterprise value or operating income before depreciation are excluded. Monthly decile returns are calculated for the current month t (Emq1), from month t to t + 5 (Emq6), and from month t to t + 11 (Emq12), and the deciles are rebalanced at the beginning of t + 1. The holding period longer than one month as in Emq6 means that for a given decile in each month there exist six subdeciles, each initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Emq6 decile. For sufficient data coverage, the EMq portfolios start in January 1975." +A.2.22,Sp,Sales-to-price,"Barbee, Mukherji, and Raines",1996,Value-versus-growth,data_rawq['sp'] = data_rawq['saleq4']/data_rawq['me'],"data_rawq['saleq4'] = ttm4('saleq', data_rawq) +data_rawq['saleq4'] = np.where(data_rawq['saleq4'].isnull(), data_rawq['saley'], data_rawq['saleq4']) +* 'me' from rawq",1,0,1,0,"At the end of June of each year t, we sort stocks into deciles based on sales-to-price, Sp, which is sales (Compustat annual item SALE) for the fiscal year ending in calendar year t - 1 divided by the market equity (from CRSP) at the end of December of t - 1. For firms with more than one share class, we merge the market equity for all share classes before computing Sp. Firms with non-positive sales are excluded. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.2.23,"Sp q 1, Sp q 6, and Sp q 12",Quarterly Sales-to-price,"Barbee, Mukherji, and Raines",1996,Value-versus-growth,,,,,,,"At the beginning of each month t, we sort stocks into deciles based on quarterly sales-to-price, Spq, which is sales (Compustat quarterly item SALEQ) divided by the market equity at the end of month t - 1. 
Before 1972, we use quarterly sales from fiscal quarters ending at least four months prior to the portfolio formation. Starting from 1972, we use quarterly sales from the most recent quarterly earnings announcement dates (item RDQ). Sales are generally announced with earnings during quarterly earnings announcements (Jegadeesh and Livnat 2006). For a firm to enter the portfolio formation, we require the end of the fiscal quarter that corresponds to its most recent quarterly sales to be within six months prior to the portfolio formation. This restriction is imposed to exclude stale earnings information. To avoid potentially erroneous records, we also require the earnings announcement date to be after the corresponding fiscal quarter end. Firms with non- positive sales are excluded. For firms with more than one share class, we merge the market equity for all share classes before computing Spq. Monthly decile returns are calculated for the current month t (Spq1), from month t to t + 5 (Spq6), and from month t to t + 11 (Spq12), and the deciles are rebalanced at the beginning of t + 1. The holding period longer than one month as in Spq6 means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Spq6 decile." 
+A.2.24,Ocp,Operating Cash Flow-to-price,"Desai, Rajgopal, and Venkatachalam",2004,Value-versus-growth,"data_rawa['ocp'] = data_rawa['ocy'] / data_rawa['me'] +data_rawa['ocp'] = np.where(data_rawa['ocp']<=0, np.nan, data_rawa['ocp'] )","* 'me' from rawa +data_rawa['ocy'] = np.where(data_rawa['jdate'] < '1988-06-30', data_rawa['fopt'] - data_rawa['wcap'], data_rawa['fopt'] - data_rawa['oancf'])",1,1,0,0,"At the end of June of each year t, we sort stocks into deciles based on operating cash flows-to-price, Ocp, which is operating cash flows for the fiscal year ending in calendar year t - 1 divided by the market equity (from CRSP) at the end of December of t - 1. Operating cash flows are measured as funds from operation (Compustat annual item FOPT) minus change in working capital (item WCAP) prior to 1988, and then as net cash flows from operating activities (item OANCF) stating from 1988. For firms with more than one share class, we merge the market equity for all share classes before computing Ocp. Firms with non-positive operating cash flows are excluded. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t+1. Because the data on funds from operation start in 1971, the Ocp portfolios start in July 1972. +" +A.2.26,Ir,Intangible Return,Daniel and Titman,2006,Value-versus-growth," +#Regression and get ir +#First get unique datelist +datelist = data_rawa['jdate'].unique() +for date in datelist: + temp = data_rawa[data_rawa['jdate'] == date] + n_row = temp.shape[0] + index = temp.index + X = pd.DataFrame() + X['bm5'] = temp['bm5'] + X['rB'] = temp['rB'] + X['intercept'] = 1 + X = X[['intercept','rB','bm5']] + X = np.mat(X) + Y = np.mat(temp[['ret5']]) + #These are residuals on one date + res = (np.identity(n_row) - X.dot(X.T.dot(X).I).dot(X.T)).dot(Y) + #put residuals back into data_rawa + data_rawa.loc[index,'ir'] = res","* 'bm' from rawa +#ir +''' +#First calculate r(t-5,t). 
Then rb(t-5,t) and use Bm to perform linear regression and get residue +''' +#r(t-5,t):sum ret from t-5 to t (which is calendar year t-6 to t-1) +lag = pd.DataFrame() +for i in range(1,6): + lag['ret%s' % i] = data_rawa.groupby(['permno'])['ret'].shift(i) + +data_rawa['ret5'] = lag['ret1']+lag['ret2']+lag['ret3']+lag['ret4']+lag['ret5'] + +#bm_t-5 (bm of year t-5) +data_rawa['bm5'] = data_rawa.groupby(['permno'])['bm'].shift(5) + +#rB (five year log book return) +#Reference: jf_06 page8 by KENT DANIEL +data_rawa['rB'] = data_rawa['bm'] - data_rawa['bm5'] + data_rawa['ret5'] +",1,1,0,0,p77 +A.2.28,Ebp,Enterprise Book-to-price,"Penman, Richardson, and Tuna",2007,Value-versus-growth,data_rawa['ebp'] = (data_rawa['n_debt']+data_rawa['ber']) / (data_rawa['n_debt']+data_rawa['me']),"* 'me' from rawa +#Ebp +data_rawa['dvpa'] = np.where(data_rawa['dvpa'].isnull(), 0, data_rawa['dvpa']) +data_rawa['tstkp'] = np.where(data_rawa['tstkp'].isnull(), 0, data_rawa['tstkp']) +data_rawa['f_liab'] = data_rawa['dltt'] + data_rawa['dlc'] + data_rawa['pstk'] + data_rawa['dvpa'] - data_rawa['tstkp'] +data_rawa['f_asse'] = data_rawa['che'] +# net debt : = financial liabilities - financial assets. +data_rawa['n_debt'] = data_rawa['f_liab'] - data_rawa['f_asse'] +data_rawa['ber'] = data_rawa['ceq'] + data_rawa['tstkp'] - data_rawa['dvpa']",1,1,0,0,"Following Penman, Richardson, and Tuna (2007), we measure enterprise book-to-price, Ebp, as the ratio of the book value of net operating assets (net debt plus book equity) to the market value of net operating assets (net debt plus market equity). Net Debt-to-price, Ndp, is the ratio of net debt to the market equity. Net debt is financial liabilities minus financial assets. 
We measure financial liabilities as the sum of long-term debt (Compustat annual item DLTT), debt in current liabilities (item DLC), carrying value of preferred stock (item PSTK), and preferred dividends in arrears (item DVPA, zero if missing), less preferred treasury stock (item TSTKP, zero if missing). We measure financial assets as cash and short-term investments (item CHE). Book equity is common equity (item CEQ) plus any preferred treasury stock (item TSTKP, zero if missing) less any pre- ferred dividends in arrears (item DVPA, zero if missing). Market equity is the number of common shares outstanding times share price (from CRSP). +At the end of June of each year t, we sort stocks into deciles based on Ebp, and separately, on Ndp, for the fiscal year ending in calendar year t - 1. Market equity is measured at the end of December of t - 1. For firms with more than one share class, we merge the market equity for all share classes before computing Ebp and Ndp. When forming the Ebp portfolios, we exclude firms with non-positive book or market value of net operating assets. For the Ndp portfolios, we exclude firms with non-positive net debt. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.2.3,Bmq12,Quarterly Book-to-market Equity (12-month holding period),"Rosenberg, Reid, and Lanstein ",1985,Value-versus-growth,,,,,,,p70 +A.2.9,Ep,Earnings-to-price,Basu,1983,Value-versus-growth,data_rawq['ep'] = data_rawq['ibq4']/data_rawq['me'],"data_rawq['ibq4'] = ttm4('ibq', data_rawq) +* 'me' from rawq +",1,0,1,0,"At the end of June of each year t, we split stocks into deciles based on earnings-to-price, Ep, which is income before extraordinary items (Compustat annual item IB) for the fiscal year ending in calendar year t - 1 divided by the market equity (from CRSP) at the end of December of t - 1. 
For firms with more than one share class, we merge the market equity for all share classes before com- puting Ep. Firms with non-positive earnings are excluded. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.3.1,Aci,Abnormal Corporate Investment,"Titman, Wei, and Xie",2004,Investment,data_rawa['aci'] = data_rawa['ce']/ (data_rawa['ce1']+data_rawa['ce2']+data_rawa['ce3'])-1,"data_rawa['ce'] = data_rawa['capx'] / data_rawa['sale'] +data_rawa['ce1'] = data_rawa['ce'].shift(1) +data_rawa['ce2'] = data_rawa['ce'].shift(2) +data_rawa['ce3'] = data_rawa['ce'].shift(3)",0,1,0,0,"At the end of June of year t, we measure abnormal corporate investment, Aci, as Cet-1/[(Cet-2 + Cet-3 + Cet-4)/3] - 1, in which Cet-j is capital expenditure (Compustat annual item CAPX) scaled by sales (item SALE) for the fiscal year ending in calendar year t - j. The last three-year average capital expenditure is designed to project the benchmark investment in the portfolio formation year. We exclude firms with sales less than ten million dollars. At the end of June of each year t, we sort stocks into deciles based on Aci. Monthly decile returns are computed from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.3.10,Nsi,Net Stock Issues,Pontiff and Woodgate,2008,Investment,data_rawa['nsi'] = np.log(data_rawa['sps']/data_rawa['sps_l1']),"data_rawa['sps'] = data_rawa['csho'] * data_rawa['ajex'] +data_rawa['sps_l1'] = data_rawa.groupby('permno')['sps'].shift(1)",0,1,0,0,"At the end of June of year t, we measure net stock issues, Nsi, as the natural log of the ratio of the split-adjusted shares outstanding at the fiscal year ending in calendar year t-1 to the split-adjusted shares outstanding at the fiscal year ending in t-2. The split-adjusted shares outstanding is shares outstanding (Compustat annual item CSHO) times the adjustment factor (item AJEX). 
At the end of June of each year t, we sort stocks with negative Nsi into two portfolios (1 and 2), stocks with zero Nsi into one portfolio (3), and stocks with positive Nsi into seven portfolios (4 to 10). Monthly decile returns are from July of year t to June of t+1, and the deciles are rebalanced in June of t+1." +A.3.11,dIi,% Change in Investment - % Change in Industry Investment,Abarbanell and Bushee,1998,Investment,data_rawa['dIi'] = data_rawa['dinvt'] - data_rawa['dind'],"data_rawa['e_invt'] = (data_rawa['capxv'] + data_rawa['capxv'].shift(1))/2 +data_rawa['dinvt'] = (data_rawa['capxv'] - data_rawa['e_invt']) / data_rawa['e_invt'] + +data_rawa['ind'] = data_rawa['capxv'] +s = data_rawa.groupby(['jdate', 'sic2'])['ind'].sum() +data_rawa = pd.merge(data_rawa, s, on=['jdate', 'sic2']) +# new industry investment will be named as ind_y, cause it's been grouped by ind +data_rawa['e_ind'] = (data_rawa['ind_y'] + data_rawa['ind_y'].shift(1))/2 +data_rawa['dind'] = (data_rawa['ind_y']-data_rawa['e_ind']) / data_rawa['e_ind']",0,1,0,0,"Following Abarbanell and Bushee (1998), we define the %d(.) operator as the percentage change in the variable in the parentheses from its average over the prior two years, e.g., %d(Investment) = [Investment(t) - E[Investment(t)]]/E[Investment(t)], in which E[Investment(t)] = [Investment(t-1) + Investment(t - 2)]/2. dIi is defined as %d(Investment) - %d(Industry investment), in which investment is capital expenditure in property, plant, and equipment (Compustat annual item CAPXV). Industry investment is the aggregate investment across all firms with the same two- digit SIC code. Firms with non-positive E[Investment(t)] are excluded and we require at least two firms in each industry. At the end of June of each year t, we sort stocks into deciles based on dIi for the fiscal year ending in calendar year t - 1. Monthly decile returns are calculated from July of year t to June of t+1, and the deciles are rebalanced in June of t+1." 
+A.3.14,Ivg,Inventory Growth,Belo and Lin,2011,Investment,,,,,,,"At the end of June of each year t, we sort stocks into deciles based on inventory growth, Ivg, which is the annual growth rate in inventory (Compustat annual item INVT) from the fiscal year ending in calendar year t - 2 to the fiscal year ending in t - 1. Monthly decile returns are calculated from July of year t to June of t+1, and the deciles are rebalanced in June of t+1." +A.3.15,Ivc,Inventory Changes,Thomas and Zhang,2002,Investment,data_rawa['ivc'] = data_rawa['invt'] / data_rawa['atAvg'],"data_rawa['at_l1'] = data_rawa.groupby(['permno'])['at'].shift(1) +data_rawa['atAvg'] = (data_rawa['at']+data_rawa['at_l1'])/2",1,1,0,0,"At the end of June of each year t, we sort stocks into deciles based on inventory changes, Ivc, which is the annual change in inventory (Compustat annual item INVT) scaled by the average of total assets (item AT) for the fiscal years ending in t - 2 and t - 1. We exclude firms that carry no inventory for the past two fiscal years. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." 
+A.3.16,Oa(acc),Operating Accruals,Sloan,1996,Investment,"data_rawq['acc'] = np.select(condlist, choicelist, + default=((data_rawq['actq']-data_rawq['lctq']+data_rawq['npq'])- + (data_rawq['actq_l4']-data_rawq['lctq_l4']+data_rawq['npq_l4']))/(10*data_rawq['beq'])) +","#prepare be +data_rawq['beq'] = np.where(data_rawq['seqq']>0, data_rawq['seqq']+data_rawq['txditcq']-data_rawq['pstkq'], np.nan) +data_rawq['beq'] = np.where(data_rawq['beq']<=0, np.nan, data_rawq['beq']) +# acc +data_rawq['actq_l4'] = data_rawq.groupby(['permno'])['actq'].shift(4) +data_rawq['lctq_l4'] = data_rawq.groupby(['permno'])['lctq'].shift(4) +data_rawq['npq_l4'] = data_rawq.groupby(['permno'])['npq'].shift(4) +condlist = [data_rawq['npq'].isnull(), + data_rawq['actq'].isnull() | data_rawq['lctq'].isnull()] +choicelist = [((data_rawq['actq']-data_rawq['lctq'])-(data_rawq['actq_l4']-data_rawq['lctq_l4']))/(10*data_rawq['beq']), + np.nan]",0,0,1,0,"Prior to 1988, we use the balance sheet approach in Sloan (1996) to measure operating accruals, Oa, as changes in noncash working capital minus depreciation, in which the noncash working capital is changes in noncash current assets minus changes in current liabilities less short-term debt and taxes payable. In particular, Oa equals (dCA-dCASH)-(dCL-dSTD-dTP)-DP, in which dCA is the change in current assets (Compustat annual item ACT), dCASH is the change in cash or cash equiv- alents (item CHE), dCL is the change in current liabilities (item LCT), dSTD is the change in debt included in current liabilities (item DLC), dTP is the change in income taxes payable (item TXP), and DP is depreciation and amortization (item DP). Missing changes in income taxes payable are set to zero. Starting from 1988, we follow Hribar and Collins (2002) to measure Oa using the state- ment of cash flows as net income (item NI) minus net cash flow from operations (item OANCF). 
Doing so helps mitigate measurement errors that can arise from nonoperating activities such as ac- quisitions and divestitures. Data from the statement of cash flows are only available since 1988. At the end of June of each year t, we sort stocks into deciles on Oa for the fiscal year ending in calendar year t - 1 scaled by total assets (item AT) for the fiscal year ending in t - 2. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1. +" +A.3.17,Ta,Total Accruals,"Richardson, Sloan, Soliman, and Tuna",2005,Investment,data_rawa['ta'] = data_rawa['dwc'] + data_rawa['dnco'] + data_rawa['dfin'],"#dwc +data_rawa['dwc'] = (data_rawa['act'] - data_rawa['che']) - (data_rawa['lct'] - data_rawa['dlc']) +* dnco +* dfin +",0,1,0,0,"Prior to 1988, we use the balance sheet approach in Richardson, Sloan, Soliman, and Tuna (2005) to measure total accruals, Ta, as dWc + dNco + dFin. dWc is the change in net non-cash working capital. Net non-cash working capital is current operating asset (Coa) minus current operating liabilities (Col), with Coa = current assets (Compustat annual item ACT) - cash and short-term investments (item CHE) and Col = current liabilities (item LCT) - debt in current liabilities (item DLC). dNco is the change in net non-current operating assets. Net non-current operating assets are non-current operating assets (Nca) minus non-current operating liabilities (Ncl), with Nca = total assets (item AT) - current assets - long-term investments (item IVAO), and Ncl = total liabilities (item LT) - current liabilities - long-term debt (item DLTT). dFin is the change in net financial assets. Net financial assets are financial assets (Fna) minus financial liabilities (Fnl), with Fna = short-term investments (item IVST) + long-term investments, and Fnl = long-term debt + debt in current liabilities + preferred stocks (item PSTK). 
Missing changes in debt in current liabilities, long-term investments, long-term debt, short-term investments, and preferred stocks are set to zero. +Starting from 1988, we use the cash flow approach to measure Ta as net income (item NI) minus total operating, investing, and financing cash flows (items OANCF, IVNCF, and FINCF) plus sales of stocks (item SSTK, zero if missing) minus stock repurchases and dividends (items PRSTKC and DV, zero if missing). Data from the statement of cash flows are only available since 1988. At the end of June of each year t, we sort stocks into deciles based on Ta for the fiscal year ending in calendar year t - 1 scaled by total assets for the fiscal year ending in t - 2. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.3.18,dCoa,changes in Current Operating Assets,"Richardson, Sloan, Soliman, and Tuna",2005,Investment,data_rawa['dcoa'] = (data_rawa['coa']-data_rawa['coa'].shift(1)) / data_rawa['at'].shift(1),"# dCoa +data_rawa['coa'] = data_rawa['act'] - data_rawa['che']",0,1,0,0,"Richardson, Sloan, Soliman, and Tuna (2005, Table 10) show that several components of total accruals also forecast returns in the cross section. dWc is the change in net non-cash working capital. Net non-cash working capital is current operating asset (Coa) minus current operating liabilities (Col), with Coa = current assets (Compustat annual item ACT) - cash and short term investments (item CHE) and Col = current liabilities (item LCT) - debt in current liabilities (item DLC). dCoa is the change in current operating asset and dCol is the change in current operating liabilities. Missing changes in debt in current liabilities are set to zero. 
At the end of June of each year t, we sort stocks into deciles based, separately, on dWc, dCoa, and dCol for the fiscal year ending in calendar year t - 1, all scaled by total assets (item AT) for the fiscal year ending in calendar year t - 2. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.3.19,dNca,changes in Non-current Operating Assets,"Richardson, Sloan, Soliman, and Tuna",2005,Investment,data_rawa['dnca'] = data_rawa['nco'] - data_rawa['nco'].shift(1),"# dNca +data_rawa['ivao_0'] = np.where(data_rawa['ivao'].isnull(), 0, data_rawa['ivao']) +data_rawa['dltt_0'] = np.where(data_rawa['dltt'].isnull(), 0, data_rawa['dltt']) +data_rawa['nca'] = data_rawa['at'] - data_rawa['act'] - data_rawa['ivao_0'] +data_rawa['ncl'] = data_rawa['lt'] - data_rawa['lct'] - data_rawa['dltt_0'] +data_rawa['nco'] = data_rawa['nca'] - data_rawa['ncl'] +",0,1,0,0,"dNco is the change in net non-current operating assets. Net non-current operating assets are non- current operating assets (Nca) minus non-current operating liabilities (Ncl), with Nca = total assets (Compustat annual item AT) - current assets (item ACT) - long-term investments (item IVAO), and Ncl = total liabilities (item LT) - current liabilities (item LCT) - long-term debt (item DLTT). dNca is the change in non-current operating assets and dNcl is the change in non-current operating liabilities. Missing changes in long-term investments and long-term debt are set to zero. At the end of June of each year t, we sort stocks into deciles based, separately, on dNco, dNca, and dNcl for the fiscal year ending in calendar year t - 1, all scaled by total assets for the fiscal year ending in calendar year t - 2. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." 
+A.3.19,dNco,Changes in Net Non-current Operating Assets,"Richardson, Sloan, Soliman, and Tuna",2005,Investment,data_rawa['dnco'] = data_rawa['nco'] - data_rawa['nco'].shift(1),"# dNco +data_rawa['nca'] = data_rawa['at'] - data_rawa['act'] - data_rawa['ivao'] +data_rawa['ncl'] = data_rawa['lt'] - data_rawa['lct'] - data_rawa['dltt'] +data_rawa['nco'] = data_rawa['nca'] - data_rawa['ncl']",0,1,0,0,"dNco is the change in net non-current operating assets. Net non-current operating assets are non- current operating assets (Nca) minus non-current operating liabilities (Ncl), with Nca = total assets (Compustat annual item AT) - current assets (item ACT) - long-term investments (item IVAO), and Ncl = total liabilities (item LT) - current liabilities (item LCT) - long-term debt (item DLTT). dNca is the change in non-current operating assets and dNcl is the change in non-current operating liabilities. Missing changes in long-term investments and long-term debt are set to zero. At the end of June of each year t, we sort stocks into deciles based, separately, on dNco, dNca, and dNcl for the fiscal year ending in calendar year t - 1, all scaled by total assets for the fiscal year ending in calendar year t - 2. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.3.2,I/A, Investment-to-assets,"Cooper, Gulen, and Schill",2008,Investment,data_rawa['ia'] = (data_rawa['at']/data_rawa['at_l1'])-1,data_rawa['at_l1'] = data_rawa.groupby(['permno'])['at'].shift(1),1,1,0,0,"At the end of June of each year t, we sort stocks into deciles based on investment-to-assets, I/A, which is measured as total assets (Compustat annual item AT) for the fiscal year ending in calendar year t-1 divided by total assets for the fiscal year ending in t-2 minus one. Monthly decile returns are computed from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." 
+A.3.20,dBe,Changes in Book Equity,"Richardson, Sloan, Soliman, and Tuna",2005,Investment,data_rawa['dBe'] = (data_rawa['ceq'] - data_rawa['ceq'].shift(1)) / data_rawa['at'].shift(1),,0,1,0,0,"dFin is the change in net financial assets. Net financial assets are financial assets (Fna) minus financial liabilities (Fnl), with Fna = short-term investments (Compustat annual item IVST) + long-term investments (item IVAO), and Fnl = long-term debt (item DLTT) + debt in current liabilities (item DLC) + preferred stock (item PSTK). dSti is the change in short-term investments, dLti is the change in long-term investments, and dFnl is the change in financial liabilities. dBe is the change in book equity (item CEQ). Missing changes in debt in current liabilities, long-term investments, long-term debt, short-term investments, and preferred stocks are set to zero (at least one change has to be non-missing when constructing any variable). When constructing dSti (dLti), we exclude firms that do not have long-term (short-term) investments in the past two fiscal years. At the end of June of each year t, we sort stocks into deciles based, separately, on dFin, dSti, dLti, dFnl, and dBe for the fiscal year ending in calendar year t - 1, all scaled by total assets (item AT) for the fiscal year ending in calendar year t - 2. Monthly decile returns are calculated from July of year t to June of t+1, and the deciles are rebalanced in June of t+1."
+A.3.20,dFin,Changes in Net Financial Assets,"Richardson, Sloan, Soliman, and Tuna",2005,Investment,data_rawa['dfin'] = data_rawa['dfin'] / data_rawa['at'].shift(1),"data_rawa['fna'] = data_rawa['ivst'] + data_rawa['ivao']
+data_rawa['fnl'] = data_rawa['dltt'] + data_rawa['dlc'] + data_rawa['pstk']
+
+data_rawa['d_dlc'] = data_rawa['dlc'] - data_rawa['dlc'].shift(1)
+data_rawa['d_dlc'] = np.where(data_rawa['d_dlc'].isnull(), 0, data_rawa['d_dlc'])
+data_rawa['d_pstk'] = data_rawa['pstk'] - data_rawa['pstk'].shift(1)
+data_rawa['d_pstk'] = np.where(data_rawa['d_pstk'].isnull(), 0, data_rawa['d_pstk'])
+
+data_rawa['dfnl'] = (data_rawa['dltt']-data_rawa['dltt'].shift(1)) + data_rawa['d_dlc'] + data_rawa['d_pstk']
+
+data_rawa['d_ivst'] = data_rawa['ivst'] - data_rawa['ivst'].shift(1)
+data_rawa['d_ivst'] = np.where(data_rawa['d_ivst'].isnull(), 0, data_rawa['d_ivst'])
+data_rawa['d_ivao'] = data_rawa['ivao'] - data_rawa['ivao'].shift(1)
+data_rawa['d_ivao'] = np.where(data_rawa['d_ivao'].isnull(), 0, data_rawa['d_ivao'])
+
+data_rawa['dfna'] = data_rawa['d_ivst'] + data_rawa['d_ivao']
+data_rawa['dfin'] = data_rawa['dfna'] - data_rawa['dfnl']",0,1,0,0,"dFin is the change in net financial assets. Net financial assets are financial assets (Fna) minus financial liabilities (Fnl), with Fna = short-term investments (Compustat annual item IVST) + long-term investments (item IVAO), and Fnl = long-term debt (item DLTT) + debt in current liabilities (item DLC) + preferred stock (item PSTK). dSti is the change in short-term investments, dLti is the change in long-term investments, and dFnl is the change in financial liabilities. dBe is the change in book equity (item CEQ). Missing changes in debt in current liabilities, long-term investments, long-term debt, short-term investments, and preferred stocks are set to zero (at least one change has to be non-missing when constructing any variable).
When constructing dSti (dLti), we exclude firms that do not have long-term (short-term) investments in the past two fiscal years. At the end of June of each year t, we sort stocks into deciles based, separately, on dFin, dSti, dLti, dFnl, and dBe for the fiscal year ending in calendar year t - 1, all scaled by total assets (item AT) for the fiscal year ending in calendar year t - 2. Monthly decile returns are calculated from July of year t to June of t+1, and the deciles are rebalanced in June of t+1."
+A.3.20,dFnl,Changes in Financial Liabilities,"Richardson, Sloan, Soliman, and Tuna",2005,Investment,data_rawa['dfnl'] = data_rawa['dfnl'] / data_rawa['at'].shift(1),* dfnl in dFin,0,1,0,0,"dFin is the change in net financial assets. Net financial assets are financial assets (Fna) minus financial liabilities (Fnl), with Fna = short-term investments (Compustat annual item IVST) + long-term investments (item IVAO), and Fnl = long-term debt (item DLTT) + debt in current liabilities (item DLC) + preferred stock (item PSTK). dSti is the change in short-term investments, dLti is the change in long-term investments, and dFnl is the change in financial liabilities. dBe is the change in book equity (item CEQ). Missing changes in debt in current liabilities, long-term investments, long-term debt, short-term investments, and preferred stocks are set to zero (at least one change has to be non-missing when constructing any variable). When constructing dSti (dLti), we exclude firms that do not have long-term (short-term) investments in the past two fiscal years. At the end of June of each year t, we sort stocks into deciles based, separately, on dFin, dSti, dLti, dFnl, and dBe for the fiscal year ending in calendar year t - 1, all scaled by total assets (item AT) for the fiscal year ending in calendar year t - 2. Monthly decile returns are calculated from July of year t to June of t+1, and the deciles are rebalanced in June of t+1."
+A.3.22,Poa,Percent operating accruals,"Hafzalla, Lundholm, and Van Winkle",2011,Investment,data_rawa['poa'] = data_rawa['oa']/data_rawa['ni'],* oa(acc),0,1,0,0,"Accruals are traditionally scaled by total assets. Hafzalla, Lundholm, and Van Winkle (2011) show that scaling accruals by the absolute value of earnings (percent accruals) is more effective in se- lecting firms for which the differences between sophisticated and naive forecasts of earnings are the most extreme. To construct the percent operating accruals (Poa) deciles, at the end of June of each year t, we sort stocks into deciles based on operating accruals scaled by the absolute value of net income (Compustat annual item NI) for the fiscal year ending in calendar year t - 1. See Appendix A.3.16 for the measurement of operating accruals. Monthly decile returns are calculated from July of year t to June of t+1, and the deciles are rebalanced in June of t+1." +A.3.23,Pta,Percent total accruals,"Hafzalla, Lundholm, and Van Winkle",2011,Investment,,,,,,,"At the end of June of each year t, we sort stocks into deciles on percent total accruals, Pta, cal- culated as total accruals scaled by the absolute value of net income (Compustat annual item NI) for the fiscal year ending in calendar year t - 1. See Appendix A.3.17 for the measurement of total accruals. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of year t + 1." +A.3.24,Pda,Percent discretionary accruals,,,Investment,,,,,,,"At the end of June of each year t, we split stocks into deciles based on percent discretionary accruals, Pda, calculated as the discretionary accruals, Dac, for the fiscal year ending in calendar year t - 1 multiplied with total assets (Compustat annual item AT) for the fiscal year ending in t - 2 scaled by the absolute value of net income (item NI) for the fiscal year ending in t - 1. See Appendix A.3.21 for the measurement of discretionary accruals. 
Monthly decile returns are calculated from July of year t to June of t+1, and the deciles are rebalanced in June of t+1." +A.3.25,Ndf,Net debt finance,"Bradshaw, Richardson, and Sloan",2006,Investment,data_rawa['ndf'] = data_rawa['dltis'] - data_rawa['dltr'] + data_rawa['dlcch'] ,,0,1,0,0,"Net external financing, Nxf, is the sum of net equity financing, Nef, and net debt financing, Ndf (Bradshaw, Richardson, and Sloan 2006). Nef is the proceeds from the sale of common and pre- ferred stocks (Compustat annual item SSTK) less cash payments for the repurchases of common and preferred stocks (item PRSTKC) less cash payments for dividends (item DV). Ndf is the cash proceeds from the issuance of long-term debt (item DLTIS) less cash payments for long-term debt reductions (item DLTR) plus the net changes in current debt (item DLCCH, zero if missing). At the end of June of each year t, we sort stocks into deciles based on Nxf, and, separately, on Nef and Ndf, for the fiscal year ending in calendar year t - 1 scaled by the average of total assets for fiscal years ending in t - 2 and t - 1. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1. Because the data on financing activities start in 1971, the portfolios start in July 1972." +A.3.3,"Ia q 6, and Ia q 12",Quarterly Investment-to-assets,"Cooper, Gulen, and Schill",2008,Investment,data_rawq['iaq'] = (data_rawq['atq']/data_rawq['atqlag'])-1,"data_rawq['atqlag'] = ttm4('atq',data_rawq)",0,0,1,0,"Quarterly investment-to-assets, Iaq, is defined as quarterly total assets (Compustat quarterly item ATQ) divided by four-quarter-lagged total assets minus one. At the beginning of each month t, we sort stocks into deciles based on Iaq for the latest fiscal quarter ending at least four months ago. 
Monthly decile returns are calculated for the current month t (Iaq1), from month t to t + 5 (Iaq6), and from month t to t + 11 (Iaq12), and the deciles are rebalanced at the beginning of month t + 1. The holding period longer than one month as in, for instance, Iaq6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Iaq6 decile." +A.3.4,dPia,Changes in PPE and Inventory-to-assets,"Lyandres, Sun, and Zhang",2008,Investment,data_rawa['dpia'] = (data_rawa['c_propty'] + data_rawa['c_invt']) / data_rawa['at'].shift(1),"data_rawa['c_propty'] = data_rawa['ppegt'] - data_rawa['ppegt'].shift(1) +data_rawa['c_invt'] = data_rawa['invt'] - data_rawa['invt'].shift(1)",0,1,0,0,"Changes in PPE and Inventory-to-assets, dPia, is defined as the annual change in gross property, plant, and equipment (Compustat annual item PPEGT) plus the annual change in inventory (item INVT) scaled by one-year-lagged total assets (item AT). At the end of June of each year t, we sort stocks into deciles based on dPia for the fiscal year ending in calendar year t-1. Monthly decile re- turns are computed from July of year t to June of t+1, and the deciles are rebalanced in June of t+1." 
+A.3.5,Noa and dNoa,(Changes in) Net Operating Assets,"Hirshleifer, Hou, Teoh, and Zhang",2004,Investment,"data_rawq['noa'] = (data_rawq['atq']-data_rawq['cheq']-data_rawq['ivaoq'])-\ + (data_rawq['atq']-data_rawq['dlcq']-data_rawq['dlttq']-data_rawq['mibq']-data_rawq['pstkq']-data_rawq['ceqq'])/data_rawq['atq_l4'] +data_rawa['dnoa'] = (data_rawa['net_op']-data_rawa['net_op'].shift(1))/ data_rawa['at'].shift(1) +","#noa +data_rawq['atq_l4'] = data_rawq.groupby(['permno'])['atq'].shift(4) +data_rawq['ivaoq'] = np.where(data_rawq['ivaoq'].isnull(), 0, 1) +data_rawq['dlcq'] = np.where(data_rawq['dlcq'].isnull(), 0, 1) +data_rawq['dlttq'] = np.where(data_rawq['dlttq'].isnull(), 0, 1) +data_rawq['mibq'] = np.where(data_rawq['mibq'].isnull(), 0, 1) +data_rawq['pstkq'] = np.where(data_rawq['pstkq'].isnull(), 0, 1) +# dNoa +data_rawa['dlc_0'] = np.where(data_rawa['dlc'].isnull(), 0, data_rawa['dlc']) +data_rawa['dltt_0'] = np.where(data_rawa['dltt'].isnull(), 0, data_rawa['dltt']) +data_rawa['mib_0'] = np.where(data_rawa['mib'].isnull(), 0, data_rawa['mib']) +data_rawa['pstk_0'] = np.where(data_rawa['pstk'].isnull(), 0, data_rawa['pstk']) + +data_rawa['op_at'] = data_rawa['at'] - data_rawa['che'] +data_rawa['op_lia'] = data_rawa['at'] - data_rawa['dlc_0'] - data_rawa['dltt_0'] - data_rawa['mib_0'] - data_rawa['pstk_0'] - data_rawa['ceq'] +data_rawa['net_op'] = data_rawa['op_at'] - data_rawa['op_lia']",1,1,1,0,"Following Hirshleifer, Hou, Teoh, and Zhang (2004), we measure net operating assets as operating assets minus operating liabilities. Operating assets are total assets (Compustat annual item AT) minus cash and short-term investment (item CHE). Operating liabilities are total assets minus debt included in current liabilities (item DLC, zero if missing), minus long-term debt (item DLTT, zero if missing), minus minority interests (item MIB, zero if missing), minus preferred stocks (item PSTK, zero if missing), and minus common equity (item CEQ). 
Noa is net operating assets scaled by one-year-lagged total assets. Changes in net operating assets, dNoa, is the annual change in net operating assets scaled by one-year-lagged total assets. At the end of June of each year t, we sort stocks into deciles based on Noa, and separately, on dNOA, for the fiscal year ending in calendar year t - 1. Monthly decile returns are computed from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1."
+A.3.6,dLno,Changes in Long-term Net Operating Assets,"Fairfield, Whisenant, and Yohn",2003,Investment,"data_rawa['dlno'] = (data_rawa['ppent']-data_rawa['ppent'].shift(1)) + (data_rawa['intan']-data_rawa['intan'].shift(1)) + (data_rawa['ao']-data_rawa['ao'].shift(1)) - (data_rawa['lo']-data_rawa['lo'].shift(1)) + data_rawa['dp']
+*
+data_rawa['dlno'] = data_rawa['dlno'] / data_rawa['avg_at']","*
+avg_at = []
+for i in range(data_rawa.shape[0]):
+    avg_at.append(data_rawa.loc[0:i, 'at'].mean())
+data_rawa['avg_at'] = pd.DataFrame(avg_at)",0,1,0,0,"Following Fairfield, Whisenant, and Yohn (2003), we measure changes in long-term net operating assets as the annual change in net property, plant, and equipment (Compustat item PPENT) plus the change in intangibles (item INTAN) plus the change in other long-term assets (item AO) minus the change in other long-term liabilities (item LO) and plus depreciation and amortization expense (item DP). dLno is the change in long-term net operating assets scaled by the average of total assets (item AT) from the current and prior years. At the end of June of each year t, we sort stocks into deciles based on dLno for the fiscal year ending in calendar year t - 1. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1."
+A.3.7,Ig,Investment Growth,Xing,2008,Investment,data_rawa['ig'] = data_rawa['capx']/data_rawa['capx_l1'],data_rawa['capx_l1'] = data_rawa.groupby('permno')['capx'].shift(1),1,1,0,0,"At the end of June of each year t, we sort stocks into deciles based on investment growth, Ig, which is the growth rate in capital expenditure (Compustat annual item CAPX) from the fiscal year ending in calendar year t - 2 to the fiscal year ending in t - 1. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.3.8,2Ig,2-year Investment Growth,Anderson and Garcia-Feijoo,2006,Investment,data_rawa['2ig'] = data_rawa['capx']/data_rawa['capx_l2'],data_rawa['capx_l2'] = data_rawa.groupby('permno')['capx'].shift(2),0,1,0,0,"At the end of June of each year t, we sort stocks into deciles based on two-year investment growth, 2Ig, which is the growth rate in capital expenditure (Compustat annual item CAPX) from the fiscal year ending in calendar year t - 3 to the fiscal year ending in t - 1. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.4.1,"Roe1, Roe6",Return on Equity,"Hou, Xue, and Zhang",2015,Profitability,data_rawq['roe'] = data_rawq['ibq']/data_rawq['ceqq_l1'],data_rawq['ceqq_l1'] = data_rawq.groupby(['permno'])['ceqq'].shift(1),1,0,1,0,"Return on equity, Roe, is income before extraordinary items (Compustat quarterly item IBQ) di- vided by one-quarter-lagged book equity (Hou, Xue, and Zhang 2015). Book equity is shareholders' equity, plus balance sheet deferred taxes and investment tax credit (item TXDITCQ) if available, minus the book value of preferred stock (item PSTKQ). Depending on availability, we use stockhold- ers' equity (item SEQQ), or common equity (item CEQQ) plus the book value of preferred stock, or total assets (item ATQ) minus total liabilities (item LTQ) in that order as shareholders' equity." 
+A.4.11,"Gla q 1, Gla q 6, and Gla q 12",Quarterly Gross Profits-to-lagged Assets,,,Profitability,,,,,,,"Glaq, is quarterly total revenue (Compustat quarterly item REVTQ) minus cost of goods sold (item COGSQ) divided by one-quarter-lagged total assets (item ATQ). At the beginning of each month t, we sort stocks into deciles based on Glaq for the fiscal quarter ending at least four months ago. Monthly decile returns are calculated for month t (Glaq1), from month t to t+5 (Glaq6), and from month t to t + 11 (Glaq12). The deciles are rebalanced at the beginning of t + 1. The holding period that is longer than one month as in, for instance, Glaq6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six-month period. We take the simple average of the subdecile returns as the monthly return of the Glaq6 decile. For sufficient data coverage, the Glaq portfolios start in January 1976." +A.4.12,Ope(operprof),Operating Profits to Equity,Fama and French,2015,Profitability,data_rawa['operprof'] = (data_rawa['revt']-data_rawa['cogs']-data_rawa['xsga0']-data_rawa['xint0'])/data_rawa['ceq_l1'],"data_rawa['cogs0'] = np.where(data_rawa['cogs'].isnull(), 0, data_rawa['cogs']) +data_rawa['xint0'] = np.where(data_rawa['xint'].isnull(), 0, data_rawa['xint']) +data_rawa['xsga0'] = np.where(data_rawa['xsga'].isnull(), 0, data_rawa['xsga'])",0,1,0,0,"Following Fama and French (2015), we measure operating profitability to equity, Ope, as total rev- enue (Compustat annual item REVT) minus cost of goods sold (item COGS, zero if missing), minus selling, general, and administrative expenses (item XSGA, zero if missing), and minus interest ex- pense (item XINT, zero if missing), scaled by book equity (the denominator is current, not lagged, book equity). We require at least one of the three expense items (COGS, XSGA, and XINT) to be non-missing. 
Book equity is stockholders' book equity, plus balance sheet deferred taxes and investment tax credit (item TXDITC) if available, minus the book value of preferred stock. Stock- holders' equity is the value reported by Compustat (item SEQ), if it is available. If not, we measure stockholders' equity as the book value of common equity (item CEQ) plus the par value of preferred stock (item PSTK), or the book value of assets (item AT) minus total liabilities (item LT). Depend- ing on availability, we use redemption (item PSTKRV), liquidating (item PSTKL), or par value (item PSTK) for the book value of preferred stock. At the end of June of each year t, we sort stocks into deciles based on Ope for the fiscal year ending in calendar year t - 1. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.4.14,"Ole q 1, Ole q 6 ",Quarterly Operating Profits-to-lagged Equity,,,Profitability,,,,,,,"Quarterly operating profits-to-lagged equity, Oleq, is quarterly total revenue (Compustat quarterly item REVTQ) minus cost of goods sold (item COGSQ, zero if missing), minus selling, general, and administrative expenses (item XSGAQ, zero if missing), and minus interest expense (item XINTQ, zero if missing), scaled by one-quarter-lagged book equity. We require at least one of the three expense items (COGSQ, XSGAQ, and XINTQ) to be non-missing. Book equity is shareholders' equity, plus balance sheet deferred taxes and investment tax credit (item TXDITCQ) if available, minus the book value of preferred stock (item PSTKQ). Depending on availability, we use stockhold- ers' equity (item SEQQ), or common equity (item CEQQ) plus the book value of preferred stock, or total assets (item ATQ) minus total liabilities (item LTQ) in that order as shareholders' equity. +At the beginning of each month t, we split stocks on Oleq for the fiscal quarter ending at least four months ago. 
Monthly decile returns are calculated for month t (Oleq 1), from month t to t + 5 (Oleq6), and from month t to t + 11 (Oleq12). The deciles are rebalanced at the beginning of t + 1. The holding period longer than one month as in Oleq6 means that for a given decile in each month there exist six subdeciles, each initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Oleq6 decile. For sufficient data coverage, the Oleq portfolios start in January 1972." +A.4.15,Opa,Operating Profits-to-assets,"Linnainmaa, and Nikolaev",2015,Profitability,,,,,,,"Following Ball, Gerakos, Linnainmaa, and Nikolaev (2015), we measure operating profits-to-assets, Opa, as total revenue (Compustat annual item REVT) minus cost of goods sold (item COGS), minus selling, general, and administrative expenses (item XSGA), and plus research and develop- ment expenditures (item XRD, zero if missing), scaled by book assets (item AT, the denominator is current, not lagged, total assets). At the end of June of each year t, we sort stocks into deciles based on Opa for the fiscal year ending in calendar year t-1. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.4.17,"Ola q 1, Ola q 6, and Ola q 12",Quarterly Operating Profits-to-lagged Assets,,,Profitability,,,,,,,"Quarterly operating profits-to-lagged assets, Olaq, is quarterly total revenue (Compustat quarterly item REVTQ) minus cost of goods sold (item COGSQ), minus selling, general, and administra- tive expenses (item XSGAQ), plus research and development expenditures (item XRDQ, zero if missing), scaled by one-quarter-lagged book assets (item ATQ). At the beginning of each month t, we sort stocks into deciles based on Olaq for the fiscal quarter ending at least four months ago. 
Monthly decile returns are calculated for month t (Olaq1), from month t to t+5 (Olaq6), and from month t to t + 11 (Olaq12). The deciles are rebalanced at the beginning of t + 1. The holding period longer than one month as in Olaq6 means that for a given decile in each month there exist six subdeciles, each initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Olaq6 decile. For sufficient data coverage, the Olaq portfolios start in January 1976." +A.4.18,Cop,Cash-based Operating Profitability,"Gerakos, Linnainmaa, and Nikolaev",2016,Profitability,"data_rawa['cop'] = data_rawa['revt'] - data_rawa['cogs'] - data_rawa['xsga'] + data_rawa['xrd_0']\ + - data_rawa['d_rect_0'] - data_rawa['d_invt_0'] - data_rawa['d_xpp_0']\ + + data_rawa['d_dr_0'] + data_rawa['d_ap_0'] + data_rawa['d_xacc_0'] +data_rawa['cop'] = data_rawa['cop'] / data_rawa['at'] ",* Cla,0,1,0,0,"Following Ball, Gerakos, Linnainmaa, and Nikolaev (2016), we measure cash-based operating prof- itability, Cop, as total revenue (Compustat annual item REVT) minus cost of goods sold (item COGS), minus selling, general, and administrative expenses (item XSGA), plus research and de- velopment expenditures (item XRD, zero if missing), minus change in accounts receivable (item RECT), minus change in inventory (item INVT), minus change in prepaid expenses (item XPP), plus change in deferred revenue (item DRC plus item DRLT), plus change in trade accounts payable (item AP), and plus change in accrued expenses (item XACC), all scaled by book assets (item AT, the denominator is current, not lagged, total assets). All changes are annual changes in balance sheet items and we set missing changes to zero. At the end of June of each year t, we sort stocks into deciles based on Cop for the fiscal year ending in calendar year t - 1. 
Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.4.19,Cla,Cash-based Operating Profits-to-lagged Assets,,,Profitability,"data_rawa['cla'] = data_rawa['revt'] - data_rawa['cogs'] - data_rawa['xsga'] + data_rawa['xrd_0']\ + - data_rawa['d_rect_0'] - data_rawa['d_invt_0'] - data_rawa['d_xpp_0']\ + + data_rawa['d_dr_0'] + data_rawa['d_ap_0'] + data_rawa['d_xacc_0'] +data_rawa['cla'] = data_rawa['cla'] / data_rawa['at'].shift(1)","data_rawa['d_rect'] = data_rawa['rect'] - data_rawa['rect'].shift(1) +data_rawa['d_invt'] = data_rawa['invt'] - data_rawa['invt'].shift(1) +data_rawa['d_xpp'] = data_rawa['xpp'] - data_rawa['xpp'].shift(1) +data_rawa['d_dr'] = (data_rawa['drc']-data_rawa['drc'].shift(1)) + (data_rawa['drlt']-data_rawa['drlt'].shift(1)) +data_rawa['d_ap'] = data_rawa['ap'] - data_rawa['ap'].shift(1) +data_rawa['d_xacc'] = data_rawa['xacc'] - data_rawa['xacc'].shift(1) + +data_rawa['xrd_0'] = np.where(data_rawa['xrd'].isnull(), 0, data_rawa['xrd']) +data_rawa['d_rect_0'] = np.where(data_rawa['d_rect'].isnull(), 0, data_rawa['d_rect']) +data_rawa['d_invt_0'] = np.where(data_rawa['d_invt'].isnull(), 0, data_rawa['d_invt']) +data_rawa['d_xpp_0'] = np.where(data_rawa['d_xpp'].isnull(), 0, data_rawa['d_xpp']) +data_rawa['d_dr_0'] = np.where(data_rawa['d_dr'].isnull(), 0, data_rawa['d_dr']) +data_rawa['d_ap_0'] = np.where(data_rawa['d_ap'].isnull(), 0, data_rawa['d_ap']) +data_rawa['d_xacc_0'] = np.where(data_rawa['d_xacc'].isnull(), 0, data_rawa['d_xacc'])",0,1,0,0,"Cash-based operating profits-to-lagged assets, Cla, is total revenue (Compustat annual item REVT) minus cost of goods sold (item COGS), minus selling, general, and administrative expenses (item XSGA), plus research and development expenditures (item XRD, zero if missing), minus change in accounts receivable (item RECT), minus change in inventory (item INVT), minus change in prepaid expenses (item XPP), plus change in deferred 
revenue (item DRC plus item DRLT), plus change in trade accounts payable (item AP), and plus change in accrued expenses (item XACC), all scaled by one-year-lagged book assets (item AT). All changes are annual changes in balance sheet items and we set missing changes to zero. At the end of June of each year t, we sort stocks into deciles based on Cla for the fiscal year ending in calendar year t - 1. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.4.2,"dRoe1, dRoe6, and dRoe12",Changes in Return on Equity,"Hou, Xue, and Zhang",2015,Profitability,,,,,,,"Change in return on equity, dRoe, is return on equity minus its value from four quarters ago. See Appendix A.4.1 for the measurement of return on equity. At the beginning of each month t, we sort all stocks into deciles on their most recent past dRoe. Before 1972, we use the most recent dRoe with quarterly earnings from fiscal quarters ending at least four months ago. Starting from 1972, we use dRoe computed with quarterly earnings from the most recent quarterly earnings announcement dates (Compustat quarterly item RDQ). For a firm to enter the portfolio formation, we require the end of the fiscal quarter that corresponds to its most recent dRoe to be within six months prior to the portfolio formation. This restriction is imposed to exclude stale earnings information. To avoid potentially erroneous records, we also require the earnings announcement date to be after the corresponding fiscal quarter end. Monthly decile returns are calculated for the current month t (dRoe1), from month t to t + 5 (dRoe6), and from month t to t + 11 (dRoe12). The deciles are rebalanced monthly. The holding period that is longer than one month as in, for instance, dRoe6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six-month period. 
We take the simple average of the subdeciles returns as the monthly return of the dRoe6 decile." +A.4.20,Claq,Quarterly Cash-based Operating Profits-to-lagged Assets,,,Profitability,,,,,,,"Quarterly cash-based operating profits-to-lagged assets, Cla, is quarterly total revenue (Compustat quarterly item REVTQ) minus cost of goods sold (item COGSQ), minus selling, general, and ad- ministrative expenses (item XSGAQ), plus research and development expenditures (item XRDQ, zero if missing), minus change in accounts receivable (item RECTQ), minus change in inventory (item INVTQ), plus change in deferred revenue (item DRCQ plus item DRLTQ), and plus change in trade accounts payable (item APQ), all scaled by one-quarter-lagged book assets (item ATQ). All changes are quarterly changes in balance sheet items and we set missing changes to zero. At the beginning of each month t, we split stocks on Claq for the fiscal quarter ending at least four months ago. Monthly decile returns are calculated for month t (Claq1), from month t to t + 5 (Claq6), and from month t to t + 11 (Claq12). The deciles are rebalanced at the beginning of t + 1. The holding period longer than one month as in Claq6 means that for a given decile in each month there exist six subdeciles, each initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Claq6 decile. For sufficient data coverage, the Claq portfolios start in January 1976." +A.4.29,Tbi q 12,Quarterly Taxable Income-to-book Income,"Green, Hand, and Zhang",2013,Profitability,,,,,,,"Quarterly taxable income-to-book income, Tbiq, is quarterly pretax income (Compustat quarterly item PIQ) divided by net income (NIQ). At the beginning of each month t, we split stocks into deciles based on Tbiq calculated with accounting data from the fiscal quarter ending at least four months ago. We exclude firms with non-positive pretax income or net income. 
We calculate monthly decile returns for the current month t (Tbiq1), from month t to t + 5 (Tbiq6), and from month t to t+11 (Tbiq12). The deciles are rebalanced at the beginning of month t+1. The holding period that is longer than one month as in, for instance, Tbiq6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six-month period. We take the simple average of the subdecile returns as the monthly return of the Tbiq6 decile." +A.4.3,Roa1,Return on Assets,"Balakrishnan, Bartov, and Faurel",2010,Profitability,data_rawq['roa'] = data_rawq['ibq']/data_rawq['atq_l1'],data_rawq['atq_l1'] = data_rawq.groupby(['permno'])['atq'].shift(1),1,0,1,0,"Return on assets, Roa, is income before extraordinary items (Compustat quarterly item IBQ) di- vided by one-quarter-lagged total assets (item ATQ). At the beginning of each month t, we sort all stocks into deciles based on Roa computed with quarterly earnings from the most recent earnings announcement dates (item RDQ). For a firm to enter the portfolio formation, we require the end of the fiscal quarter that corresponds to its most recent Roa to be within six months prior to the portfolio formation. This restriction is imposed to exclude stale earnings information. To avoid potentially erroneous records, we also require the earnings announcement date to be after the corre- sponding fiscal quarter end. Monthly decile returns are calculated for month t (Roa1), from month t to t+5 (Roe6), and from month t to t+11 (Roe12). The deciles are rebalanced at the beginning of t + 1. The holding period that is longer than one month as in, for instance, Roa6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six-month period. We take the simple average of the subdeciles returns as the monthly return of the Roa6 decile. 
For sufficient data coverage, the Roa portfolios start in January 1972." +A.4.4,"dRoa1, dRoa6",Changes in Return on Assets,"Balakrishnan, Bartov, and Faurel",2010,Profitability,,,,,,,"Change in return on assets, dRoa, is return on assets minus its value from four quarters ago. See Appendix A.4.3 for the measurement of return on assets. At the beginning of each month t, we sort all stocks into deciles based on dRoa computed with quarterly earnings from the most recent earnings announcement dates (Compustat quarterly item RDQ). For a firm to enter the portfo- lio formation, we require the end of the fiscal quarter that corresponds to its most recent dRoa to be within six months prior to the portfolio formation. This restriction is imposed to exclude stale earnings information. To avoid potentially erroneous records, we also require the earnings announcement date to be after the corresponding fiscal quarter end. Monthly decile returns are calculated for month t (dRoa1), from month t to t + 5 (dRoa6), and from month t to t + 11 (dRoa12). The deciles are rebalanced at the beginning of t + 1. The holding period that is longer than one month as in, for instance, dRoa6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six-month period. We take the simple average of the subdecile returns as the monthly return of the dRoa6 decile. For sufficient data coverage, the dRoa portfolios start in January 1973." +A.4.5,Ato,Asset Turnover,Soliman,2008,Profitability,data_rawq['ato'] = data_rawq['saleq']/data_rawq['noa_l4'],"* noa +* noa_l4 from rna",1,0,1,0,"Soliman (2008) use DuPont analysis to decompose Roe as Rna + FLEV * SPREAD, in which Roe is return on equity, Rna is return on net operating assets, FLEV is financial leverage, and SPREAD is the difference between return on net operating assets and borrowing costs. 
We can further decompose Rna as Pm * Ato, in which Pm is profit margin and Ato is asset turnover. +Following Soliman (2008), we use annual sorts to form Rna, Pm, and Ato deciles. At the end of June of year t, we measure Rna as operating income after depreciation (Compustat annual item OIADP) for the fiscal year ending in calendar year t - 1 divided by net operating assets (Noa) for the fiscal year ending in t - 2. Noa is operating assets minus operating liabilities. Operating assets are total assets (item AT) minus cash and short-term investment (item CHE), and minus other investment and advances (item IVAO, zero if missing). Operating liabilities are total assets minus debt in current liabilities (item DLC, zero if missing), minus long-term debt (item DLTT, zero if missing), minus minority interests (item MIB, zero if missing), minus preferred stocks (item PSTK, zero if missing), and minus common equity (item CEQ). Pm is operating income after depreciation divided by sales (item SALE) for the fiscal year ending in calendar year t - 1. Ato is sales for the fiscal year ending in calendar year t - 1 divided by Noa for the fiscal year ending in t - 2. At the end of June of each year t, we sort stocks into three sets of deciles based on Rna, Pm, and Ato. We exclude firms with non-positive Noa for the fiscal year ending in calendar year t - 2 when forming the Rna and the Ato portfolios. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.4.5,pm,profit margin,Soliman,2008,Profitability,,,,,,,"Soliman (2008) use DuPont analysis to decompose Roe as Rna + FLEV * SPREAD, in which Roe is return on equity, Rna is return on net operating assets, FLEV is financial leverage, and SPREAD is the difference between return on net operating assets and borrowing costs. We can further decompose Rna as Pm * Ato, in which Pm is profit margin and Ato is asset turnover. 
+Following Soliman (2008), we use annual sorts to form Rna, Pm, and Ato deciles. At the end of June of year t, we measure Rna as operating income after depreciation (Compustat annual item OIADP) for the fiscal year ending in calendar year t - 1 divided by net operating assets (Noa) for the fiscal year ending in t - 2. Noa is operating assets minus operating liabilities. Operating assets are total assets (item AT) minus cash and short-term investment (item CHE), and minus other investment and advances (item IVAO, zero if missing). Operating liabilities are total assets minus debt in current liabilities (item DLC, zero if missing), minus long-term debt (item DLTT, zero if missing), minus minority interests (item MIB, zero if missing), minus preferred stocks (item PSTK, zero if missing), and minus common equity (item CEQ). Pm is operating income after depreciation divided by sales (item SALE) for the fiscal year ending in calendar year t - 1. Ato is sales for the fiscal year ending in calendar year t - 1 divided by Noa for the fiscal year ending in t - 2. At the end of June of each year t, we sort stocks into three sets of deciles based on Rna, Pm, and Ato. We exclude firms with non-positive Noa for the fiscal year ending in calendar year t - 2 when forming the Rna and the Ato portfolios. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.4.6,Cto,Capital Turnover,Haugen and Baker,1996,Profitability,data_rawa['cto'] = data_rawa['sale'] / data_rawa['at'].shift(1),,0,1,0,0,"At the end of June of each year t, we split stocks into deciles based on capital turnover, Cto, measured as sales (Compustat annual item SALE) for the fiscal year ending in calendar year t - 1 divided by total assets (item AT) for the fiscal year ending in t - 2. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." 
+A.4.7,"Rna q 1, Rna q 6, Atoq1","Quarterly Return on Net Operating Assets, Quarterly Asset Turnover",Soliman,2008,Profitability,data_rawq['rna'] = data_rawq['oiadpq']/data_rawq['noa_l4'],"* noa +data_rawq['noa_l4'] = data_rawq.groupby(['permno'])['noa'].shift(4)",1,0,1,0,"Quarterly return on net operating assets, Rnaq, is quarterly operating income after depreciation (Compustat quarterly item OIADPQ) divided by one-quarter-lagged net operating assets (Noa). Noa is operating assets minus operating liabilities. Operating assets are total assets (item ATQ) minus cash and short-term investments (item CHEQ), and minus other investment and advances (item IVAOQ, zero if missing). Operating liabilities are total assets minus debt in current liabilities (item DLCQ, zero if missing), minus long-term debt (item DLTTQ, zero if missing), minus minority interests (item MIBQ, zero if missing), minus preferred stocks (item PSTKQ, zero if missing), and minus common equity (item CEQQ). Quarterly profit margin, Pmq, is quarterly operating income after depreciation divided by quarterly sales (item SALEQ). Quarterly asset turnover, Atoq, is quarterly sales divided by one-quarter-lagged Noa. +At the beginning of each month t, we sort stocks into deciles based on Rnaq or Pmq for the latest fiscal quarter ending at least four months ago. Separately, we sort stocks into deciles based on Atoq computed with quarterly sales from the most recent earnings announcement dates (item RDQ). Sales are generally announced with earnings during quarterly earnings announcements (Je- gadeesh and Livnat 2006). For a firm to enter the portfolio formation, we require the end of the fiscal quarter that corresponds to its most recent Atoq to be within six months prior to the portfolio formation. This restriction is imposed to exclude stale information. To avoid potentially erroneous records, we also require the earnings announcement date to be after the corresponding fiscal quarter end. 
Monthly decile returns are calculated for month t (Rnaq1, Pmq1, and Atoq1), from month t to t+5 (Rnaq6, Pmq6, and Atoq6), and from month t to t+11 (Rnaq12, Pmq12, and Atoq12). The deciles are rebalanced at the beginning of t + 1. The holding period that is longer than one month as in, for instance, Atoq6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six-month period. We take the simple average of the subdecile returns as the monthly return of the Atoq6 decile. For sufficient data coverage, the Rnaq portfolios start in January 1976 and the Atoq portfolios start in January 1972. +" +A.4.8,"Cto q1, Cto q6,",Quarterly Capital Turnover,Haugen and Baker,1996,Profitability,,,,,,,"Quarterly capital turnover, Ctoq, is quarterly sales (Compustat quarterly item SALEQ) scaled by one-quarter-lagged total assets (item ATQ). At the beginning of each month t, we sort stocks into deciles based on Ctoq computed with quarterly sales from the most recent earnings announcement dates (item RDQ). Sales are generally announced with earnings during quarterly earnings announcements (Jegadeesh and Livnat 2006). For a firm to enter the portfolio formation, we require the end of the fiscal quarter that corresponds to its most recent Ctoq to be within six months prior to the portfolio formation. This restriction is imposed to exclude stale information. To avoid potentially erroneous records, we also require the earnings announcement date to be after the corresponding fiscal quarter end. Monthly decile returns are calculated for month t (Ctoq1), from month t to t+5 (Ctoq6), and from month t to t + 11 (Ctoq12). The deciles are rebalanced at the beginning of t + 1. The holding period that is longer than one month as in, for instance, Ctoq6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six-month period. 
We take the simple average of the subdecile returns as the monthly return of the Ctoq6 decile. For sufficient data coverage, the Ctoq portfolios start in January 1972. +" +A.4.9,Gpa,Gross Profits-to-assets,Novy-Marx,2013,Profitability,,,,,,,"Following Novy-Marx (2013), we measure gross profits-to-assets, Gpa, as total revenue (Compustat annual item REVT) minus cost of goods sold (item COGS) divided by total assets (item AT, the denominator is current, not lagged, total assets). At the end of June of each year t, we sort stocks into deciles based on Gpa for the fiscal year ending in calendar year t - 1. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.5.1,Oca and Ioca,(Industry-adjusted) Organizational Capital-to-assets,Eisfeldt and Papanikolaou,2013,Intangibles,,,,,,,p101 +A.5.11,Rca,R&D Capital-to-assets,Li,2011,Intangibles,,,,,,,"Following Li (2011), we measure R&D capital, Rc, by accumulating annual R&D expenses over the past five years with a linear depreciation rate of 20%: +Rcit = XRDit + 0.8 XRDit-1 + 0.6 XRDit-2 + 0.4 XRDit-3 + 0.2 XRDit-4, (A18) +in which XRDit-j is firm i's R&D expenses (Compustat annual item XRD) in year t - j. R&D capital-to-assets, Rca, is Rc scaled by total assets (item AT). At the end of June of each year t, we sort stocks into deciles based on Rca for the fiscal year ending in calendar year t - 1. We keep only firms with positive Rc. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1. For portfolio formation at the end of June of year t, we require R&D expenses to be non-missing for the fiscal year ending in calendar year t - 1, because this value of R&D expenses receives the highest weight in Rc. Because Rc requires past five years of R&D expenses data and the accounting treatment of R&D expenses was standardized in 1975, the Rca portfolios start in July 1980." 
+A.5.2,Adm,Advertising Expense-to-market,"Chan, Lakonishok, and Sougiannis",2001,Intangibles,data_rawa['adm'] = data_rawa['xad']/data_rawa['me'],* me from rawa,1,1,0,0,"At the end of June of each year t, we sort stocks into deciles based on advertising expenses-to- market, Adm, which is advertising expenses (Compustat annual item XAD) for the fiscal year ending in calendar year t - 1 divided by the market equity (from CRSP) at the end of December of t - 1. For firms with more than one share class, we merge the market equity for all share classes before computing Adm. We keep only firms with positive advertising expenses. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1. Because sufficient XAD data start in 1972, the Adm portfolios start in July 1973." +A.5.24,Etr,Effective Tax Rate,Abarbanell and Bushee,1998,Intangibles,data_rawa['etr'] = (data_rawa['txtpi'] - (data_rawa['txtpi_l1'] + data_rawa['txtpi_l2'] + data_rawa['txtpi_l3'])/3) * data_rawa['deps'],"data_rawa['txtpi'] = data_rawa['txt'] / data_rawa['pi'] +data_rawa['txtpi_l1'] = data_rawa.groupby('permno')['txtpi'].shift(1) +data_rawa['txtpi_l2'] = data_rawa.groupby('permno')['txtpi'].shift(2) +data_rawa['txtpi_l3'] = data_rawa.groupby('permno')['txtpi'].shift(3) +data_rawa['deps'] = data_rawa['epspx']/(data_rawa['ajex'] * data_rawa['prcc_f'])",0,1,0,0,p108 +A.5.4,Rdm,R&D Expense-to-market,"Chan, Lakonishok, and Sougiannis",2001,Intangibles,data_rawq['rdm'] = data_rawq['xrdq4']/data_rawq['me'],"* me from rawq +# rd +data_rawq['xrdq4'] = ttm4('xrdq', data_rawq) +data_rawq['xrdq4'] = np.where(data_rawq['xrdq4'].isnull(), data_rawq['xrdy'], data_rawq['xrdq4'])",1,0,1,0,"At the end of June of each year t, we sort stocks into deciles based on R&D-to-market, Rdm, which is R&D expenses (Compustat annual item XRD) for the fiscal year ending in calendar year t - 1 divided by the market equity (from CRSP) at the end of December of t - 1. 
For firms with more than one share class, we merge the market equity for all share classes before computing Rdm. We keep only firms with positive R&D expenses. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1. Because the accounting treatment of R&D expenses was standardized in 1975, the Rdm portfolios start in July 1976." +A.5.46,"Alm q 1, Alm q 6, and Alm q 12",Quarterly Asset Liquidity,Ortiz-Molina and Phillips,2014,Intangibles,data_rawq['alm'] = data_rawq['ala']/(data_rawq['atq']+data_rawq['me']-data_rawq['ceqq']),"data_rawq['ala'] = data_rawq['cheq'] + 0.75*(data_rawq['actq']-data_rawq['cheq'])+\ + 0.5*(data_rawq['atq']-data_rawq['actq']-data_rawq['gdwlq']-data_rawq['intanq']) +* me from rawq",1,0,1,0,"We measure quarterly asset liquidity as cash + 0.75 * noncash current assets + 0.50 * tangible fixed assets. Cash is cash and short-term investments (Compustat quarterly item CHEQ). Noncash current assets is current assets (item ACTQ) minus cash. Tangible fixed assets is total assets (item ATQ) minus current assets (item ACTQ), minus goodwill (item GDWLQ, zero if missing), and minus intangibles (item INTANQ, zero if missing). Alaq is quarterly asset liquidity scaled by one- quarter-lagged total assets. Almq is quarterly asset liquidity scaled by one-quarter-lagged market value of assets. Market value of assets is total assets plus market equity (item PRCCQ times item CSHOQ) minus book equity (item CEQQ). +At the beginning of each month t, we sort stocks into deciles based on Alaq, and separately, on Almq for the fiscal quarter ending at least four months ago. Monthly decile returns are calculated for the current month t (Alaq1 and Almq1), from month t to t + 5 (Alaq6 and Almq6), and from month t to t+11 (Alaq12 and Almq12). The deciles are rebalanced at the beginning of month t+1. 
The holding period longer than one month as in Alaq6 means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Alaq6 decile. For sufficient data coverage, the quarterly asset liquidity portfolios start in January 1976. +" +A.5.5,"Rdm q 1, Rdm q 6, and Rdm q 12",Quarterly R&D Expense-to-market,"Chan, Lakonishok, and Sougiannis",2001,Intangibles,,,,,,,"At the beginning of each month t, we split stocks into deciles based on quarterly R&D-to-market, Rdmq, which is quarterly R&D expense (Compustat quarterly item XRDQ) for the fiscal quarter ending at least four months ago scaled by the market equity (from CRSP) at the end of t - 1. For firms with more than one share class, we merge the market equity for all share classes before computing Rdmq. We keep only firms with positive R&D expenses. We calculate decile returns for the current month t (Rdmq1), from month t to t + 5 (Rdmq6), and from month t to t + 11 (Rdmq12), and the deciles are rebalanced at the beginning of month t + 1. The holding period longer than one month as in, for instance, Rdmq6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Rdmq6 decile. Because the quarterly R&D data start in late 1989, the Rdmq portfolios start in January 1990." 
+A.5.50,"R a 1 , R n 1 , R a [2,5] , R n[2,5] , R a[6,10] , R n[6,10] , R a[11,15] , and R a[16,20]",Seasonality,Heston and Sadka,2008,Intangibles,,"* crsp_mom +#Rla +crsp_mom['rla'] = crsp_mom.groupby(['permno'])['ret'].shift(12) + +#Rln +lag = pd.DataFrame() +result = 0 +for i in range(1, 12): + lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) + result = result + lag['mom%s' % i] +crsp_mom['rln'] = result/11 + +#R[2,5]a +#R[2,5]n +lag = pd.DataFrame() +result = 0 +for i in range(13,61): + lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) + if i not in [24,36,48,60]: + result = result + lag['mom%s' % i] + +crsp_mom['r25a'] = (lag['mom24']+lag['mom36']+lag['mom48']+lag['mom60'])/4 +crsp_mom['r25n'] = result/44 + +#R[6,10]a +#R[6,10]n +lag = pd.DataFrame() +result = 0 +for i in range(61,121): + lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) + if i not in [72,84,96,108,120]: + result = result + lag['mom%s' % i] + +crsp_mom['r610a'] = (lag['mom72']+lag['mom84']+lag['mom96']+lag['mom108']+lag['mom120'])/5 +crsp_mom['r610n'] = result/55 + +#R[11,15]a +lag = pd.DataFrame() +result = 0 +for i in [132,144,156,168,180]: + lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) + result = result + lag['mom%s' % i] +crsp_mom['r1115a'] = result/5 + +#R[16,20]a +lag = pd.DataFrame() +result = 0 +for i in [192,204,216,228,240]: + lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) + result = result + lag['mom%s' % i] +crsp_mom['r1620a'] = result/5",1,0,0,0,"Following Heston and Sadka (2008), at the beginning of each month t, we sort stocks into deciles +based on various measures of past performance, including returns in month t - 12 (Ra1), average +returns from month t - 11 to t - 1 (Rn1), average returns across months t - 24,t - 36,t - 48, and +t - 60 (R[2,5]), average returns from month t - 60 to t - 13 except for lags 24, 36, 48, and 60 (R[2,5]), an +average returns across months t - 72, t - 84, t - 96, t - 
108, and t - 120 (R[6,10]a), average returns +from month t - 120 to t - 61 except for lags 72, 84, 96, 108, and 120 (R[6,10]n), average returns across +months t - 132, t - 144, t - 156, t - 168, and t - 180 (R[11,15]a), average returns from month t - 180 +to t - 121 except for lags 132, 144, 156, 168, and 180 (R[11,15]n), average returns across months +t-192,t-204,t-216,t-228, and t-240 (R[16,20]a), average returns from month t-240 to t-181 +except for lags 192, 204, 216, 228, and 240 (R[16,20]n). Monthly decile returns are calculated for the +current month t, and the deciles are rebalanced at the beginning of month t + 1." +A.5.6,Rds q 6 and Rds q 12,Quarterly R&D Expense-to-sales,"Chan, Lakonishok, and Sougiannis",2001,Intangibles,data_rawq['rds'] = data_rawq['xrdq4']/data_rawq['saleq'],* xrdq4 from rdm,0,0,1,0,"At the end of June of each year t, we sort stocks into deciles based on R&D-to-sales, Rds, which is R&D expenses (Compustat annual item XRD) divided by sales (item SALE) for the fiscal year ending in calendar year t - 1. We keep only firms with positive R&D expenses. Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1. Because the accounting treatment of R&D expenses was standardized in 1975, the Rds portfolios start in July 1976." +A.5.8,Ol,Operating Leverage,Novy-Marx,2011,Intangibles,data_rawa['ol'] = (data_rawa['cogs'] + data_rawa['xsga'])/data_rawa['at'],,0,1,0,0,"Following Novy-Marx (2011), operating leverage, Ol, is operating costs scaled by total assets (Compustat annual item AT, the denominator is current, not lagged, total assets). Operating costs are cost of goods sold (item COGS) plus selling, general, and administrative expenses (item XSGA). At the end of June of year t, we sort stocks into deciles based on Ol for the fiscal year ending in calendar year t - 1. 
Monthly decile returns are calculated from July of year t to June of t + 1, and the deciles are rebalanced in June of t + 1." +A.5.9,"Ol q 1, Ol q 6, and Ol q 12",Quarterly Operating Leverage,Novy-Marx,2011,Intangibles,data_rawq['olq'] = (data_rawq['cogsq'] + data_rawq['xsgaq'])/data_rawq['atq'],,0,0,1,0,"At the beginning of each month t, we split stocks into deciles based on quarterly operating leverage, Olq, which is quarterly operating costs divided by assets (Compustat quarterly item ATQ) for the fiscal quarter ending at least four months ago. Operating costs are the cost of goods sold (item COGSQ) plus selling, general, and administrative expenses (item XSGAQ). We calculate decile returns for the current month t (Olq1), from month t to t + 5 (Olq6), and from month t to t + 11 (Olq12), and the deciles are rebalanced at the beginning of month t + 1. The holding period longer than one month as in, for instance, Olq6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Olq6 decile. For sufficient data coverage, the Olq portfolios start in January 1972." 
+A.6.1,Me,the market equity,Banz,1981,Frictions,"# rawq['me'] +crsp['me'] = crsp['prc'].abs() * crsp['shrout'] # calculate market equity +crsp['me'] = np.where(crsp['permno'] == crsp['permno'].shift(1), crsp['me'].fillna(method='ffill'), crsp['me']) +data_rawq['me'] = data_rawq['me']/1000 # CRSP ME +data_rawq['me'] = np.where(data_rawq['me'] == 0, np.nan, data_rawq['me']) +data_rawq = data_rawq.dropna(subset=['me'])","#rawa['me'] +crsp['me'] = crsp['prc'].abs() * crsp['shrout'] # calculate market equity +crsp['me'] = np.where(crsp['permno'] == crsp['permno'].shift(1), crsp['me'].fillna(method='ffill'), crsp['me']) +data_rawa['me'] = data_rawa['me']/1000 # CRSP ME +# there are some ME equal to zero since this company do not have price or shares data, we drop these observations +data_rawa['me'] = np.where(data_rawa['me'] == 0, np.nan, data_rawa['me']) +data_rawa = data_rawa.dropna(subset=['me']) +# rawq['me'] +crsp['me'] = crsp['prc'].abs() * crsp['shrout'] # calculate market equity +crsp['me'] = np.where(crsp['permno'] == crsp['permno'].shift(1), crsp['me'].fillna(method='ffill'), crsp['me']) +data_rawq['me'] = data_rawq['me']/1000 # CRSP ME +# there are some ME equal to zero since this company do not have price or shares data, we drop these observations +data_rawq['me'] = np.where(data_rawq['me'] == 0, np.nan, data_rawq['me']) +data_rawq = data_rawq.dropna(subset=['me'])",1,1,1,0,"Market equity, Me, is price times shares outstanding from CRSP. At the end of June of each year t, we sort stocks into deciles based on the June-end Me. Monthly decile returns are calculated from July of year t to June of t+1, and the deciles are rebalanced in June of t+1." +A.6.13,Dtv12,"dollar trading volume, 12-month holding period","Brennan, Chordia, and Subrahmanyam",1998,Frictions,,,,,,,"At the beginning of each month t, we sort stocks into deciles based on their average daily dollar trading volume, Dtv, over the prior six months from t-6 to t-1. 
We require a minimum of 50 daily observations. Dollar trading volume is share price times the number of shares traded. We adjust the trading volume of NASDAQ stocks per Gao and Ritter (2010) (see footnote 7). Monthly decile returns are calculated for the current month t (Dtv1), from month t to t+5 (Dtv6), and from month t to t + 11 (Dtv12), and the deciles are rebalanced at the beginning of month t + 1. The holding period longer than one month as in, for instance, Dtv6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Dtv6 decile." +A.6.21,Isff1,"idiosyncratic skewness estimated from the Fama-French 3-factor model, 1-month holding period",Harvey and Siddique,2000,Frictions,,,,,,,"At the beginning of each month t, we sort stocks into deciles based on idiosyncratic skewness, Isff, calculated as the skewness of the residuals from regressing a stock's excess return on the Fama- French three factors using daily observations from month t - 1. We require a minimum of 15 daily returns. Monthly decile returns are calculated for the current month t (Isff1), from month t to t + 5 (Isff6), and from month t to t + 11 (Isff12), and the deciles are rebalanced at the beginning of month t + 1. The holding period longer than one month as in Isff6 means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Isff6 decile." 
+A.6.22,Isq1,"idiosyncratic skewness estimated from the q-factor model, 1-month holding period",Harvey and Siddique,2000,Frictions,,,,,,,"At the beginning of each month t, we sort stocks into deciles based on idiosyncratic skewness, Isq, calculated as the skewness of the residuals from regressing a stock's excess return on the q-factors using daily observations from month t - 1. We require a minimum of 15 daily returns. Monthly decile returns are calculated for the current month t (Isq1), from month t to t + 5 (Isq6), and from month t to t + 11 (Isq12), and the deciles are rebalanced at the beginning of month t + 1. The holding period longer than one month as in Isq6 means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six months. We take the simple average of the subdecile returns as the monthly return of the Isq6 decile. Because the q-factors start in January 1967, the Isq portfolios start in February 1967." +A.6.24,Srev,short-term reversal,Jegadeesh,1990,Frictions,,,,,,,"At the beginning of each month t, we sort stocks into short-term reversal (Srev) deciles based on the return in month t - 1. To be included in a decile in month t, a stock must have a valid price at the end of month t - 2 and a valid return for month t - 1. Monthly decile returns are calculated for the current month t, and the deciles are rebalanced at the beginning of month t + 1." +A.6.3,Ivff1,"idiosyncratic volatility estimated from the Fama-French 3-factor model, 1-month holding period","Ang, Hodrick, Xing, and Zhang",2006,Frictions,,,,,,,"Following Ang, Hodrick, Xing, and Zhang (2006), we calculate idiosyncratic volatility relative to the Fama-French three-factor model, Ivff, as the residual volatility from regressing a stock's excess returns on the Fama-French three factors. At the beginning of each month t, we sort stocks into deciles based on the Ivff estimated with daily returns from month t - 1. 
We require a minimum of 15 daily returns. Monthly decile returns are calculated for the current month t (Ivff1), from month t to t+5 (Ivff6), and from month t to t+11 (Ivff12), and the deciles are rebalanced at the beginning of month t + 1. The holding period that is longer than one month as in, for instance, Ivff6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six-month period. We take the simple average of the subdecile returns as the monthly return of the Ivff6 decile." +A.6.5,Ivq1,"idiosyncratic volatility estimated from the q-factor model, 1-month holding period","Ang, Hodrick, Xing, and Zhang",2006,Frictions,,,,,,,"We calculate idiosyncratic volatility per the q-factor model, Ivq, as the residual volatility from regressing a stock's excess returns on the q-factors. At the beginning of each month t, we sort stocks into deciles based on the Ivq estimated with daily returns from month t - 1. We require a minimum of 15 daily returns. Monthly decile returns are calculated for the current month t (Ivq1), from month t to t + 5 (Ivq6), and from month t to t + 11 (Ivq12), and the deciles are rebalanced at the beginning of month t + 1. The holding period that is longer than one month as in, for instance, Ivq6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six-month period. We take the simple average of the subdecile returns as the monthly return of the Ivq6 decile. Because the q-factors start in January 1967, the Ivq portfolios start in February 1967." +A.6.6,Tv1,"total volatility, 1-month holding period","Ang, Hodrick, Xing, and Zhang",2006,Frictions,,,,,,,"Following Ang, Hodrick, Xing, and Zhang (2006), at the beginning of each month t, we sort stocks into deciles based on total volatility, Tv, estimated as the volatility of a stock's daily returns from month t - 1. 
We require a minimum of 15 daily returns. Monthly decile returns are calculated for the current month t, (Tv1), from month t to t + 5 (Tv6), and from month t to t + 11 (Tv12), and the deciles are rebalanced at the beginning of month t + 1. The holding period that is longer than one month as in, for instance, Tv6, means that for a given decile in each month there exist six subdeciles, each of which is initiated in a different month in the prior six-month period. We take the simple average of the subdeciles returns as the monthly return of the Tv6 decile." +A.6.7,Sv1,"systematic volatility, 1-month holding period","Ang, Hodrick, Xing, and Zhang",2006,Frictions,,,,,,,p119 +A.6.8,Beta1,Market Beta,Fama and MacBeth,1973,Frictions,,,,,,,p119 +,agr,Asset growth,"Cooper, Gulen & Schill",2008,,data_rawq['agr'] = (data_rawq['atq']-data_rawq['atq_l4'])/data_rawq['atq_l4'],data_rawq['atq_l4'] = data_rawq.groupby(['permno'])['atq'].shift(4),1,0,1,0,Annual percent change in total assets (at). +,baspread,Bid-ask spread rolling 3m,Amihud & Mendelson,1989,,,,,,,,Monthly average of daily bid-ask spread divided by average of daily spread. +,beta,Beta rolling 3m,Fama & MacBeth,1973,,,,,,,,Estimated market beta from weekly returns and equal weighted market returns for 3 years ending month t-1 with at least 52 weeks of returns. +,bm_ia,Industry-adjusted book to market,"Asness, Porter & Stevens",2000,,data_rawa['bm_ia'] = data_rawa['bm']/data_rawa['bm_ind'],"df_temp = data_rawa.groupby(['datadate', 'ffi49'], as_index=False)['bm'].mean() +df_temp = df_temp.rename(columns={'bm': 'bm_ind'}) +data_rawa = pd.merge(data_rawa, df_temp, how='left', on=['datadate', 'ffi49'])",0,1,0,0,Industry adjusted book-to-market ratio. +,cash,Cash holdings,Palazzo,2012,,data_rawq['cash'] = data_rawq['cheq']/data_rawq['atq'],,0,0,1,0,Cash and cash equivalents divided by average total assets. 
+,cashdebt,Cash flow to debt,Ou & Penman,1989,,"data_rawq['cashdebt'] = (ttm4('ibq', data_rawq) + ttm4('dpq', data_rawq))/((data_rawq['ltq']+data_rawq['ltq_l4'])/2)",data_rawq['ltq_l4'] = data_rawq.groupby(['permno'])['ltq'].shift(4),1,0,1,0,Earnings before depreciation and extraordinary items (ib+dp) divided by avg. total liabilities (lt). +,chcsho,Change in shares outstanding,Pontiff & Woodgate,2008,,data_rawq['chcsho'] = (data_rawq['cshoq']/data_rawq['cshoq_l4'])-1,data_rawq['cshoq_l4'] = data_rawq.groupby(['permno'])['cshoq'].shift(4),1,0,1,0,Annual percent change in shares outstanding (csho). +,chpm(chpmia),Industry-adjusted change in profit margin,Soliman,2008,,data_rawq['chpm'] = (data_rawq['ibq4']/data_rawq['saleq4'])-(data_rawq['ibq4_l1']/data_rawq['saleq4_l1']),"data_rawq['ibq4'] = ttm4('ibq', data_rawq) +data_rawq['saleq4'] = ttm4('saleq', data_rawq) +data_rawq['saleq4'] = np.where(data_rawq['saleq4'].isnull(), data_rawq['saley'], data_rawq['saleq4']) +data_rawq['ibq4_l1'] = data_rawq.groupby(['permno'])['ibq4'].shift(1) +data_rawq['saleq4_l1'] = data_rawq.groupby(['permno'])['saleq4'].shift(1)",1,0,1,0,2-digit SIC - fiscal-year mean adjusted change in income before extraordinary items (ib) divided by sales (sale). +,chtx,Change in tax expense,Thomas & Zhang,2011,,data_rawq['chtx'] = (data_rawq['txtq']-data_rawq['txtq_l4'])/data_rawq['atq_l4'],"data_rawq['txtq_l4'] = data_rawq.groupby(['permno'])['txtq'].shift(4) +data_rawq['atq_l4'] = data_rawq.groupby(['permno'])['atq'].shift(4)",1,0,1,0,Percent change in total taxes (txtq) from quarter t-4 to t. 
+,cinvest,Corporate investment,"Titman, Wei & Xie",2004,,"* data_rawq['cinvest'] = ((data_rawq['ppentq'] - data_rawq['ppentq_l1']) / data_rawq['saleq'])\ + -(data_rawq[['c_temp1', 'c_temp2', 'c_temp3']].mean(axis=1))","data_rawq['ppentq_l1'] = data_rawq.groupby(['permno'])['ppentq'].shift(1) +data_rawq['ppentq_l2'] = data_rawq.groupby(['permno'])['ppentq'].shift(2) +data_rawq['ppentq_l3'] = data_rawq.groupby(['permno'])['ppentq'].shift(3) +data_rawq['ppentq_l4'] = data_rawq.groupby(['permno'])['ppentq'].shift(4) +data_rawq['saleq_l1'] = data_rawq.groupby(['permno'])['saleq'].shift(1) +data_rawq['saleq_l2'] = data_rawq.groupby(['permno'])['saleq'].shift(2) +data_rawq['saleq_l3'] = data_rawq.groupby(['permno'])['saleq'].shift(3) + +data_rawq['c_temp1'] = (data_rawq['ppentq_l1'] - data_rawq['ppentq_l2']) / data_rawq['saleq_l1'] +data_rawq['c_temp2'] = (data_rawq['ppentq_l2'] - data_rawq['ppentq_l3']) / data_rawq['saleq_l2'] +data_rawq['c_temp3'] = (data_rawq['ppentq_l3'] - data_rawq['ppentq_l4']) / data_rawq['saleq_l3'] + +* main formula + +data_rawq['c_temp1'] = (data_rawq['ppentq_l1'] - data_rawq['ppentq_l2']) / 0.01 +data_rawq['c_temp2'] = (data_rawq['ppentq_l2'] - data_rawq['ppentq_l3']) / 0.01 +data_rawq['c_temp3'] = (data_rawq['ppentq_l3'] - data_rawq['ppentq_l4']) / 0.01 + +data_rawq['cinvest'] = np.where(data_rawq['saleq']<=0, ((data_rawq['ppentq'] - data_rawq['ppentq_l1']) / 0.01) + -(data_rawq[['c_temp1', 'c_temp2', 'c_temp3']].mean(axis=1)), data_rawq['cinvest']) + +data_rawq = data_rawq.drop(['c_temp1', 'c_temp2', 'c_temp3'], axis=1)",1,0,1,0,"Change over one quarter in net PP&E (ppentq) divided by sales (saleq) - average of this variable for prior 3 quarters; if saleq = 0, then scale by 0.01." +,depr,Depreciation / PP&E,Holthausen & Larcker,1992,,"data_rawq['depr'] = ttm4('dpq', data_rawq)/data_rawq['ppentq']",,0,0,1,0,Depreciation divided by PP&E. 
+,dolvol,Dollar trading volume,"Chordia, Subrahmanyam & Anshuman",2001,,"crsp_mom['dolvol'] = np.log(crsp_mom['vol_l2']*crsp_mom['prc_l2']).replace([np.inf, -np.inf], np.nan)",,1,0,0,0,Natural log of trading volume times price per share from month t-2. +,gma,Gross profitability,Novy-Marx,2013,,data_rawq['gma'] = (data_rawq['revtq4']-data_rawq['cogsq4'])/data_rawq['atq_l4'],"data_rawq['revtq4'] = ttm4('revtq', data_rawq) +data_rawq['cogsq4'] = ttm4('cogsq', data_rawq) +data_rawq['atq_l4'] = data_rawq.groupby(['permno'])['atq'].shift(4)",1,0,1,0,Revenues (revt) minus cost of goods sold (cogs) divided by lagged total assets (at). +,grltnoa,Growth in long-term net operating assets,"Fairfield, Whisenant & Yohn",2003,,"data_rawq['grltnoa'] = ((data_rawq['rectq']+data_rawq['invtq']+data_rawq['ppentq']+data_rawq['acoq']+data_rawq['intanq']+ + data_rawq['aoq']-data_rawq['apq']-data_rawq['lcoq']-data_rawq['loq'])- + (data_rawq['rectq_l4']+data_rawq['invtq_l4']+data_rawq['ppentq_l4']+data_rawq['acoq_l4']-data_rawq['apq_l4']-data_rawq['lcoq_l4']-data_rawq['loq_l4'])-\ + (data_rawq['rectq']-data_rawq['rectq_l4']+data_rawq['invtq']-data_rawq['invtq_l4']+data_rawq['acoq']- + (data_rawq['apq']-data_rawq['apq_l4']+data_rawq['lcoq']-data_rawq['lcoq_l4'])- + ttm4('dpq', data_rawq)))/((data_rawq['atq']+data_rawq['atq_l4'])/2)","data_rawq['rectq_l4'] = data_rawq.groupby(['permno'])['rectq'].shift(4) +data_rawq['acoq_l4'] = data_rawq.groupby(['permno'])['acoq'].shift(4) +data_rawq['apq_l4'] = data_rawq.groupby(['permno'])['apq'].shift(4) +data_rawq['lcoq_l4'] = data_rawq.groupby(['permno'])['lcoq'].shift(4) +data_rawq['loq_l4'] = data_rawq.groupby(['permno'])['loq'].shift(4) +data_rawq['invtq_l4'] = data_rawq.groupby(['permno'])['invtq'].shift(4) +data_rawq['ppentq_l4'] = data_rawq.groupby(['permno'])['ppentq'].shift(4) +data_rawq['atq_l4'] = data_rawq.groupby(['permno'])['atq'].shift(4)",1,0,1,0,Growth in long term net operating assets. 
+,herf,Industry sales concentration,Hou & Robinson,2006,,data_rawa['herf'] = (data_rawa['sale']/data_rawa['indsale'])*(data_rawa['sale']/data_rawa['indsale']),"data_rawa['sic'] = data_rawa['sic'].astype(int) +data_rawa['ffi49'] = ffi49(data_rawa) +data_rawa['ffi49'] = data_rawa['ffi49'].fillna(49) +data_rawa['ffi49'] = data_rawa['ffi49'].astype(int) +df_temp = data_rawa.groupby(['datadate', 'ffi49'], as_index=False)['sale'].sum() +df_temp = df_temp.rename(columns={'sale': 'indsale'}) +data_rawa = pd.merge(data_rawa, df_temp, how='left', on=['datadate', 'ffi49']) +* main formula +df_temp = data_rawa.groupby(['datadate', 'ffi49'], as_index=False)['herf'].sum() +data_rawa = data_rawa.drop(['herf'], axis=1) +data_rawa = pd.merge(data_rawa, df_temp, how='left', on=['datadate', 'ffi49'])",0,1,0,0,2-digit SIC - fiscal-year sales concentration (sum of squared percent of sales in industry for each company). +,hire,Employee growth rate,"Bazdresch, Belo & Lin",2014,,"data_rawa['hire'] = (data_rawa['emp'] - data_rawa['emp_l1'])/data_rawa['emp_l1'] +data_rawa['hire'] = np.where((data_rawa['emp'].isnull()) | (data_rawa['emp_l1'].isnull()), 0, data_rawa['hire'])",data_rawa['emp_l1'] = data_rawa.groupby(['permno'])['emp'].shift(1),1,1,0,0,Percent change in number of employees (emp). +,ill,Illiquidity rolling 3m,Amihud,2002,,,,,,,,Average of daily (absolute return / dollar volume). +,lev,Leverage,Bhandari,1988,,data_rawq['lev'] = data_rawq['ltq']/data_rawq['me'],* me from rawq,0,0,1,0,Total liabilities (lt) divided by fiscal year end market capitalization. +,lgr,Growth in long-term debt,"Richardson, Sloan, Soliman & Tuna",2005,,data_rawq['lgr'] = (data_rawq['ltq']/data_rawq['ltq_l4'])-1,data_rawq['ltq_l4'] = data_rawq.groupby(['permno'])['ltq'].shift(4),1,0,1,0,Annual percent change in total liabilities (lt). +,maxret,Maximum daily returns rolling 3m,"Bali, Cakici & Whitelaw",2011,,,,,,,,Maximum daily return from returns during calendar month t-1. 
+,me_ia(mve_ia),Industry-adjusted size,"Asness, Porter & Stevens",2000,,data_rawa['me_ia'] = data_rawa['me']/data_rawa['me_ind'],"* me from rawa +df_temp = data_rawa.groupby(['datadate', 'ffi49'], as_index=False)['me'].mean() +df_temp = df_temp.rename(columns={'me': 'me_ind'}) +data_rawa = pd.merge(data_rawa, df_temp, how='left', on=['datadate', 'ffi49'])",1,1,0,0,2-digit SIC industry-adjusted fiscal year-end market capitalization. +,mom12m,Momentum rolling 12m,Jegadeesh,1990,,"crsp_mom['mom12m'] = mom(1, 12, crsp_mom)",* crsp_mom,1,0,0,0,11-month cumulative returns ending one month before month end. +,mom1m,Momentum ,Jegadeesh & Titman,1993,,crsp_mom['mom1m'] = crsp_mom['ret'],* crsp_mom,1,0,0,0,1-month cumulative return. +,mom36m,Momentum rolling 36m,Jegadeesh & Titman,1993,,"crsp_mom['mom36m'] = mom(1, 36, crsp_mom)",* crsp_mom,1,0,0,0,Cumulative returns from months t-36 to t-13. +,mom60m,Momentum rolling 60m,Jegadeesh & Titman,1993,,"crsp_mom['mom60m'] = mom(12, 60, crsp_mom)",* crsp_mom,1,0,0,0, +,mom6m,Momentum rolling 6m,Jegadeesh & Titman,1993,,"crsp_mom['mom6m'] = mom(1, 6, crsp_mom)",* crsp_mom,1,0,0,0,5-month cumulative returns ending one month before month end. 
+,nincr,Number of earnings increases,"Barth, Elliott & Finn",1999,,"data_rawq['nincr'] = (data_rawq['nincr_temp1'] + + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']) + + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']) + + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']) + + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']*data_rawq['nincr_temp5']) + + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']*data_rawq['nincr_temp5']*data_rawq['nincr_temp6']) + + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']*data_rawq['nincr_temp5']*data_rawq['nincr_temp6']*data_rawq['nincr_temp7']) + + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']*data_rawq['nincr_temp5']*data_rawq['nincr_temp6']*data_rawq['nincr_temp7']*data_rawq['nincr_temp8']))","data_rawq['ibq_l1'] = data_rawq.groupby(['permno'])['ibq'].shift(1) +data_rawq['ibq_l2'] = data_rawq.groupby(['permno'])['ibq'].shift(2) +data_rawq['ibq_l3'] = data_rawq.groupby(['permno'])['ibq'].shift(3) +data_rawq['ibq_l4'] = data_rawq.groupby(['permno'])['ibq'].shift(4) +data_rawq['ibq_l5'] = data_rawq.groupby(['permno'])['ibq'].shift(5) +data_rawq['ibq_l6'] = data_rawq.groupby(['permno'])['ibq'].shift(6) +data_rawq['ibq_l7'] = data_rawq.groupby(['permno'])['ibq'].shift(7) +data_rawq['ibq_l8'] = data_rawq.groupby(['permno'])['ibq'].shift(8) + +data_rawq['nincr_temp1'] = np.where(data_rawq['ibq'] > data_rawq['ibq_l1'], 1, 0) +data_rawq['nincr_temp2'] = np.where(data_rawq['ibq_l1'] > data_rawq['ibq_l2'], 1, 0) +data_rawq['nincr_temp3'] = np.where(data_rawq['ibq_l2'] > data_rawq['ibq_l3'], 1, 0) +data_rawq['nincr_temp4'] = np.where(data_rawq['ibq_l3'] > data_rawq['ibq_l4'], 1, 0) +data_rawq['nincr_temp5'] = np.where(data_rawq['ibq_l4'] > 
data_rawq['ibq_l5'], 1, 0) +data_rawq['nincr_temp6'] = np.where(data_rawq['ibq_l5'] > data_rawq['ibq_l6'], 1, 0) +data_rawq['nincr_temp7'] = np.where(data_rawq['ibq_l6'] > data_rawq['ibq_l7'], 1, 0) +data_rawq['nincr_temp8'] = np.where(data_rawq['ibq_l7'] > data_rawq['ibq_l8'], 1, 0) + +*main formula + +data_rawq = data_rawq.drop(['ibq_l1', 'ibq_l2', 'ibq_l3', 'ibq_l4', 'ibq_l5', 'ibq_l6', 'ibq_l7', 'ibq_l8', 'nincr_temp1', + 'nincr_temp2', 'nincr_temp3', 'nincr_temp4', 'nincr_temp5', 'nincr_temp6', 'nincr_temp7', + 'nincr_temp8'], axis=1)",1,0,1,0,Number of consecutive quarters (up to eight quarters) with an increase in earnings (ibq) over same quarter in the prior year. +,op(operprof),Operating profitability,Fama and French,2015,,,,,,,, +,pscore(ps),Performance Score,Piotroski,2000,,"data_rawa['ps'] = np.where(data_rawa['pstkrv'].isnull(), data_rawa['pstkl'], data_rawa['pstkrv']) +data_rawa['ps'] = np.where(data_rawa['ps'].isnull(), data_rawa['pstk'], data_rawa['ps']) +data_rawa['ps'] = np.where(data_rawa['ps'].isnull(), 0, data_rawa['ps'])",#(pstkrv prior to pstkl prior to pstk),0,1,0,0,Sum of 9 indicator variables to form fundamental health score. +,rd_sale,R&D to sales,"Guo, Lev & Shi",2006,,data_rawq['rd_sale'] = data_rawq['xrdq4']/data_rawq['saleq4'],"data_rawq['xrdq4'] = ttm4('xrdq', data_rawq) +data_rawq['xrdq4'] = np.where(data_rawq['xrdq4'].isnull(), data_rawq['xrdy'], data_rawq['xrdq4']) +data_rawq['saleq4'] = ttm4('saleq', data_rawq) +data_rawq['saleq4'] = np.where(data_rawq['saleq4'].isnull(), data_rawq['saley'], data_rawq['saleq4'])",0,0,1,0,R&D expense divided by sales (xrd/sale). 
+,re,Revisions in analysts’ earnings forecasts,"Chan, Jegadeesh, and Lakonishok",1996,,,,,,,, +,rsup,Revenue surprise,Kama,2009,,data_rawq['rsup'] = (data_rawq['saleq'] - data_rawq['saleq_l4'])/data_rawq['me'],data_rawq['saleq_l4'] = data_rawq.groupby(['permno'])['saleq'].shift(4),1,0,1,0,Sales from quarter t minus sales from quarter t-4 (saleq) divided by fiscal-quarter- end market capitalization (cshoq * prccq). +,rvar_capm,Residual variance - CAPM rolling 3m,Daily Stock residual variance of CAPM,,,,,,,,, +,rvar_ff3,Residual variance - ff3 rolling 3m,Daily Stock residual variance of Fama French 3 factors,,,,,,,,, +,rvar_mean,return variance rolling 3m,Daily Stock return variance,,,,,,,,, +,sgr,Sales growth,"Lakonishok, Shleifer & Vishny",1994,,data_rawq['sgr'] = (data_rawq['saleq4']/data_rawq['saleq4_l4'])-1,"data_rawq['saleq4'] = ttm4('saleq', data_rawq) +data_rawq['saleq4'] = np.where(data_rawq['saleq4'].isnull(), data_rawq['saley'], data_rawq['saleq4']) + +data_rawq['saleq4_l4'] = data_rawq.groupby(['permno'])['saleq4'].shift(4)",1,0,1,0,Annual percent change in sales (sale). +,std_dolvol,Std of dollar trading volume rolling 3m,"Chordia, Subrahmanyam & Anshuman",2001,,,,,,,,Monthly standard deviation of daily dollar trading volume. +,std_turn,Std. of Share turnover rolling 3m,"Chordia, Subrahmanyam, &Anshuman",2001,,,,,,,,Monthly standard deviation of daily share turnover. +,sue,Unexpected quarterly earnings,"Rendelman, Jones & Latane",1982,,,,,,,,"Unexpected quarterly earnings divided by fiscal-quarter-end market cap. Unexpected earnings is I/B/E/S actual earnings minus median forecasted earnings if available, else it is the seasonally differenced quarterly earnings before extraordinary items from Compustat quarterly file." +,turn,Shares turnover,"Datar, Naik & Radcliffe",1998,,,,,,,,Average monthly trading volume for most recent 3 months scaled by number of shares outstanding in current month. 
+- pkl_to_csv.py -- convert the pickle file to csv
+4. run impute_rank_output_bchmk.py (you may want to comment out the sp1500 part in this file if you just need the all-stocks version)
+- **Replicating Anomalies** by [Hou Xue Zhang 2018 RFS](https://doi.org/10.1093/rfs/hhy131)
+ - [Anomaly Portfolios on Zhang's website](http://global-q.org/index.html)
+- Portfolio characteristics, mainly referring to [WRDS Financial Ratios Suite](https://wrds-www.wharton.upenn.edu/pages/support/research-wrds/sample-programs/wrds-sample-programs/wrds-financial-ratios-suite/) and [Variable Definition](https://wrds-www.wharton.upenn.edu/documents/793/WRDS_Industry_Financial_Ratio_Manual.pdf)
pd.to_datetime(ccm['linkenddt']) + +# if linkenddt is missing then set to today date +ccm['linkenddt'] = ccm['linkenddt'].fillna(pd.to_datetime('today')) + +ccm1 = pd.merge(comp, ccm, how='left', on=['gvkey']) +# extract month and year of rdq +ccm1['rdq'] = pd.to_datetime(ccm1['rdq']) + +# set link date bounds +ccm2 = ccm1[(ccm1['datadate']>=ccm1['linkdt']) & (ccm1['datadate']<=ccm1['linkenddt'])] +ccm2 = ccm2[['gvkey', 'datadate', 'rdq', 'fyearq', 'fqtr', 'permno']] + +################### +# CRSP Block # +################### + +# Report Date of Quarterly Earnings (rdq) may not be trading day, we need to get the first trading day on or after rdq +crsp_dsi = conn.raw_sql(""" + select distinct date + from crsp.dsi + where date >= '01/01/1959' + """) + +crsp_dsi['date'] = pd.to_datetime(crsp_dsi['date']) + +for i in range(6): # we only consider the condition that the day after rdq is not a trading day, which is up to 5 days + ccm2['trad_%s' % i] = ccm2['rdq'] + pd.DateOffset(days=i) # set rdq + i days to match trading day + crsp_dsi['trad_%s' % i] = crsp_dsi['date'] # set the merging key + crsp_dsi = crsp_dsi[['date', 'trad_%s' % i]] # reset trading day columns to avoid repeat merge + comp_temp = pd.merge(ccm2, crsp_dsi, how='left', on='trad_%s' % i) + comp_temp['trad_%s' % i] = comp_temp['date'] # reset rdq + i days to matched trading day + +# fill NA from rdq + 5 days to rdq + 0 days, then get trading day version of rdq +for i in range(5, 0, -1): + count = i-1 + comp_temp['trad_%s' % count] = np.where(comp_temp['trad_%s' % count].isnull(), + comp_temp['trad_%s' % i], comp_temp['trad_%s' % count]) + comp_temp['rdq_trad'] = comp_temp['trad_%s' % count] + +comp_temp = comp_temp[['gvkey', 'permno', 'datadate', 'fyearq', 'fqtr', 'rdq', 'rdq_trad']] + +print('='*10, 'crsp block is ready', '='*10) +############################# +# CRSP abnormal return # +############################# +crsp_d = conn.raw_sql(""" + select a.prc, a.ret, a.shrout, a.vol, a.cfacpr, a.cfacshr, 
a.permno, a.permco, a.date, + b.siccd, b.ncusip, b.shrcd, b.exchcd + from crsp.dsf as a + left join crsp.dsenames as b + on a.permno=b.permno + and b.namedt<=a.date + and a.date<=b.nameendt + where a.date >= '01/01/1959' + and b.exchcd between 1 and 3 + and b.shrcd in (10,11) + """) + +# change variable format to int +crsp_d[['permco', 'permno', 'shrcd', 'exchcd']] = crsp_d[['permco', 'permno', 'shrcd', 'exchcd']].astype(int) + +print('='*10, 'crsp abnormal return is ready', '='*10) + +# convert the date format +crsp_d['date'] = pd.to_datetime(crsp_d['date']) + +# add delisting return +dlret = conn.raw_sql(""" + select permno, dlret, dlstdt + from crsp.dsedelist + where dlstdt >= '01/01/1959' + """) + +dlret.permno = dlret.permno.astype(int) +dlret['dlstdt'] = pd.to_datetime(dlret['dlstdt']) + +crsp_d = pd.merge(crsp_d, dlret, how='left', left_on=['permno', 'date'], right_on=['permno', 'dlstdt']) +# return adjusted for delisting +crsp_d['retadj'] = np.where(crsp_d['dlret'].notna(), (crsp_d['ret'] + 1)*(crsp_d['dlret'] + 1) - 1, crsp_d['ret']) +crsp_d['meq'] = crsp_d['prc'].abs()*crsp_d['shrout'] # market value of equity +crsp_d = crsp_d.sort_values(by=['date', 'permno', 'meq']) + +# sprtrn +crspsp500d = conn.raw_sql(""" + select date, sprtrn + from crsp.dsi + where date >= '01/01/1959' + """) + +crspsp500d['date'] = pd.to_datetime(crspsp500d['date']) + +# abnormal return +crsp_d = pd.merge(crsp_d, crspsp500d, how='left', on='date') +crsp_d['abrd'] = crsp_d['retadj'] - crsp_d['sprtrn'] +crsp_d = crsp_d[['date', 'permno', 'ret', 'retadj', 'sprtrn', 'abrd']] + +# date count regarding to rdq +comp_temp['minus10d'] = comp_temp['rdq_trad'] - pd.Timedelta(days=10) +comp_temp['plus5d'] = comp_temp['rdq_trad'] + pd.Timedelta(days=5) + +# df = sqldf("""select a.*, b.date, b.abrd +# from comp_temp a left join crsp_d b +# on a.permno=b.permno +# and a.minus10d<=b.date +# and b.date<=a.plus5d +# order by a.permno, a.rdq_trad, b.date;""", globals()) + +sql = 
sqlite3.connect(':memory:') +comp_temp.to_sql('comp_temp', sql, index=False) +crsp_d.to_sql('crsp_d', sql, index=False) + +qry = """select a.*, b.date, b.abrd + from comp_temp a left join crsp_d b + on a.permno=b.permno + and a.minus10d<=b.date + and b.date<=a.plus5d + order by a.permno, a.rdq_trad, b.date;""" +df = pd.read_sql_query(qry, sql) +df.drop(['plus5d', 'minus10d'], axis=1, inplace=True) + +# delete missing return +df = df[df['abrd'].notna()] + +# count +df.sort_values(by=['permno', 'rdq_trad', 'date'], inplace=True) +condlist = [df['date']==df['rdq_trad'], + df['date']>df['rdq_trad'], + df['date']=0] +df_after['count'] = df_after.groupby(['permno', 'rdq_trad'])['date'].cumcount() + +df = pd.concat([df_before, df_after]) + +# calculate abr as the group sum +df = df[(df['count']>=-2) & (df['count']<=1)] + +df_temp = df.groupby(['permno', 'rdq_trad'])['abrd'].sum() +df_temp = pd.DataFrame(df_temp) +df_temp.reset_index(inplace=True) +df_temp.rename(columns={'abrd': 'abr'}, inplace=True) +df = pd.merge(df, df_temp, how='left', on=['permno', 'rdq_trad'], copy=False) # add abr back to df +df = df[df['count']==1] +df.rename(columns={'date': 'rdq_plus_1d'}, inplace=True) +df = df[['gvkey', 'permno', 'datadate', 'rdq', 'rdq_plus_1d', 'abr']] + +print('='*10, 'start populate', '='*10) + +# populate the quarterly abr to monthly +crsp_msf = conn.raw_sql(""" + select distinct date + from crsp.msf + where date >= '01/01/1959' + """) + +df['datadate'] = pd.to_datetime(df['datadate']) +df['plus12m'] = df['datadate'] + np.timedelta64(12, 'M') +df['plus12m'] = df['plus12m'] + MonthEnd(0) + +# df = sqldf("""select a.*, b.date +# from df a left join crsp_msf b +# on a.rdq_plus_1d < b.date +# and a.plus12m >= b.date +# order by a.permno, b.date, a.datadate desc;""", globals()) + +df.to_sql('df', sql, index=False) +crsp_msf.to_sql('crsp_msf', sql, index=False) + +qry = """select a.*, b.date + from df a left join crsp_msf b + on a.rdq_plus_1d < b.date + and a.plus12m >= b.date 
+ order by a.permno, b.date, a.datadate desc;""" + +df = pd.read_sql_query(qry, sql) + +df = df.drop_duplicates(['permno', 'date']) +df['datadate'] = pd.to_datetime(df['datadate']) +df['rdq'] = pd.to_datetime(df['rdq']) +df['rdq_plus_1d'] = pd.to_datetime(df['rdq_plus_1d']) +df = df[['gvkey', 'permno', 'datadate', 'rdq', 'rdq_plus_1d', 'abr', 'date']] + +with open('abr.pkl', 'wb') as f: + pkl.dump(df, f) \ No newline at end of file diff --git a/char60/accounting_100.py b/char60/accounting_100.py new file mode 100644 index 0000000..cf88aa6 --- /dev/null +++ b/char60/accounting_100.py @@ -0,0 +1,1643 @@ +import pandas as pd +import numpy as np +import datetime as dt +import wrds +from dateutil.relativedelta import * +from pandas.tseries.offsets import * +import pickle as pkl +from functions import * + +################### +# Connect to WRDS # +################### +conn = wrds.Connection() + +####################################################################################################################### +# TTM functions # +####################################################################################################################### + + +def ttm4(series, df): + """ + + :param series: variables' name + :param df: dataframe + :return: ttm4 + """ + lag = pd.DataFrame() + for i in range(1, 4): + lag['%(series)s%(lag)s' % {'series': series, 'lag': i}] = df.groupby('gvkey')['%s' % series].shift(i) + result = df['%s' % series] + lag['%s1' % series] + lag['%s2' % series] + lag['%s3' % series] + return result + + +def ttm12(series, df): + """ + + :param series: variables' name + :param df: dataframe + :return: ttm12 + """ + lag = pd.DataFrame() + for i in range(1, 12): + lag['%(series)s%(lag)s' % {'series': series, 'lag': i}] = df.groupby('permno')['%s' % series].shift(i) + result = df['%s' % series] + lag['%s1' % series] + lag['%s2' % series] + lag['%s3' % series] +\ + lag['%s4' % series] + lag['%s5' % series] + lag['%s6' % series] + lag['%s7' % series] +\ + 
lag['%s8' % series] + lag['%s9' % series] + lag['%s10' % series] + lag['%s11' % series] + return result + +print('TTM') +####################################################################################################################### +# Compustat Block # +####################################################################################################################### +comp = conn.raw_sql(""" + /*header info*/ + select c.gvkey, f.cusip, f.datadate, f.fyear, c.cik, substr(c.sic,1,2) as sic2, c.sic, c.naics, + + /*firm variables*/ + /*income statement*/ + f.sale, f.revt, f.cogs, f.xsga, f.dp, f.xrd, f.xad, f.ib, f.ebitda, + f.ebit, f.nopi, f.spi, f.pi, f.txp, f.ni, f.txfed, f.txfo, f.txt, f.xint, f.xpp, f.xacc, + + /*CF statement and others*/ + f.capx, f.oancf, f.dvt, f.ob, f.gdwlia, f.gdwlip, f.gwo, f.mib, f.oiadp, f.ivao, f.ivst, + + /*assets*/ + f.rect, f.act, f.che, f.ppegt, f.invt, f.at, f.aco, f.intan, f.ao, f.ppent, f.gdwl, f.fatb, f.fatl, + + /*liabilities*/ + f.lct, f.dlc, f.dltt, f.lt, f.dm, f.dcvt, f.cshrc, + f.dcpstk, f.pstk, f.ap, f.lco, f.lo, f.drc, f.drlt, f.txdi, f.dltis, f.dltr, f.dlcch, + + /*equity and other*/ + f.ceq, f.scstkc, f.emp, f.csho, f.seq, f.txditc, f.pstkrv, f.pstkl, f.np, f.txdc, + f.dpc, f.ajex, f.tstkp, f.oibdp, f.capxv, f.dvpa, f.epspx, + + /*market*/ + abs(f.prcc_f) as prcc_f, abs(f.prcc_c) as prcc_c, f.dvc, f.prstkc, f.sstk, f.fopt, f.wcap + + from comp.funda as f + left join comp.company as c + on f.gvkey = c.gvkey + + /*get consolidated, standardized, industrial format statements*/ + where f.indfmt = 'INDL' + and f.datafmt = 'STD' + and f.popsrc = 'D' + and f.consol = 'C' + and f.datadate >= '01/01/1959' + """) + +# convert datadate to date fmt +comp['datadate'] = pd.to_datetime(comp['datadate']) + +# sort and clean up +comp = comp.sort_values(by=['gvkey', 'datadate']).drop_duplicates() + +# clean up csho +comp['csho'] = np.where(comp['csho'] == 0, np.nan, comp['csho']) + +# calculate Compustat market equity 
+comp['mve_f'] = comp['csho'] * comp['prcc_f'] + +# do some clean up. several variables have lots of missing values +condlist = [comp['drc'].notna() & comp['drlt'].notna(), + comp['drc'].notna() & comp['drlt'].isnull(), + comp['drlt'].notna() & comp['drc'].isnull()] +choicelist = [comp['drc']+comp['drlt'], + comp['drc'], + comp['drlt']] +comp['dr'] = np.select(condlist, choicelist, default=np.nan) + +condlist = [comp['dcvt'].isnull() & comp['dcpstk'].notna() & comp['pstk'].notna() & comp['dcpstk'] > comp['pstk'], + comp['dcvt'].isnull() & comp['dcpstk'].notna() & comp['pstk'].isnull()] +choicelist = [comp['dcpstk']-comp['pstk'], + comp['dcpstk']] +comp['dc'] = np.select(condlist, choicelist, default=np.nan) +comp['dc'] = np.where(comp['dc'].isnull(), comp['dcvt'], comp['dc']) + +comp['xint0'] = np.where(comp['xint'].isnull(), 0, comp['xint']) +comp['xsga0'] = np.where(comp['xsga'].isnull, 0, 0) + +comp['ceq'] = np.where(comp['ceq'] == 0, np.nan, comp['ceq']) +comp['at'] = np.where(comp['at'] == 0, np.nan, comp['at']) +comp = comp.dropna(subset=['at']) +print('compustat') +####################################################################################################################### +# CRSP Block # +####################################################################################################################### +# Create a CRSP Subsample with Monthly Stock and Event Variables +# Restrictions will be applied later +# Select variables from the CRSP monthly stock and event datasets +crsp = conn.raw_sql(""" + select a.prc, a.ret, a.retx, a.shrout, a.vol, a.cfacpr, a.cfacshr, a.date, a.permno, a.permco, + b.ticker, b.ncusip, b.shrcd, b.exchcd + from crsp.msf as a + left join crsp.msenames as b + on a.permno=b.permno + and b.namedt<=a.date + and a.date<=b.nameendt + where a.date >= '01/01/1959' + and b.exchcd between 1 and 3 + """) + +# change variable format to int +crsp[['permco', 'permno', 'shrcd', 'exchcd']] = crsp[['permco', 'permno', 'shrcd', 
########################################################################################
# NOTE(review): this chunk was recovered from a mangled git patch ('+' diff prefixes
# and soft line-wraps removed). Formatting is normalized; targeted bug fixes are
# marked "FIX:", and unresolved review questions are marked "NOTE(review):".
########################################################################################

# change variable format to int
# (statement truncated at the top of this chunk; reconstructed from its visible tail)
crsp[['permco', 'permno', 'shrcd', 'exchcd']] = crsp[['permco', 'permno', 'shrcd', 'exchcd']].astype(int)

# Line up date to be end of month
crsp['date'] = pd.to_datetime(crsp['date'])
crsp['monthend'] = crsp['date'] + MonthEnd(0)  # set all the date to the standard end date of month

crsp = crsp.dropna(subset=['prc'])
crsp['me'] = crsp['prc'].abs() * crsp['shrout']  # calculate market equity

# if Market Equity is Nan then let return equals to 0
crsp['ret'] = np.where(crsp['me'].isnull(), 0, crsp['ret'])
crsp['retx'] = np.where(crsp['me'].isnull(), 0, crsp['retx'])

# impute me: forward-fill a missing me only when the previous row is the same permno
crsp = crsp.sort_values(by=['permno', 'date']).drop_duplicates()
crsp['me'] = np.where(crsp['permno'] == crsp['permno'].shift(1), crsp['me'].fillna(method='ffill'), crsp['me'])

# Aggregate Market Cap
'''
There are cases when the same firm (permco) has two or more securities (permno) at same date.
For the purpose of ME for the firm, we aggregated all ME for a given permco, date.
This aggregated ME will be assigned to the permno with the largest ME.
'''
# sum of me across different permno belonging to same permco a given date
crsp_summe = crsp.groupby(['monthend', 'permco'])['me'].sum().reset_index()
# largest mktcap within a permco/date
crsp_maxme = crsp.groupby(['monthend', 'permco'])['me'].max().reset_index()
# join by monthend/maxme to find the permno
crsp1 = pd.merge(crsp, crsp_maxme, how='inner', on=['monthend', 'permco', 'me'])
# drop me column and replace with the sum me
crsp1 = crsp1.drop(['me'], axis=1)
# join with sum of me to get the correct market cap info
crsp2 = pd.merge(crsp1, crsp_summe, how='inner', on=['monthend', 'permco'])
# sort by permno and date and also drop duplicates
crsp2 = crsp2.sort_values(by=['permno', 'monthend']).drop_duplicates()
print('crsp')
#######################################################################################################################
#                                                    CCM Block                                                        #
#######################################################################################################################
# merge CRSP and Compustat
# reference: https://wrds-www.wharton.upenn.edu/pages/support/applications/linking-databases/linking-crsp-and-compustat/
ccm = conn.raw_sql("""
                   select gvkey, lpermno as permno, linktype, linkprim,
                   linkdt, linkenddt
                   from crsp.ccmxpf_linktable
                   where substr(linktype,1,1)='L'
                   and (linkprim ='C' or linkprim='P')
                   """)

ccm['linkdt'] = pd.to_datetime(ccm['linkdt'])
ccm['linkenddt'] = pd.to_datetime(ccm['linkenddt'])

# if linkenddt is missing then set to today date
ccm['linkenddt'] = ccm['linkenddt'].fillna(pd.to_datetime('today'))

# merge ccm and comp
ccm1 = pd.merge(comp, ccm, how='left', on=['gvkey'])

# we can only get the accounting data after the firm public their report
# for annual data, we use 5 or 6 months lagged data
ccm1['yearend'] = ccm1['datadate'] + YearEnd(0)
ccm1['jdate'] = ccm1['datadate'] + MonthEnd(4)

# set link date bounds
ccm2 = ccm1[(ccm1['jdate'] >= ccm1['linkdt']) & (ccm1['jdate'] <= ccm1['linkenddt'])]

# link comp and crsp
crsp2 = crsp2.rename(columns={'monthend': 'jdate'})
data_rawa = pd.merge(crsp2, ccm2, how='inner', on=['permno', 'jdate'])

# filter exchcd & shrcd: NYSE / AMEX / NASDAQ ordinary common shares only
data_rawa = data_rawa[((data_rawa['exchcd'] == 1) | (data_rawa['exchcd'] == 2) | (data_rawa['exchcd'] == 3)) &
                      ((data_rawa['shrcd'] == 10) | (data_rawa['shrcd'] == 11))]

# process Market Equity
'''
Note: me is CRSP market equity, mve_f is Compustat market equity. Please choose the me below.
'''
data_rawa['me'] = data_rawa['me']/1000  # CRSP ME
# data_rawa['me'] = data_rawa['mve_f']  # Compustat ME

# there are some ME equal to zero since this company do not have price or shares data, we drop these observations
data_rawa['me'] = np.where(data_rawa['me'] == 0, np.nan, data_rawa['me'])
data_rawa = data_rawa.dropna(subset=['me'])

# count single stock years
# data_rawa['count'] = data_rawa.groupby(['gvkey']).cumcount()

# deal with the duplicates: keep the first row per (datadate, permno, linkprim),
# then the last row per (permno, yearend, datadate)
data_rawa.loc[data_rawa.groupby(['datadate', 'permno', 'linkprim'], as_index=False).nth([0]).index, 'temp'] = 1
data_rawa = data_rawa[data_rawa['temp'].notna()]
data_rawa.loc[data_rawa.groupby(['permno', 'yearend', 'datadate'], as_index=False).nth([-1]).index, 'temp'] = 1
data_rawa = data_rawa[data_rawa['temp'].notna()]

data_rawa = data_rawa.sort_values(by=['permno', 'jdate'])
print('ccm')
#######################################################################################################################
#                                                 Annual Variables                                                    #
#######################################################################################################################
# stockholders' equity
data_rawa['se'] = np.where(data_rawa['seq'].isnull(), data_rawa['ceq']+data_rawa['pstk'], data_rawa['seq'])
data_rawa['se'] = np.where(data_rawa['se'].isnull(), data_rawa['at']-data_rawa['lt'], data_rawa['se'])

data_rawa['txditc'] = data_rawa['txditc'].fillna(0)

# preferrerd stock
data_rawa['ps'] = np.where(data_rawa['pstkrv'].isnull(), data_rawa['pstkl'], data_rawa['pstkrv'])
data_rawa['ps'] = np.where(data_rawa['ps'].isnull(), data_rawa['pstk'], data_rawa['ps'])
data_rawa['ps'] = np.where(data_rawa['ps'].isnull(), 0, data_rawa['ps'])

# book equity
data_rawa['be'] = data_rawa['seq'] + data_rawa['txditc'] - data_rawa['ps']
data_rawa['be'] = np.where(data_rawa['be'] > 0, data_rawa['be'], np.nan)

# acc
data_rawa['act_l1'] = data_rawa.groupby(['permno'])['act'].shift(1)
data_rawa['lct_l1'] = data_rawa.groupby(['permno'])['lct'].shift(1)

# FIX: a misplaced parenthesis previously divided only the lagged (act_l1-lct_l1)
# term by (10*be); the quarterly acc below shows the intended form.
# FIX: lagged np is now a within-permno lag (a plain .shift(1) leaked across firms).
data_rawa['np_l1'] = data_rawa.groupby(['permno'])['np'].shift(1)
condlist = [data_rawa['np'].isnull(),
            data_rawa['act'].isnull() | data_rawa['lct'].isnull()]
choicelist = [((data_rawa['act']-data_rawa['lct'])-(data_rawa['act_l1']-data_rawa['lct_l1']))/(10*data_rawa['be']),
              (data_rawa['ib']-data_rawa['oancf'])/(10*data_rawa['be'])]
data_rawa['acc'] = np.select(condlist,
                             choicelist,
                             default=((data_rawa['act']-data_rawa['lct']+data_rawa['np'])-
                                      (data_rawa['act_l1']-data_rawa['lct_l1']+data_rawa['np_l1']))/(10*data_rawa['be']))

# agr
data_rawa['at_l1'] = data_rawa.groupby(['permno'])['at'].shift(1)
data_rawa['agr'] = (data_rawa['at']-data_rawa['at_l1'])/data_rawa['at_l1']

# bm
# data_rawa['bm'] = data_rawa['be'] / data_rawa['me']

# cfp
# condlist = [data_rawa['dp'].isnull(),
#             data_rawa['ib'].isnull()]
# choicelist = [data_rawa['ib']/data_rawa['me'],
#               np.nan]
# data_rawa['cfp'] = np.select(condlist, choicelist, default=(data_rawa['ib']+data_rawa['dp'])/data_rawa['me'])

# ep, checked from Hou and change 'ME' from compustat to crsp
# data_rawa['ep'] = data_rawa['ib']/data_rawa['me']
# data_rawa['ep_n'] = data_rawa['ib']

# ni (log split-adjusted share growth)
# NOTE(review): this overwrites the Compustat net-income column 'ni' pulled by the
# query; 'roa' and 'poa' below then read the characteristic, not net income — confirm.
data_rawa['csho_l1'] = data_rawa.groupby(['permno'])['csho'].shift(1)
data_rawa['ajex_l1'] = data_rawa.groupby(['permno'])['ajex'].shift(1)
data_rawa['ni'] = np.where(data_rawa['gvkey'] != data_rawa['gvkey'].shift(1),
                           np.nan,
                           np.log(data_rawa['csho']*data_rawa['ajex']).replace(-np.inf, 0)-
                           np.log(data_rawa['csho_l1']*data_rawa['ajex_l1']).replace(-np.inf, 0))

# op: the formula seems different from Hou Page 74?
data_rawa['cogs0'] = np.where(data_rawa['cogs'].isnull(), 0, data_rawa['cogs'])
data_rawa['xint0'] = np.where(data_rawa['xint'].isnull(), 0, data_rawa['xint'])
data_rawa['xsga0'] = np.where(data_rawa['xsga'].isnull(), 0, data_rawa['xsga'])

condlist = [data_rawa['revt'].isnull(), data_rawa['be'].isnull()]
choicelist = [np.nan, np.nan]
data_rawa['op'] = np.select(condlist, choicelist,
                            default=(data_rawa['revt'] - data_rawa['cogs0'] - data_rawa['xsga0'] - data_rawa['xint0'])/data_rawa['be'])

# rsup
data_rawa['sale_l1'] = data_rawa.groupby(['permno'])['sale'].shift(1)
# data_rawa['rsup'] = (data_rawa['sale']-data_rawa['sale_l1'])/data_rawa['me']

# cash
data_rawa['cash'] = data_rawa['che']/data_rawa['at']

# lev
# data_rawa['lev'] = data_rawa['lt']/data_rawa['me']

# sp, checked
# data_rawa['sp'] = data_rawa['sale']/data_rawa['me']
# data_rawa['sp_n'] = data_rawa['sale']

# rd_sale
data_rawa['rd_sale'] = data_rawa['xrd']/data_rawa['sale']

# rdm
# data_rawa['rdm'] = data_rawa['xrd']/data_rawa['me']

# adm hxz adm, checked
# data_rawa['adm'] = data_rawa['xad']/data_rawa['me']

# gma
data_rawa['gma'] = (data_rawa['revt']-data_rawa['cogs'])/data_rawa['at_l1']

# chcsho
data_rawa['chcsho'] = (data_rawa['csho']/data_rawa['csho_l1'])-1

# lgr
data_rawa['lt_l1'] = data_rawa.groupby(['permno'])['lt'].shift(1)
data_rawa['lgr'] = (data_rawa['lt']/data_rawa['lt_l1'])-1

# pctacc
data_rawa['che_l1'] = data_rawa.groupby(['permno'])['che'].shift(1)
data_rawa['dlc_l1'] = data_rawa.groupby(['permno'])['dlc'].shift(1)
data_rawa['txp_l1'] = data_rawa.groupby(['permno'])['txp'].shift(1)

# balance-sheet accruals, used when oancf is missing:
# (d_act - d_che) - (d_lct - d_dlc - (d_txp - dp))
bs_acc = ((data_rawa['act'] - data_rawa['act_l1']) - (data_rawa['che'] - data_rawa['che_l1'])) - \
         ((data_rawa['lct'] - data_rawa['lct_l1']) - (data_rawa['dlc'] - data_rawa['dlc_l1']) -
          ((data_rawa['txp'] - data_rawa['txp_l1']) - data_rawa['dp']))
# FIX: the original tested `oancf.isnull() & ib==0` LAST, so it could never fire
# (np.select takes the first match) and it parsed as `(isnull() & ib) == 0` because
# & binds tighter than ==.  The original choicelist also subtracted `(dlc) - dlc_l1`
# instead of `(dlc - dlc_l1)` and attached the /|ib| deflator to only part of the
# expression.  Conditions are now ordered most-specific first with explicit parens.
condlist = [data_rawa['oancf'].isnull() & (data_rawa['ib'] == 0),
            data_rawa['ib'] == 0,
            data_rawa['oancf'].isnull()]
choicelist = [bs_acc/0.01,
              (data_rawa['ib']-data_rawa['oancf'])/0.01,
              bs_acc/data_rawa['ib'].abs()]
data_rawa['pctacc'] = np.select(condlist, choicelist, default=(data_rawa['ib']-data_rawa['oancf'])/data_rawa['ib'].abs())

# sgr
data_rawa['sgr'] = (data_rawa['sale']/data_rawa['sale_l1'])-1

# chato
data_rawa['at_l2'] = data_rawa.groupby(['permno'])['at'].shift(2)
data_rawa['chato'] = (data_rawa['sale']/((data_rawa['at']+data_rawa['at_l1'])/2))-\
                     (data_rawa['sale_l1']/((data_rawa['at']+data_rawa['at_l2'])/2))

# chtx
data_rawa['txt_l1'] = data_rawa.groupby(['permno'])['txt'].shift(1)
data_rawa['chtx'] = (data_rawa['txt']-data_rawa['txt_l1'])/data_rawa['at_l1']

# noa, checked
# FIX: the closing parenthesis previously attached /at_l1 to the operating-liability
# term only; net operating assets is (operating assets - operating liabilities)
# scaled by lagged assets, consistent with dnoa further below.
data_rawa['noa'] = ((data_rawa['at']-data_rawa['che']-data_rawa['ivao'].fillna(0))-
                    (data_rawa['at']-data_rawa['dlc'].fillna(0)-data_rawa['dltt'].fillna(0)-data_rawa['mib'].fillna(0)
                     -data_rawa['pstk'].fillna(0)-data_rawa['ceq']))/data_rawa['at_l1']

# rna
data_rawa['noa_l1'] = data_rawa.groupby(['permno'])['noa'].shift(1)
data_rawa['rna'] = data_rawa['oiadp']/data_rawa['noa_l1']

# pm
data_rawa['pm'] = data_rawa['oiadp']/data_rawa['sale']

# ato
data_rawa['ato'] = data_rawa['sale']/data_rawa['noa_l1']

# depr
data_rawa['depr'] = data_rawa['dp']/data_rawa['ppent']

# invest
data_rawa['ppent_l1'] = data_rawa.groupby(['permno'])['ppent'].shift(1)
data_rawa['invt_l1'] = data_rawa.groupby(['permno'])['invt'].shift(1)

# NOTE(review): the non-null-ppegt branch mixes ppegt with the lagged ppent
# (ppegt - ppent_l1) — looks like it should be a lagged ppegt; confirm.
data_rawa['invest'] = np.where(data_rawa['ppegt'].isnull(), ((data_rawa['ppent']-data_rawa['ppent_l1'])+
                               (data_rawa['invt']-data_rawa['invt_l1']))/data_rawa['at_l1'],
                               ((data_rawa['ppegt']-data_rawa['ppent_l1'])+(data_rawa['invt']-data_rawa['invt_l1']))/data_rawa['at_l1'])

# egr
data_rawa['ceq_l1'] = data_rawa.groupby(['permno'])['ceq'].shift(1)
data_rawa['egr'] = ((data_rawa['ceq']-data_rawa['ceq_l1'])/data_rawa['ceq_l1'])

# cashdebt
data_rawa['cashdebt'] = (data_rawa['ib']+data_rawa['dp'])/((data_rawa['lt']+data_rawa['lt_l1'])/2)

# rd
# if ((xrd/at)-(lag(xrd/lag(at))))/(lag(xrd/lag(at))) >.05 then rd=1 else rd=0
data_rawa['xrd/at_l1'] = data_rawa['xrd']/data_rawa['at_l1']
data_rawa['xrd/at_l1_l1'] = data_rawa.groupby(['permno'])['xrd/at_l1'].shift(1)
data_rawa['rd'] = np.where(((data_rawa['xrd']/data_rawa['at'])-
                            (data_rawa['xrd/at_l1_l1']))/data_rawa['xrd/at_l1_l1']>0.05, 1, 0)

# roa
data_rawa['roa'] = data_rawa['ni']/((data_rawa['at']+data_rawa['at_l1'])/2)

# roe
data_rawa['roe'] = data_rawa['ib']/data_rawa['ceq_l1']

# dy
# data_rawa['dy'] = data_rawa['dvt']/data_rawa['me']

################## Added on 2020.07.28 ##################

# roic
data_rawa['roic'] = (data_rawa['ebit'] - data_rawa['nopi'])/(data_rawa['ceq'] + data_rawa['lt'] - data_rawa['che'])

# chinv
data_rawa['chinv'] = (data_rawa['invt'] - data_rawa['invt_l1'])/((data_rawa['at'] + data_rawa['at_l2'])/2)

# pchsale_pchinvt
data_rawa['pchsale_pchinvt'] = ((data_rawa['sale'] - data_rawa['sale_l1'])/data_rawa['sale_l1'])\
                               - ((data_rawa['invt']-data_rawa['invt_l1'])/data_rawa['invt_l1'])

# pchsale_pchrect
data_rawa['rect_l1'] = data_rawa.groupby(['permno'])['rect'].shift(1)
data_rawa['pchsale_pchrect'] = ((data_rawa['sale']-data_rawa['sale_l1'])/data_rawa['sale_l1'])\
                               - ((data_rawa['rect']-data_rawa['rect_l1'])/data_rawa['rect_l1'])

# pchgm_pchsale
data_rawa['cogs_l1'] = data_rawa.groupby(['permno'])['cogs'].shift(1)
data_rawa['pchgm_pchsale'] = (((data_rawa['sale']-data_rawa['cogs'])
                               - (data_rawa['sale_l1']-data_rawa['cogs_l1']))/(data_rawa['sale_l1']-data_rawa['cogs_l1']))\
                             - ((data_rawa['sale']-data_rawa['sale_l1'])/data_rawa['sale'])

# pchsale_pchxsga
data_rawa['xsga_l1'] = data_rawa.groupby(['permno'])['xsga'].shift(1)
data_rawa['pchsale_pchxsga'] = ((data_rawa['sale']-data_rawa['sale_l1'])/data_rawa['sale_l1'])\
                               - ((data_rawa['xsga']-data_rawa['xsga_l1'])/data_rawa['xsga_l1'])

# pchdepr
data_rawa['dp_l1'] = data_rawa.groupby(['permno'])['dp'].shift(1)
data_rawa['pchdepr'] = ((data_rawa['dp']/data_rawa['ppent'])-(data_rawa['dp_l1']
                                                              /data_rawa['ppent_l1']))\
                       / (data_rawa['dp_l1']/data_rawa['ppent'])

# chadv
data_rawa['xad_l1'] = data_rawa.groupby(['permno'])['xad'].shift(1)
data_rawa['chadv'] = np.log(data_rawa['xad'] + 1) - np.log(data_rawa['xad_l1'] + 1)

# pchcapx
data_rawa['capx_l1'] = data_rawa.groupby(['permno'])['capx'].shift(1)
data_rawa['pchcapx'] = (data_rawa['capx']-data_rawa['capx_l1'])/data_rawa['capx_l1']

# grcapx
data_rawa['capx_l2'] = data_rawa.groupby(['permno'])['capx'].shift(2)
data_rawa['grcapx'] = (data_rawa['capx']-data_rawa['capx_l2'])/data_rawa['capx_l2']

# grGW
data_rawa['gdwl_l1'] = data_rawa.groupby(['permno'])['gdwl'].shift(1)
data_rawa['grGW'] = (data_rawa['gdwl']-data_rawa['gdwl_l1'])/data_rawa['gdwl']
condlist = [(data_rawa['gdwl']==0) | (data_rawa['gdwl'].isnull()),
            (data_rawa['gdwl'].notna()) & (data_rawa['gdwl'] != 0) & (data_rawa['grGW'].isnull())]
choicelist = [0, 1]
data_rawa['grGW'] = np.select(condlist, choicelist, default=data_rawa['grGW'])

# currat
data_rawa['currat'] = data_rawa['act']/data_rawa['lct']

# pchcurrat
data_rawa['pchcurrat'] = ((data_rawa['act']/data_rawa['lct'])-(data_rawa['act_l1']/data_rawa['lct_l1']))\
                         /(data_rawa['act_l1']/data_rawa['lct_l1'])

# quick
data_rawa['quick'] = (data_rawa['act']-data_rawa['invt'])/data_rawa['lct']

# pchquick
data_rawa['pchquick'] = ((data_rawa['act']-data_rawa['invt'])/data_rawa['lct']
                         -(data_rawa['act_l1']-data_rawa['invt_l1'])/data_rawa['lct_l1'])\
                        /((data_rawa['act_l1']-data_rawa['invt_l1'])/data_rawa['lct_l1'])

# salecash
data_rawa['salecash'] = data_rawa['sale']/data_rawa['che']

# salerec
data_rawa['salerec'] = data_rawa['sale']/data_rawa['rect']

# saleinv
data_rawa['saleinv'] = data_rawa['sale']/data_rawa['invt']

# pchsaleinv
data_rawa['pchsaleinv'] = ((data_rawa['sale']/data_rawa['invt'])-(data_rawa['sale_l1']/data_rawa['invt_l1']))\
                          /(data_rawa['sale_l1']/data_rawa['invt_l1'])

# realestate
data_rawa['realestate'] = (data_rawa['fatb']+data_rawa['fatl'])/data_rawa['ppegt']
data_rawa['realestate'] = np.where(data_rawa['ppegt'].isnull(),
                                   (data_rawa['fatb']+data_rawa['fatl'])/data_rawa['ppent'], data_rawa['realestate'])

# obklg
data_rawa['obklg'] = data_rawa['ob']/((data_rawa['at']+data_rawa['at_l1'])/2)

# chobklg
data_rawa['ob_l1'] = data_rawa.groupby(['permno'])['ob'].shift(1)
data_rawa['chobklg'] = (data_rawa['ob'] - data_rawa['ob_l1'])/((data_rawa['at']+data_rawa['at_l1'])/2)

# grltnoa
data_rawa['aco_l1'] = data_rawa.groupby(['permno'])['aco'].shift(1)
data_rawa['intan_l1'] = data_rawa.groupby(['permno'])['intan'].shift(1)
data_rawa['ao_l1'] = data_rawa.groupby(['permno'])['ao'].shift(1)
data_rawa['ap_l1'] = data_rawa.groupby(['permno'])['ap'].shift(1)
data_rawa['lco_l1'] = data_rawa.groupby(['permno'])['lco'].shift(1)
data_rawa['lo_l1'] = data_rawa.groupby(['permno'])['lo'].shift(1)
data_rawa['rect_l1'] = data_rawa.groupby(['permno'])['rect'].shift(1)

data_rawa['grltnoa'] = ((data_rawa['rect']+data_rawa['invt']+data_rawa['ppent']+data_rawa['aco']+data_rawa['intan']+
                         data_rawa['ao']-data_rawa['ap']-data_rawa['lco']-data_rawa['lo'])
                        -(data_rawa['rect_l1']+data_rawa['invt_l1']+data_rawa['ppent_l1']+data_rawa['aco_l1']
                          +data_rawa['intan_l1']+data_rawa['ao_l1']-data_rawa['ap_l1']-data_rawa['lco_l1']
                          -data_rawa['lo_l1'])
                        -(data_rawa['rect']-data_rawa['rect_l1']+data_rawa['invt']-data_rawa['invt_l1']
                          +data_rawa['aco']-data_rawa['aco_l1']
                          -(data_rawa['ap']-data_rawa['ap_l1']+data_rawa['lco']-data_rawa['lco_l1'])-data_rawa['dp']))\
                       /((data_rawa['at']+data_rawa['at_l1'])/2)

# conv
# NOTE(review): 'dc' is never created in this chunk (the query pulls dcvt/dcpstk/
# cshrc); unless it is defined upstream this line raises KeyError — confirm.
data_rawa['conv'] = data_rawa['dc']/data_rawa['dltt']

# chdrc
# NOTE(review): same concern for 'dr' (the query pulls drc and drlt) — confirm.
data_rawa['dr_l1'] = data_rawa.groupby(['permno'])['dr'].shift(1)
data_rawa['chdrc'] = (data_rawa['dr']-data_rawa['dr_l1'])/((data_rawa['at']+data_rawa['at_l1'])/2)

# rdbias
data_rawa['xrd_l1'] = data_rawa.groupby(['permno'])['xrd'].shift(1)
data_rawa['rdbias'] = (data_rawa['xrd']/data_rawa['xrd_l1'])-1-data_rawa['ib']/data_rawa['ceq_l1']

# operprof
data_rawa['operprof'] = (data_rawa['revt']-data_rawa['cogs']-data_rawa['xsga0']-data_rawa['xint0'])/data_rawa['ceq_l1']

# cfroa
data_rawa['cfroa'] = data_rawa['oancf']/((data_rawa['at']+data_rawa['at_l1'])/2)
data_rawa['cfroa'] = np.where(data_rawa['oancf'].isnull(),
                              (data_rawa['ib'] + data_rawa['dp'])/((data_rawa['at']+data_rawa['at_l1'])/2),
                              data_rawa['cfroa'])

# xrdint
data_rawa['xrdint'] = data_rawa['xrd']/((data_rawa['at']+data_rawa['at_l1'])/2)

# capxint
data_rawa['capxint'] = data_rawa['capx']/((data_rawa['at']+data_rawa['at_l1'])/2)

# xadint
data_rawa['xadint'] = data_rawa['xad']/((data_rawa['at']+data_rawa['at_l1'])/2)

# chpm
data_rawa['ib_l1'] = data_rawa.groupby(['permno'])['ib'].shift(1)
data_rawa['chpm'] = (data_rawa['ib']/data_rawa['sale'])-(data_rawa['ib_l1']/data_rawa['sale_l1'])

# ala
data_rawa['ala'] = data_rawa['che']+0.75*(data_rawa['act']-data_rawa['che'])-\
                   0.5*(data_rawa['at']-data_rawa['act']-data_rawa['gdwl']-data_rawa['intan'])

# alm
data_rawa['alm'] = data_rawa['ala']/(data_rawa['at']+data_rawa['prcc_f']*data_rawa['csho']-data_rawa['ceq'])

# hire
data_rawa['emp_l1'] = data_rawa.groupby(['permno'])['emp'].shift(1)
data_rawa['hire'] = (data_rawa['emp'] - data_rawa['emp_l1'])/data_rawa['emp_l1']
data_rawa['hire'] = np.where((data_rawa['emp'].isnull()) | (data_rawa['emp_l1'].isnull()), 0, data_rawa['hire'])

# herf: industry sales concentration per Fama-French 49 industry
data_rawa['sic'] = data_rawa['sic'].astype(int)
data_rawa['ffi49'] = ffi49(data_rawa)
data_rawa['ffi49'] = data_rawa['ffi49'].fillna(49)
data_rawa['ffi49'] = data_rawa['ffi49'].astype(int)
df_temp = data_rawa.groupby(['datadate', 'ffi49'], as_index=False)['sale'].sum()
df_temp = df_temp.rename(columns={'sale': 'indsale'})
data_rawa = pd.merge(data_rawa, df_temp, how='left', on=['datadate', 'ffi49'])
data_rawa['herf'] = (data_rawa['sale']/data_rawa['indsale'])*(data_rawa['sale']/data_rawa['indsale'])
df_temp = data_rawa.groupby(['datadate', 'ffi49'], as_index=False)['herf'].sum()
data_rawa = data_rawa.drop(['herf'], axis=1)
data_rawa = pd.merge(data_rawa, df_temp, how='left', on=['datadate', 'ffi49'])

################################## Added on 2020.10.29 ##################################
# NOTE(review): several lags in this section use bare .shift(n) (no groupby on
# permno), which crosses firm boundaries — confirm whether that is intended.
# Bmj
data_rawa['be_per'] = data_rawa['be'] / data_rawa['csho']
data_rawa['bmj'] = data_rawa['be_per'] / data_rawa['prc']
############### *Q*: used prc as share price from crsp ##########

# Cp
data_rawa['cf'] = data_rawa['ib'] + data_rawa['dp']
# data_rawa['cp'] = data_rawa['cf'] / data_rawa['me']

# Dp
###### *Q* difference return with without divident

# Dur
# me = data_rawa['me_comp']

# Ebp
data_rawa['dvpa'] = np.where(data_rawa['dvpa'].isnull(), 0, data_rawa['dvpa'])
data_rawa['tstkp'] = np.where(data_rawa['tstkp'].isnull(), 0, data_rawa['tstkp'])
data_rawa['f_liab'] = data_rawa['dltt'] + data_rawa['dlc'] + data_rawa['pstk'] + data_rawa['dvpa'] - data_rawa['tstkp']
data_rawa['f_asse'] = data_rawa['che']
# net debt : = financial liabilities - financial assets.
data_rawa['n_debt'] = data_rawa['f_liab'] - data_rawa['f_asse']
data_rawa['ber'] = data_rawa['ceq'] + data_rawa['tstkp'] - data_rawa['dvpa']
# data_rawa['ebp'] = (data_rawa['n_debt']+data_rawa['ber']) / (data_rawa['n_debt']+data_rawa['me'])

# Em
# data_rawa['enteprs_v'] = data_rawa['me'] + data_rawa['dlc'] + data_rawa['dltt'] + data_rawa['pstkrv'] - data_rawa['che']
# data_rawa['em'] = data_rawa['enteprs_v'] / data_rawa['oibdp']

############### Investment ###############
# Aci
data_rawa['ce'] = data_rawa['capx'] / data_rawa['sale']
data_rawa['ce1'] = data_rawa['ce'].shift(1)
data_rawa['ce2'] = data_rawa['ce'].shift(2)
data_rawa['ce3'] = data_rawa['ce'].shift(3)
data_rawa['aci'] = data_rawa['ce']/ (data_rawa['ce1']+data_rawa['ce2']+data_rawa['ce3'])-1

# Cei
# data_rawa['lg_me'] = np.log(data_rawa['me']/data_rawa['me'].shift(6))
# data_rawa['lg_ret'] = np.log(data_rawa['ret']*data_rawa['ret'].shift(1)*data_rawa['ret'].shift(2)*data_rawa['ret'].shift(3)*data_rawa['ret'].shift(5)*data_rawa['ret'].shift(6))
# data_rawa['cei'] = data_rawa['lg_me'] - data_rawa['lg_ret']

# Dac

# dCoa
data_rawa['coa'] = data_rawa['act'] - data_rawa['che']
data_rawa['dcoa'] = (data_rawa['coa']-data_rawa['coa'].shift(1)) / data_rawa['at'].shift(1)

# dBe
data_rawa['dBe'] = (data_rawa['ceq'] - data_rawa['ceq'].shift(1)) / data_rawa['at'].shift(1)

# dFnl & dFin
data_rawa['fna'] = data_rawa['ivst'] + data_rawa['ivao']
data_rawa['fnl'] = data_rawa['dltt'] + data_rawa['dlc'] + data_rawa['pstk']

data_rawa['d_dlc'] = data_rawa['dlc'] - data_rawa['dlc'].shift(1)
data_rawa['d_dlc'] = np.where(data_rawa['d_dlc'].isnull(), 0, data_rawa['d_dlc'])
data_rawa['d_pstk'] = data_rawa['pstk'] - data_rawa['pstk'].shift(1)
data_rawa['d_pstk'] = np.where(data_rawa['d_pstk'].isnull(), 0, data_rawa['d_pstk'])

data_rawa['dfnl'] = (data_rawa['dltt']-data_rawa['dltt'].shift(1)) + data_rawa['d_dlc'] + data_rawa['d_pstk']

data_rawa['d_ivst'] = data_rawa['ivst'] - data_rawa['ivst'].shift(1)
data_rawa['d_ivst'] = np.where(data_rawa['d_ivst'].isnull(), 0, data_rawa['d_ivst'])
data_rawa['d_ivao'] = data_rawa['ivao'] - data_rawa['ivao'].shift(1)
data_rawa['d_ivao'] = np.where(data_rawa['d_ivao'].isnull(), 0, data_rawa['d_ivao'])

data_rawa['dfna'] = data_rawa['d_ivst'] + data_rawa['d_ivao']
data_rawa['dfin'] = data_rawa['dfna'] - data_rawa['dfnl']

data_rawa['dfin'] = data_rawa['dfin'] / data_rawa['at'].shift(1)
data_rawa['dfnl'] = data_rawa['dfnl'] / data_rawa['at'].shift(1)

# dIi
data_rawa['e_invt'] = (data_rawa['capxv'] + data_rawa['capxv'].shift(1))/2
data_rawa['dinvt'] = (data_rawa['capxv'] - data_rawa['e_invt']) / data_rawa['e_invt']

data_rawa['ind'] = data_rawa['capxv']
# FIX: reset_index so the grouped sum becomes a mergeable DataFrame with jdate/sic2
# columns; merging the raw MultiIndex-keyed Series on those keys raises.  The merge
# then yields ind_x/ind_y as the comment below expects.
s = data_rawa.groupby(['jdate', 'sic2'])['ind'].sum().reset_index()
data_rawa = pd.merge(data_rawa, s, on=['jdate', 'sic2'])
# new industry investment will be named as ind_y, cause it's been grouped by ind
data_rawa['e_ind'] = (data_rawa['ind_y'] + data_rawa['ind_y'].shift(1))/2
data_rawa['dind'] = (data_rawa['ind_y']-data_rawa['e_ind']) / data_rawa['e_ind']
data_rawa['dIi'] = data_rawa['dinvt'] - data_rawa['dind']

# dLno
data_rawa['dlno'] = (data_rawa['ppent']-data_rawa['ppent'].shift(1)) + (data_rawa['intan']-data_rawa['intan'].shift(1)) + (data_rawa['ao']-data_rawa['ao'].shift(1)) - (data_rawa['lo']-data_rawa['lo'].shift(1)) + data_rawa['dp']
# FIX: the original built avg_at with a Python loop doing data_rawa.loc[0:i,'at'].mean()
# for every row — an O(n^2) pass computing the expanding mean of 'at' (the merge above
# left a fresh RangeIndex, so the label slice is a positional prefix).  expanding()
# produces the identical series in one pass.
# NOTE(review): an expanding mean over the whole panel (ignoring permno) looks odd
# for an average-assets deflator — confirm the intended definition.
data_rawa['avg_at'] = data_rawa['at'].expanding().mean()
data_rawa['dlno'] = data_rawa['dlno'] / data_rawa['avg_at']

# dNco
data_rawa['nca'] = data_rawa['at'] - data_rawa['act'] - data_rawa['ivao']
data_rawa['ncl'] = data_rawa['lt'] - data_rawa['lct'] - data_rawa['dltt']
data_rawa['nco'] = data_rawa['nca'] - data_rawa['ncl']
data_rawa['dnco'] = data_rawa['nco'] - data_rawa['nco'].shift(1)

# dNca (recomputes nca/ncl/nco with missing ivao/dltt treated as zero)
data_rawa['ivao_0'] = np.where(data_rawa['ivao'].isnull(), 0, data_rawa['ivao'])
data_rawa['dltt_0'] = np.where(data_rawa['dltt'].isnull(), 0, data_rawa['dltt'])

data_rawa['nca'] = data_rawa['at'] - data_rawa['act'] - data_rawa['ivao_0']
data_rawa['ncl'] = data_rawa['lt'] - data_rawa['lct'] - data_rawa['dltt_0']
data_rawa['nco'] = data_rawa['nca'] - data_rawa['ncl']
data_rawa['dnca'] = data_rawa['nco'] - data_rawa['nco'].shift(1)

# dNoa
data_rawa['dlc_0'] = np.where(data_rawa['dlc'].isnull(), 0, data_rawa['dlc'])
data_rawa['mib_0'] = np.where(data_rawa['mib'].isnull(), 0, data_rawa['mib'])
data_rawa['pstk_0'] = np.where(data_rawa['pstk'].isnull(), 0, data_rawa['pstk'])

data_rawa['op_at'] = data_rawa['at'] - data_rawa['che']
data_rawa['op_lia'] = data_rawa['at'] - data_rawa['dlc_0'] - data_rawa['dltt_0'] - data_rawa['mib_0'] - data_rawa['pstk_0'] - data_rawa['ceq']
data_rawa['net_op'] = data_rawa['op_at'] - data_rawa['op_lia']
data_rawa['dnoa'] = (data_rawa['net_op']-data_rawa['net_op'].shift(1))/ data_rawa['at'].shift(1)

# dPia
data_rawa['c_propty'] = data_rawa['ppegt'] - data_rawa['ppegt'].shift(1)
data_rawa['c_invt'] = data_rawa['invt'] - data_rawa['invt'].shift(1)
data_rawa['dpia'] = (data_rawa['c_propty'] + data_rawa['c_invt']) / data_rawa['at'].shift(1)

######### Profitability ##########
# Ato,repeated
# data_rawa['op_at'] = data_rawa['at'] - data_rawa['che'] - data_rawa['ivao_0']
# data_rawa['op_lia'] = data_rawa['dlc_0'] - data_rawa['dltt_0'] - data_rawa['mib_0'] - data_rawa['pstk_0'] - data_rawa['ceq']
# data_rawa['noa'] = data_rawa['op_at'] - data_rawa['op_lia']
# data_rawa['ato'] = data_rawa['sale'] / data_rawa['noa'].shift(1)

# Cla
data_rawa['d_rect'] = data_rawa['rect'] - data_rawa['rect'].shift(1)
data_rawa['d_invt'] = data_rawa['invt'] - data_rawa['invt'].shift(1)
data_rawa['d_xpp'] = data_rawa['xpp'] - data_rawa['xpp'].shift(1)
data_rawa['d_dr'] = (data_rawa['drc']-data_rawa['drc'].shift(1)) + (data_rawa['drlt']-data_rawa['drlt'].shift(1))
data_rawa['d_ap'] = data_rawa['ap'] - data_rawa['ap'].shift(1)
data_rawa['d_xacc'] = data_rawa['xacc'] - data_rawa['xacc'].shift(1)

data_rawa['xrd_0'] = np.where(data_rawa['xrd'].isnull(), 0, data_rawa['xrd'])
data_rawa['d_rect_0'] = np.where(data_rawa['d_rect'].isnull(), 0, data_rawa['d_rect'])
data_rawa['d_invt_0'] = np.where(data_rawa['d_invt'].isnull(), 0, data_rawa['d_invt'])
data_rawa['d_xpp_0'] = np.where(data_rawa['d_xpp'].isnull(), 0, data_rawa['d_xpp'])
data_rawa['d_dr_0'] = np.where(data_rawa['d_dr'].isnull(), 0, data_rawa['d_dr'])
data_rawa['d_ap_0'] = np.where(data_rawa['d_ap'].isnull(), 0, data_rawa['d_ap'])
data_rawa['d_xacc_0'] = np.where(data_rawa['d_xacc'].isnull(), 0, data_rawa['d_xacc'])

data_rawa['cla'] = data_rawa['revt'] - data_rawa['cogs'] - data_rawa['xsga'] + data_rawa['xrd_0']\
                   - data_rawa['d_rect_0'] - data_rawa['d_invt_0'] - data_rawa['d_xpp_0']\
                   + data_rawa['d_dr_0'] + data_rawa['d_ap_0'] + data_rawa['d_xacc_0']
data_rawa['cla'] = data_rawa['cla'] / data_rawa['at'].shift(1)

# Cop (same cash-based operating profit, scaled by current at)
data_rawa['cop'] = data_rawa['revt'] - data_rawa['cogs'] - data_rawa['xsga'] + data_rawa['xrd_0']\
                   - data_rawa['d_rect_0'] - data_rawa['d_invt_0'] - data_rawa['d_xpp_0']\
                   + data_rawa['d_dr_0'] + data_rawa['d_ap_0'] + data_rawa['d_xacc_0']
data_rawa['cop'] = data_rawa['cop'] / data_rawa['at']

# Cto
data_rawa['cto'] = data_rawa['sale'] / data_rawa['at'].shift(1)

# ir
'''
#First calculate r(t-5,t). Then rb(t-5,t) and use Bm to perform linear regression and get residue
'''
# r(t-5,t): sum ret from t-5 to t (which is calendar year t-6 to t-1)
lag = pd.DataFrame()
for i in range(1, 6):
    lag['ret%s' % i] = data_rawa.groupby(['permno'])['ret'].shift(i)

data_rawa['ret5'] = lag['ret1']+lag['ret2']+lag['ret3']+lag['ret4']+lag['ret5']

# bm_t-5 (bm of year t-5)
# data_rawa['bm5'] = data_rawa.groupby(['permno'])['bm'].shift(5)

# rB (five year log book return)
# Reference: jf_06 page8 by KENT DANIEL
# data_rawa['rB'] = data_rawa['bm'] - data_rawa['bm5'] + data_rawa['ret5']

# Regression and get ir
# First get unique datelist
# datelist = data_rawa['jdate'].unique()
# for date in datelist:
#     temp = data_rawa['jdate' == date]
#     n_row = temp.shape[0]
#     index = temp.index
#     X = pd.DataFrame()
#     X['bm5'] = temp['bm5']
#     X['rB'] = temp['rB']
#     X['intercept'] = 1
#     X = X[['intercept','rB','bm5']]
#     X = np.mat(X)
#     Y = np.mat(temp[['ret5']])
#     # These are residuals on one date
#     res = (np.identity(n_row) - X.dot(X.T.dot(X).I).dot(X.T)).dot(Y)
#     # put residuals back into data_rawa
#     data_rawa.loc[index,'ir'] = res

# nop
# data_rawa['net_p'] = data_rawa['dvc'] + data_rawa['prstkc'] + 2*data_rawa['pstkrv'] - data_rawa['sstk']
# data_rawa['nop'] = data_rawa['net_p'] / data_rawa['me']
# data_rawa['nop'] = np.where(data_rawa['nop']<=0, np.nan, data_rawa['nop'] )

# ocp
# data_rawa['ocy'] = np.where(data_rawa['jdate'] < '1988-06-30', data_rawa['fopt'] - data_rawa['wcap'], data_rawa['fopt'] - data_rawa['oancf'])
# data_rawa['ocp'] = data_rawa['ocy'] / data_rawa['me']
# data_rawa['ocp'] = np.where(data_rawa['ocp']<=0, np.nan, data_rawa['ocp'] )

# dwc
data_rawa['dwc'] = (data_rawa['act'] - data_rawa['che']) - (data_rawa['lct'] - data_rawa['dlc'])
# data_rawa['dwc'] = data_rawa['dwc']/data_rawa['at_l1']

# I/A
data_rawa['ia'] = (data_rawa['at']/data_rawa['at_l1'])-1

# Ig
data_rawa['capx_l1'] = data_rawa.groupby('permno')['capx'].shift(1)
data_rawa['ig'] = data_rawa['capx']/data_rawa['capx_l1']

# 2Ig
data_rawa['capx_l2'] = data_rawa.groupby('permno')['capx'].shift(2)
data_rawa['2ig'] = data_rawa['capx']/data_rawa['capx_l2']

# Ivc
data_rawa['atAvg'] = (data_rawa['at']+data_rawa['at_l1'])/2
data_rawa['ivc'] = data_rawa['invt'] / data_rawa['atAvg']

# Ndf
data_rawa['ndf'] = data_rawa['dltis'] - data_rawa['dltr'] + data_rawa['dlcch']

# nsi
data_rawa['sps'] = data_rawa['csho'] * data_rawa['ajex']
data_rawa['sps_l1'] = data_rawa.groupby('permno')['sps'].shift(1)
data_rawa['nsi'] = np.log(data_rawa['sps']/data_rawa['sps_l1'])

# oa
data_rawa['txp'] = np.where(data_rawa['txp'].isnull(), 0, data_rawa['txp'])
data_rawa['oa'] = (data_rawa['act'] - data_rawa['che']) - (data_rawa['lct'] - data_rawa['dlc'] - data_rawa['txp']) - data_rawa['dp']

# Poa
data_rawa['poa'] = data_rawa['oa']/data_rawa['ni']

# Ta
data_rawa['ta'] = data_rawa['dwc'] + data_rawa['dnco'] + data_rawa['dfin']

# Ol
data_rawa['ol'] = (data_rawa['cogs'] + data_rawa['xsga'])/data_rawa['at']

# etr
data_rawa['txtpi'] = data_rawa['txt'] / data_rawa['pi']
data_rawa['txtpi_l1'] = data_rawa.groupby('permno')['txtpi'].shift(1)
data_rawa['txtpi_l2'] = data_rawa.groupby('permno')['txtpi'].shift(2)
data_rawa['txtpi_l3'] = data_rawa.groupby('permno')['txtpi'].shift(3)
data_rawa['deps'] = data_rawa['epspx']/(data_rawa['ajex'] * data_rawa['prcc_f'])
data_rawa['etr'] = (data_rawa['txtpi'] - (data_rawa['txtpi_l1'] + data_rawa['txtpi_l2'] + data_rawa['txtpi_l3'])/3) * data_rawa['deps']

print('annual')
#######################################################################################################################
#                                          Compustat Quarterly Raw Info                                               #
#######################################################################################################################
comp = conn.raw_sql("""
                    /*header info*/
                    select c.gvkey, f.cusip, f.datadate, f.fyearq, substr(c.sic,1,2) as sic2, c.sic, f.fqtr, f.rdq,

                    /*income statement*/
                    f.ibq, f.saleq, f.txtq, f.revtq, f.cogsq, f.xsgaq, f.revty, f.cogsy, f.saley,

                    /*balance sheet items*/
                    f.atq, f.actq, f.cheq, f.lctq, f.dlcq, f.ppentq, f.ppegtq,

                    /*others*/
                    abs(f.prccq) as prccq, abs(f.prccq)*f.cshoq as mveq_f, f.ceqq, f.seqq, f.pstkq, f.ltq,
                    f.pstkrq, f.gdwlq, f.intanq, f.mibq, f.oiadpq, f.ivaoq,

                    /* v3 my formula add*/
                    f.ajexq, f.cshoq, f.txditcq, f.npq, f.xrdy, f.xrdq, f.dpq, f.xintq, f.invtq, f.scstkcy, f.niq,
                    f.oancfy, f.dlttq, f.rectq, f.acoq, f.apq, f.lcoq, f.loq, f.aoq

                    from comp.fundq as f
                    left join comp.company as c
                    on f.gvkey = c.gvkey

                    /*get consolidated, standardized, industrial format statements*/
                    where f.indfmt = 'INDL'
                    and f.datafmt = 'STD'
                    and f.popsrc = 'D'
                    and f.consol = 'C'
                    and f.datadate >= '01/01/1959'
                    """)

# comp['cusip6'] = comp['cusip'].str.strip().str[0:6]
comp = comp.dropna(subset=['ibq'])

# sort and clean up
comp = comp.sort_values(by=['gvkey', 'datadate']).drop_duplicates()
comp['cshoq'] = np.where(comp['cshoq'] == 0, np.nan, comp['cshoq'])
comp['ceqq'] = np.where(comp['ceqq'] == 0, np.nan, comp['ceqq'])
comp['atq'] = np.where(comp['atq'] == 0, np.nan, comp['atq'])
comp = comp.dropna(subset=['atq'])

# convert datadate to date fmt
comp['datadate'] = pd.to_datetime(comp['datadate'])

# merge ccm and comp
ccm1 = pd.merge(comp, ccm, how='left', on=['gvkey'])
ccm1['yearend'] = ccm1['datadate'] + YearEnd(0)
ccm1['jdate'] = ccm1['datadate'] + MonthEnd(3)  # we change quarterly lag here
# ccm1['jdate'] = ccm1['datadate']+MonthEnd(4)

# set link date bounds
ccm2 = ccm1[(ccm1['jdate'] >= ccm1['linkdt']) & (ccm1['jdate'] <= ccm1['linkenddt'])]

# merge ccm2 and crsp2
# crsp2['jdate'] = crsp2['monthend']
data_rawq = pd.merge(crsp2, ccm2, how='inner', on=['permno', 'jdate'])

# filter exchcd & shrcd
data_rawq = data_rawq[((data_rawq['exchcd'] == 1) | (data_rawq['exchcd'] == 2) | (data_rawq['exchcd'] == 3)) &
                      ((data_rawq['shrcd'] == 10) | (data_rawq['shrcd'] == 11))]

# process Market Equity
'''
Note: me is CRSP market equity, mveq_f is Compustat market equity. Please choose the me below.
'''
data_rawq['me'] = data_rawq['me']/1000  # CRSP ME
# data_rawq['me'] = data_rawq['mveq_f']  # Compustat ME

# there are some ME equal to zero since this company do not have price or shares data, we drop these observations
data_rawq['me'] = np.where(data_rawq['me'] == 0, np.nan, data_rawq['me'])
data_rawq = data_rawq.dropna(subset=['me'])

# count single stock years
# data_rawq['count'] = data_rawq.groupby(['gvkey']).cumcount()

# deal with the duplicates (same scheme as the annual block above)
data_rawq.loc[data_rawq.groupby(['datadate', 'permno', 'linkprim'], as_index=False).nth([0]).index, 'temp'] = 1
data_rawq = data_rawq[data_rawq['temp'].notna()]
data_rawq.loc[data_rawq.groupby(['permno', 'yearend', 'datadate'], as_index=False).nth([-1]).index, 'temp'] = 1
data_rawq = data_rawq[data_rawq['temp'].notna()]

data_rawq = data_rawq.sort_values(by=['permno', 'jdate'])
print('quarterly raw')
#######################################################################################################################
#                                               Quarterly Variables                                                   #
#######################################################################################################################
# prepare be
data_rawq['beq'] = np.where(data_rawq['seqq']>0, data_rawq['seqq']+data_rawq['txditcq']-data_rawq['pstkq'], np.nan)
data_rawq['beq'] = np.where(data_rawq['beq']<=0, np.nan, data_rawq['beq'])

# dy
# data_rawq['me_l1'] = data_rawq.groupby(['permno'])['me'].shift(1)
# data_rawq['retdy'] = data_rawq['ret'] - data_rawq['retx']
# data_rawq['mdivpay'] = data_rawq['retdy']*data_rawq['me_l1']
#
# data_rawq['dy'] = ttm12(series='mdivpay', df=data_rawq)/data_rawq['me']

# chtx
data_rawq['txtq_l4'] = data_rawq.groupby(['permno'])['txtq'].shift(4)
data_rawq['atq_l4'] = data_rawq.groupby(['permno'])['atq'].shift(4)
data_rawq['chtx'] = (data_rawq['txtq']-data_rawq['txtq_l4'])/data_rawq['atq_l4']

# roa
data_rawq['atq_l1'] = data_rawq.groupby(['permno'])['atq'].shift(1)
data_rawq['roa'] = data_rawq['ibq']/data_rawq['atq_l1']

# cash
data_rawq['cash'] = data_rawq['cheq']/data_rawq['atq']

# acc
data_rawq['actq_l4'] = data_rawq.groupby(['permno'])['actq'].shift(4)
data_rawq['lctq_l4'] = data_rawq.groupby(['permno'])['lctq'].shift(4)
data_rawq['npq_l4'] = data_rawq.groupby(['permno'])['npq'].shift(4)
condlist = [data_rawq['npq'].isnull(),
            data_rawq['actq'].isnull() | data_rawq['lctq'].isnull()]
choicelist = [((data_rawq['actq']-data_rawq['lctq'])-(data_rawq['actq_l4']-data_rawq['lctq_l4']))/(10*data_rawq['beq']),
              np.nan]
data_rawq['acc'] = np.select(condlist, choicelist,
                             default=((data_rawq['actq']-data_rawq['lctq']+data_rawq['npq'])-
                                      (data_rawq['actq_l4']-data_rawq['lctq_l4']+data_rawq['npq_l4']))/(10*data_rawq['beq']))

# bm
# data_rawq['bm'] = data_rawq['beq']/data_rawq['me']

# cfp
data_rawq['ibq4'] = ttm4('ibq', data_rawq)
data_rawq['dpq4'] = ttm4('dpq', data_rawq)
# data_rawq['cfp'] = np.where(data_rawq['dpq'].isnull(),
#                             data_rawq['ibq4']/data_rawq['me'],
#                             (data_rawq['ibq4']+data_rawq['dpq4'])/data_rawq['me'])

# ep
# data_rawq['ep'] = data_rawq['ibq4']/data_rawq['me']

# agr
data_rawq['agr'] = (data_rawq['atq']-data_rawq['atq_l4'])/data_rawq['atq_l4']

# ni
data_rawq['cshoq_l4'] = data_rawq.groupby(['permno'])['cshoq'].shift(4)
data_rawq['ajexq_l4'] = data_rawq.groupby(['permno'])['ajexq'].shift(4)
# FIX: guard the lagged log term against -inf as well, matching the current term
# and the annual ni above.
data_rawq['ni'] = np.where(data_rawq['cshoq'].isnull(), np.nan,
                           np.log(data_rawq['cshoq']*data_rawq['ajexq']).replace(-np.inf, 0)-
                           np.log(data_rawq['cshoq_l4']*data_rawq['ajexq_l4']).replace(-np.inf, 0))

# op
data_rawq['xintq0'] = np.where(data_rawq['xintq'].isnull(), 0, data_rawq['xintq'])
data_rawq['xsgaq0'] = np.where(data_rawq['xsgaq'].isnull(), 0, data_rawq['xsgaq'])
data_rawq['beq_l4'] = data_rawq.groupby(['permno'])['beq'].shift(4)

data_rawq['op'] = (ttm4('revtq', data_rawq)-ttm4('cogsq', data_rawq)-ttm4('xsgaq0', data_rawq)-ttm4('xintq0', data_rawq))/data_rawq['beq_l4']

# csho
data_rawq['chcsho'] = (data_rawq['cshoq']/data_rawq['cshoq_l4'])-1

# cashdebt
data_rawq['ltq_l4'] = data_rawq.groupby(['permno'])['ltq'].shift(4)
data_rawq['cashdebt'] = (ttm4('ibq', data_rawq) + ttm4('dpq', data_rawq))/((data_rawq['ltq']+data_rawq['ltq_l4'])/2)

# rd
data_rawq['xrdq4'] = ttm4('xrdq', data_rawq)
data_rawq['xrdq4'] = np.where(data_rawq['xrdq4'].isnull(), data_rawq['xrdy'], data_rawq['xrdq4'])

data_rawq['xrdq4/atq_l4'] = data_rawq['xrdq4']/data_rawq['atq_l4']
data_rawq['xrdq4/atq_l4_l4'] = data_rawq.groupby(['permno'])['xrdq4/atq_l4'].shift(4)
data_rawq['rd'] = np.where(((data_rawq['xrdq4']/data_rawq['atq'])-data_rawq['xrdq4/atq_l4_l4'])/data_rawq['xrdq4/atq_l4_l4']>0.05, 1, 0)

# pctacc
condlist = [data_rawq['npq'].isnull(),
            data_rawq['actq'].isnull() | data_rawq['lctq'].isnull()]
choicelist = [((data_rawq['actq']-data_rawq['lctq'])-(data_rawq['actq_l4']-data_rawq['lctq_l4']))/abs(ttm4('ibq', data_rawq)), np.nan]
data_rawq['pctacc'] = np.select(condlist, choicelist,
                                default=((data_rawq['actq']-data_rawq['lctq']+data_rawq['npq'])-(data_rawq['actq_l4']-data_rawq['lctq_l4']+data_rawq['npq_l4']))/
                                        abs(ttm4('ibq', data_rawq)))

# gma
data_rawq['revtq4'] = ttm4('revtq', data_rawq)
data_rawq['cogsq4'] = ttm4('cogsq', data_rawq)
data_rawq['gma'] = (data_rawq['revtq4']-data_rawq['cogsq4'])/data_rawq['atq_l4']

# lev
# data_rawq['lev'] = data_rawq['ltq']/data_rawq['me']

# rdm
# data_rawq['rdm'] = data_rawq['xrdq4']/data_rawq['me']

# sgr
data_rawq['saleq4'] = ttm4('saleq', data_rawq)
data_rawq['saleq4'] = np.where(data_rawq['saleq4'].isnull(), data_rawq['saley'], data_rawq['saleq4'])

data_rawq['saleq4_l4'] = data_rawq.groupby(['permno'])['saleq4'].shift(4)
data_rawq['sgr'] = (data_rawq['saleq4']/data_rawq['saleq4_l4'])-1

# sp
# data_rawq['sp'] = data_rawq['saleq4']/data_rawq['me']

# invest
# (statement truncated at the bottom of this chunk; completed per the surrounding
# within-permno four-quarter lag pattern)
data_rawq['ppentq_l4'] = data_rawq.groupby(['permno'])['ppentq'].shift(4)
data_rawq.groupby(['permno'])['ppentq'].shift(4) +data_rawq['invtq_l4'] = data_rawq.groupby(['permno'])['invtq'].shift(4) +data_rawq['ppegtq_l4'] = data_rawq.groupby(['permno'])['ppegtq'].shift(4) + +data_rawq['invest'] = np.where(data_rawq['ppegtq'].isnull(), ((data_rawq['ppentq']-data_rawq['ppentq_l4'])+ + (data_rawq['invtq']-data_rawq['invtq_l4']))/data_rawq['atq_l4'], + ((data_rawq['ppegtq']-data_rawq['ppegtq_l4'])+(data_rawq['invtq']-data_rawq['invtq_l4']))/data_rawq['atq_l4']) + +# rd_sale +data_rawq['rd_sale'] = data_rawq['xrdq4']/data_rawq['saleq4'] + +# lgr +data_rawq['lgr'] = (data_rawq['ltq']/data_rawq['ltq_l4'])-1 + +# depr +data_rawq['depr'] = ttm4('dpq', data_rawq)/data_rawq['ppentq'] + +# egr +data_rawq['ceqq_l4'] = data_rawq.groupby(['permno'])['ceqq'].shift(4) +data_rawq['egr'] = (data_rawq['ceqq']-data_rawq['ceqq_l4'])/data_rawq['ceqq_l4'] + +# chpm +data_rawq['ibq4_l1'] = data_rawq.groupby(['permno'])['ibq4'].shift(1) +data_rawq['saleq4_l1'] = data_rawq.groupby(['permno'])['saleq4'].shift(1) + +data_rawq['chpm'] = (data_rawq['ibq4']/data_rawq['saleq4'])-(data_rawq['ibq4_l1']/data_rawq['saleq4_l1']) + +# chato +data_rawq['atq_l8'] = data_rawq.groupby(['permno'])['atq'].shift(8) +data_rawq['chato'] = (data_rawq['saleq4']/((data_rawq['atq']+data_rawq['atq_l4'])/2))-(data_rawq['saleq4_l4']/((data_rawq['atq_l4']+data_rawq['atq_l8'])/2)) + +# noa +data_rawq['ivaoq'] = np.where(data_rawq['ivaoq'].isnull(), 0, 1) +data_rawq['dlcq'] = np.where(data_rawq['dlcq'].isnull(), 0, 1) +data_rawq['dlttq'] = np.where(data_rawq['dlttq'].isnull(), 0, 1) +data_rawq['mibq'] = np.where(data_rawq['mibq'].isnull(), 0, 1) +data_rawq['pstkq'] = np.where(data_rawq['pstkq'].isnull(), 0, 1) +data_rawq['noa'] = (data_rawq['atq']-data_rawq['cheq']-data_rawq['ivaoq'])-\ + (data_rawq['atq']-data_rawq['dlcq']-data_rawq['dlttq']-data_rawq['mibq']-data_rawq['pstkq']-data_rawq['ceqq'])/data_rawq['atq_l4'] + +# rna +data_rawq['noa_l4'] = data_rawq.groupby(['permno'])['noa'].shift(4) 
+data_rawq['rna'] = data_rawq['oiadpq']/data_rawq['noa_l4'] + +# pm +data_rawq['pm'] = data_rawq['oiadpq']/data_rawq['saleq'] + +# ato +data_rawq['ato'] = data_rawq['saleq']/data_rawq['noa_l4'] + +# roe +data_rawq['ceqq_l1'] = data_rawq.groupby(['permno'])['ceqq'].shift(1) +data_rawq['roe'] = data_rawq['ibq']/data_rawq['ceqq_l1'] + +################################## New Added ################################## + +# grltnoa +data_rawq['rectq_l4'] = data_rawq.groupby(['permno'])['rectq'].shift(4) +data_rawq['acoq_l4'] = data_rawq.groupby(['permno'])['acoq'].shift(4) +data_rawq['apq_l4'] = data_rawq.groupby(['permno'])['apq'].shift(4) +data_rawq['lcoq_l4'] = data_rawq.groupby(['permno'])['lcoq'].shift(4) +data_rawq['loq_l4'] = data_rawq.groupby(['permno'])['loq'].shift(4) +data_rawq['invtq_l4'] = data_rawq.groupby(['permno'])['invtq'].shift(4) +data_rawq['ppentq_l4'] = data_rawq.groupby(['permno'])['ppentq'].shift(4) +data_rawq['atq_l4'] = data_rawq.groupby(['permno'])['atq'].shift(4) + +data_rawq['grltnoa'] = ((data_rawq['rectq']+data_rawq['invtq']+data_rawq['ppentq']+data_rawq['acoq']+data_rawq['intanq']+ + data_rawq['aoq']-data_rawq['apq']-data_rawq['lcoq']-data_rawq['loq'])- + (data_rawq['rectq_l4']+data_rawq['invtq_l4']+data_rawq['ppentq_l4']+data_rawq['acoq_l4']-data_rawq['apq_l4']-data_rawq['lcoq_l4']-data_rawq['loq_l4'])-\ + (data_rawq['rectq']-data_rawq['rectq_l4']+data_rawq['invtq']-data_rawq['invtq_l4']+data_rawq['acoq']- + (data_rawq['apq']-data_rawq['apq_l4']+data_rawq['lcoq']-data_rawq['lcoq_l4'])- + ttm4('dpq', data_rawq)))/((data_rawq['atq']+data_rawq['atq_l4'])/2) + +# scal +# condlist = [data_rawq['seqq'].isnull(), +# data_rawq['seqq'].isnull() & (data_rawq['ceqq'].isnull() | data_rawq['pstk'].isnull())] +# choicelist = [data_rawq['ceqq']+data_rawq['pstk'], +# data_rawq['atq']-data_rawq['ltq']] +# data_rawq['scal'] = np.select(condlist, choicelist, default=data_rawq['seqq']) + +# ala +data_rawq['ala'] = data_rawq['cheq'] + 
0.75*(data_rawq['actq']-data_rawq['cheq'])+\ + 0.5*(data_rawq['atq']-data_rawq['actq']-data_rawq['gdwlq']-data_rawq['intanq']) + +# alm +# data_rawq['alm'] = data_rawq['ala']/(data_rawq['atq']+data_rawq['me']-data_rawq['ceqq']) + +# rsup +data_rawq['saleq_l4'] = data_rawq.groupby(['permno'])['saleq'].shift(4) +# data_rawq['rsup'] = (data_rawq['saleq'] - data_rawq['saleq_l4'])/data_rawq['me'] + +# stdsacc +data_rawq['actq_l1'] = data_rawq.groupby(['permno'])['actq'].shift(1) +data_rawq['cheq_l1'] = data_rawq.groupby(['permno'])['cheq'].shift(1) +data_rawq['lctq_l1'] = data_rawq.groupby(['permno'])['lctq'].shift(1) +data_rawq['dlcq_l1'] = data_rawq.groupby(['permno'])['dlcq'].shift(1) + +data_rawq['sacc'] = ((data_rawq['actq']-data_rawq['actq_l1'] - (data_rawq['cheq']-data_rawq['cheq_l1'])) + -((data_rawq['lctq']-data_rawq['lctq_l1'])-(data_rawq['dlcq']-data_rawq['dlcq_l1'])))/data_rawq['saleq'] +data_rawq['sacc'] = np.where(data_rawq['saleq']<=0, ((data_rawq['actq']-data_rawq['actq_l1'] - (data_rawq['cheq']-data_rawq['cheq_l1'])) + -((data_rawq['lctq']-data_rawq['lctq_l1'])-(data_rawq['dlcq']-data_rawq['dlcq_l1'])))/0.01, data_rawq['sacc']) + + +def chars_std(start, end, df, chars): + """ + + :param start: Order of starting lag + :param end: Order of ending lag + :param df: Dataframe + :param chars: lag chars + :return: std of factor + """ + lag = pd.DataFrame() + lag_list = [] + for i in range(start, end): + lag['chars_l%s' % i] = df.groupby(['permno'])['%s' % chars].shift(i) + lag_list.append('chars_l%s' % i) + result = lag[lag_list].std(axis=1) + return result + +data_rawq['stdacc'] = chars_std(0, 16, data_rawq, 'sacc') + +# sgrvol +# data_rawq['sgrvol'] = chars_std(0, 15, data_rawq, 'rsup') + +# roavol +data_rawq['roavol'] = chars_std(0, 16, data_rawq, 'roa') + +# stdcf +data_rawq['scf'] = (data_rawq['ibq']/data_rawq['saleq']) - data_rawq['sacc'] +data_rawq['scf'] = np.where(data_rawq['saleq']<=0, (data_rawq['ibq']/0.01) - data_rawq['sacc'], data_rawq['sacc']) + 
+data_rawq['stdcf'] = chars_std(0, 16, data_rawq, 'scf') + +# cinvest +data_rawq['ppentq_l1'] = data_rawq.groupby(['permno'])['ppentq'].shift(1) +data_rawq['ppentq_l2'] = data_rawq.groupby(['permno'])['ppentq'].shift(2) +data_rawq['ppentq_l3'] = data_rawq.groupby(['permno'])['ppentq'].shift(3) +data_rawq['ppentq_l4'] = data_rawq.groupby(['permno'])['ppentq'].shift(4) +data_rawq['saleq_l1'] = data_rawq.groupby(['permno'])['saleq'].shift(1) +data_rawq['saleq_l2'] = data_rawq.groupby(['permno'])['saleq'].shift(2) +data_rawq['saleq_l3'] = data_rawq.groupby(['permno'])['saleq'].shift(3) + +data_rawq['c_temp1'] = (data_rawq['ppentq_l1'] - data_rawq['ppentq_l2']) / data_rawq['saleq_l1'] +data_rawq['c_temp2'] = (data_rawq['ppentq_l2'] - data_rawq['ppentq_l3']) / data_rawq['saleq_l2'] +data_rawq['c_temp3'] = (data_rawq['ppentq_l3'] - data_rawq['ppentq_l4']) / data_rawq['saleq_l3'] + +data_rawq['cinvest'] = ((data_rawq['ppentq'] - data_rawq['ppentq_l1']) / data_rawq['saleq'])\ + -(data_rawq[['c_temp1', 'c_temp2', 'c_temp3']].mean(axis=1)) + +data_rawq['c_temp1'] = (data_rawq['ppentq_l1'] - data_rawq['ppentq_l2']) / 0.01 +data_rawq['c_temp2'] = (data_rawq['ppentq_l2'] - data_rawq['ppentq_l3']) / 0.01 +data_rawq['c_temp3'] = (data_rawq['ppentq_l3'] - data_rawq['ppentq_l4']) / 0.01 + +data_rawq['cinvest'] = np.where(data_rawq['saleq']<=0, ((data_rawq['ppentq'] - data_rawq['ppentq_l1']) / 0.01) + -(data_rawq[['c_temp1', 'c_temp2', 'c_temp3']].mean(axis=1)), data_rawq['cinvest']) + +data_rawq = data_rawq.drop(['c_temp1', 'c_temp2', 'c_temp3'], axis=1) + +# nincr +data_rawq['ibq_l1'] = data_rawq.groupby(['permno'])['ibq'].shift(1) +data_rawq['ibq_l2'] = data_rawq.groupby(['permno'])['ibq'].shift(2) +data_rawq['ibq_l3'] = data_rawq.groupby(['permno'])['ibq'].shift(3) +data_rawq['ibq_l4'] = data_rawq.groupby(['permno'])['ibq'].shift(4) +data_rawq['ibq_l5'] = data_rawq.groupby(['permno'])['ibq'].shift(5) +data_rawq['ibq_l6'] = data_rawq.groupby(['permno'])['ibq'].shift(6) 
+data_rawq['ibq_l7'] = data_rawq.groupby(['permno'])['ibq'].shift(7) +data_rawq['ibq_l8'] = data_rawq.groupby(['permno'])['ibq'].shift(8) + +data_rawq['nincr_temp1'] = np.where(data_rawq['ibq'] > data_rawq['ibq_l1'], 1, 0) +data_rawq['nincr_temp2'] = np.where(data_rawq['ibq_l1'] > data_rawq['ibq_l2'], 1, 0) +data_rawq['nincr_temp3'] = np.where(data_rawq['ibq_l2'] > data_rawq['ibq_l3'], 1, 0) +data_rawq['nincr_temp4'] = np.where(data_rawq['ibq_l3'] > data_rawq['ibq_l4'], 1, 0) +data_rawq['nincr_temp5'] = np.where(data_rawq['ibq_l4'] > data_rawq['ibq_l5'], 1, 0) +data_rawq['nincr_temp6'] = np.where(data_rawq['ibq_l5'] > data_rawq['ibq_l6'], 1, 0) +data_rawq['nincr_temp7'] = np.where(data_rawq['ibq_l6'] > data_rawq['ibq_l7'], 1, 0) +data_rawq['nincr_temp8'] = np.where(data_rawq['ibq_l7'] > data_rawq['ibq_l8'], 1, 0) + +data_rawq['nincr'] = (data_rawq['nincr_temp1'] + + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']) + + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']) + + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']) + + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']*data_rawq['nincr_temp5']) + + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']*data_rawq['nincr_temp5']*data_rawq['nincr_temp6']) + + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']*data_rawq['nincr_temp5']*data_rawq['nincr_temp6']*data_rawq['nincr_temp7']) + + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']*data_rawq['nincr_temp5']*data_rawq['nincr_temp6']*data_rawq['nincr_temp7']*data_rawq['nincr_temp8'])) + +data_rawq = data_rawq.drop(['ibq_l1', 'ibq_l2', 'ibq_l3', 'ibq_l4', 'ibq_l5', 'ibq_l6', 'ibq_l7', 'ibq_l8', 'nincr_temp1', + 'nincr_temp2', 'nincr_temp3', 'nincr_temp4', 'nincr_temp5', 'nincr_temp6', 
'nincr_temp7',
+                            'nincr_temp8'], axis=1)
+
+# performance score
+data_rawq['niq4'] = ttm4(series='niq', df=data_rawq)
+data_rawq['niq4_l4'] = data_rawq.groupby(['permno'])['niq4'].shift(4)
+data_rawq['dlttq_l4'] = data_rawq.groupby(['permno'])['dlttq'].shift(4)
+data_rawq['p_temp1'] = np.where(data_rawq['niq4']>0, 1, 0)
+data_rawq['p_temp2'] = np.where(data_rawq['oancfy']>0, 1, 0)
+data_rawq['p_temp3'] = np.where(data_rawq['niq4']/data_rawq['atq']>data_rawq['niq4_l4']/data_rawq['atq_l4'], 1, 0)
+data_rawq['p_temp4'] = np.where(data_rawq['oancfy']>data_rawq['niq4'], 1, 0)
+data_rawq['p_temp5'] = np.where(data_rawq['dlttq']/data_rawq['atq'] < data_rawq['dlttq_l4']/data_rawq['atq_l4'], 1, 0)
+data_rawq['p_temp6'] = np.where(data_rawq['actq']/data_rawq['lctq'] > data_rawq['actq_l4']/data_rawq['lctq_l4'], 1, 0)
+data_rawq['cogsq4_l4'] = data_rawq.groupby(['permno'])['cogsq4'].shift(4)
+data_rawq['p_temp7'] = np.where((data_rawq['saleq4']-data_rawq['cogsq4']/data_rawq['saleq4'])>(data_rawq['saleq4_l4']-data_rawq['cogsq4_l4']/data_rawq['saleq4_l4']), 1, 0)
+data_rawq['p_temp8'] = np.where(data_rawq['saleq4']/data_rawq['atq']>data_rawq['saleq4_l4']/data_rawq['atq_l4'], 1, 0)
+data_rawq['p_temp9'] = np.where(data_rawq['scstkcy']==0, 1, 0)
+
+data_rawq['pscore'] = data_rawq['p_temp1']+data_rawq['p_temp2']+data_rawq['p_temp3']+data_rawq['p_temp4']\
+                      +data_rawq['p_temp5']+data_rawq['p_temp6']+data_rawq['p_temp7']+data_rawq['p_temp8']\
+                      +data_rawq['p_temp9']
+
+data_rawq = data_rawq.drop(['p_temp1', 'p_temp2', 'p_temp3', 'p_temp4', 'p_temp5', 'p_temp6', 'p_temp7', 'p_temp8',
+                            'p_temp9'], axis=1)
+
+################################## Added on 2020.10.29 ##################################
+#Iaq
+data_rawq['atqlag'] = ttm4('atq',data_rawq)
+data_rawq['iaq'] = (data_rawq['atq']/data_rawq['atqlag'])-1
+
+#Almq
+data_rawq['intanq'] = np.where(data_rawq['intanq'].isnull(), 0, data_rawq['intanq'])
+data_rawq['qal'] = data_rawq['cheq'] + 0.75*(data_rawq['actq']-data_rawq['cheq']) + 0.5*(data_rawq['atq'] - data_rawq['actq'] - data_rawq['intanq'])
+data_rawq['mveqa'] = data_rawq['atq'] + 
data_rawq['mveq_f'] - data_rawq['ceqq'] +data_rawq['mveqa_1'] = data_rawq.groupby(['permno'])['mveqa'].shift(1) +data_rawq['almq'] = data_rawq['qal']/data_rawq['mveqa_1'] + +#Olq, needs atq +data_rawq['olq'] = (data_rawq['cogsq'] + data_rawq['xsgaq'])/data_rawq['atq'] + +# rds +data_rawq['rds'] = data_rawq['xrdq4']/data_rawq['saleq'] + +print('quarterly variables') +####################################################################################################################### +# Momentum # +####################################################################################################################### +crsp_mom = conn.raw_sql(""" + select permno, date, ret, retx, prc, shrout, vol + from crsp.msf + where date >= '01/01/1959' + """) + +crsp_mom['permno'] = crsp_mom['permno'].astype(int) +crsp_mom['date'] = pd.to_datetime(crsp_mom['date']) +crsp_mom['jdate'] = pd.to_datetime(crsp_mom['date']) + MonthEnd(0) +crsp_mom = crsp_mom.dropna(subset=['ret', 'retx', 'prc']) + +# add delisting return +dlret = conn.raw_sql(""" + select permno, dlret, dlstdt + from crsp.msedelist + """) + +dlret.permno = dlret.permno.astype(int) +dlret['dlstdt'] = pd.to_datetime(dlret['dlstdt']) +dlret['jdate'] = dlret['dlstdt'] + MonthEnd(0) + +# merge delisting return to crsp return +crsp_mom = pd.merge(crsp_mom, dlret, how='left', on=['permno', 'jdate']) +crsp_mom['dlret'] = crsp_mom['dlret'].fillna(0) +crsp_mom['ret'] = crsp_mom['ret'].fillna(0) +crsp_mom['retadj'] = (1 + crsp_mom['ret']) * (1 + crsp_mom['dlret']) - 1 +crsp_mom['me'] = crsp_mom['prc'].abs() * crsp_mom['shrout'] # calculate market equity +crsp_mom['retx'] = np.where(crsp_mom['me'].isnull(), 0, crsp_mom['retx']) +crsp_mom = crsp_mom.drop(['dlret', 'dlstdt'], axis=1)#delete prc,shrout + +#Seasonality + +#Rla +crsp_mom['rla'] = crsp_mom.groupby(['permno'])['ret'].shift(12) + +#Rln +lag = pd.DataFrame() +result = 0 +for i in range(1, 12): + lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) + result = 
result + lag['mom%s' % i] +crsp_mom['rln'] = result/11 + +#R[2,5]a +#R[2,5]n +lag = pd.DataFrame() +result = 0 +for i in range(13,61): + lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) + if i not in [24,36,48,60]: + result = result + lag['mom%s' % i] + +crsp_mom['r25a'] = (lag['mom24']+lag['mom36']+lag['mom48']+lag['mom60'])/4 +crsp_mom['r25n'] = result/44 + +#R[6,10]a +#R[6,10]n +lag = pd.DataFrame() +result = 0 +for i in range(61,121): + lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) + if i not in [72,84,96,108,120]: + result = result + lag['mom%s' % i] + +crsp_mom['r610a'] = (lag['mom72']+lag['mom84']+lag['mom96']+lag['mom108']+lag['mom120'])/5 +crsp_mom['r610n'] = result/55 + +#R[11,15]a +lag = pd.DataFrame() +result = 0 +for i in [132,144,156,168,180]: + lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) + result = result + lag['mom%s' % i] +crsp_mom['r1115a'] = result/5 + +#R[16,20]a +lag = pd.DataFrame() +result = 0 +for i in [192,204,216,228,240]: + lag['mom%s' % i] = crsp_mom.groupby(['permno'])['ret'].shift(i) + result = result + lag['mom%s' % i] +crsp_mom['r1620a'] = result/5 + + +def mom(start, end, df): + """ + :param start: Order of starting lag + :param end: Order of ending lag + :param df: Dataframe + :return: Momentum factor + """ + lag = pd.DataFrame() + result = 1 + for i in range(start, end): + lag['mom%s' % i] = df.groupby(['permno'])['ret'].shift(i) + result = result * (1+lag['mom%s' % i]) + result = result - 1 + return result + + +crsp_mom['mom60m'] = mom(12, 60, crsp_mom) +crsp_mom['mom12m'] = mom(1, 12, crsp_mom) +crsp_mom['mom1m'] = crsp_mom['ret'] +crsp_mom['mom6m'] = mom(1, 6, crsp_mom) +crsp_mom['mom36m'] = mom(1, 36, crsp_mom) +crsp_mom['seas1a'] = crsp_mom.groupby(['permno'])['ret'].shift(11) + +crsp_mom['vol_l1'] = crsp_mom.groupby(['permno'])['vol'].shift(1) +crsp_mom['vol_l2'] = crsp_mom.groupby(['permno'])['vol'].shift(2) +crsp_mom['vol_l3'] = 
crsp_mom.groupby(['permno'])['vol'].shift(3) +crsp_mom['prc_l2'] = crsp_mom.groupby(['permno'])['prc'].shift(2) +crsp_mom['dolvol'] = np.log(crsp_mom['vol_l2']*crsp_mom['prc_l2']).replace([np.inf, -np.inf], np.nan) +crsp_mom['turn'] = ((crsp_mom['vol_l1']+crsp_mom['vol_l2']+crsp_mom['vol_l3'])/3)/crsp_mom['shrout'] + +# dy +crsp_mom['me_l1'] = crsp_mom.groupby(['permno'])['me'].shift(1) +crsp_mom['retdy'] = crsp_mom['ret'] - crsp_mom['retx'] +crsp_mom['mdivpay'] = crsp_mom['retdy']*crsp_mom['me_l1'] + +crsp_mom['dy'] = ttm12(series='mdivpay', df=crsp_mom)/crsp_mom['me'] + +# def moms(start, end, df): +# """ +# +# :param start: Order of starting lag +# :param end: Order of ending lag +# :param df: Dataframe +# :return: Momentum factor +# """ +# lag = pd.DataFrame() +# result = 1 +# for i in range(start, end): +# lag['moms%s' % i] = df.groupby['permno']['ret'].shift(i) +# result = result + lag['moms%s' % i] +# result = result/11 +# return result +# +# +# crsp_mom['moms12m'] = moms(1, 12, crsp_mom) + +# populate the chars to monthly +print('momentum') +# data_rawa +data_rawa = data_rawa.drop(['date', 'ret', 'retx', 'me'], axis=1) +data_rawa = pd.merge(crsp_mom, data_rawa, how='left', on=['permno', 'jdate']) +data_rawa['datadate'] = data_rawa.groupby(['permno'])['datadate'].fillna(method='ffill') +data_rawa = data_rawa.groupby(['permno', 'datadate'], as_index=False).fillna(method='ffill') +data_rawa = data_rawa[((data_rawa['exchcd'] == 1) | (data_rawa['exchcd'] == 2) | (data_rawa['exchcd'] == 3)) & + ((data_rawa['shrcd'] == 10) | (data_rawa['shrcd'] == 11))] +print('data_rawa') +# data_rawq +data_rawq = data_rawq.drop(['date', 'ret', 'retx', 'me'], axis=1) +data_rawq = pd.merge(crsp_mom, data_rawq, how='left', on=['permno', 'jdate']) +data_rawq['datadate'] = data_rawq.groupby(['permno'])['datadate'].fillna(method='ffill') +data_rawq = data_rawq.groupby(['permno', 'datadate'], as_index=False).fillna(method='ffill') +data_rawq = data_rawq[((data_rawq['exchcd'] == 1) | 
(data_rawq['exchcd'] == 2) | (data_rawq['exchcd'] == 3)) & + ((data_rawq['shrcd'] == 10) | (data_rawq['shrcd'] == 11))] +print('data_rawq') +####################################################################################################################### +# Monthly ME # +####################################################################################################################### + +######################################## +# Annual # +######################################## + +# bm +data_rawa['bm'] = data_rawa['be'] / data_rawa['me'] +#data_rawa['bm_n'] = data_rawa['be'] + +# bm_ia +df_temp = data_rawa.groupby(['datadate', 'ffi49'], as_index=False)['bm'].mean() +df_temp = df_temp.rename(columns={'bm': 'bm_ind'}) +data_rawa = pd.merge(data_rawa, df_temp, how='left', on=['datadate', 'ffi49']) +data_rawa['bm_ia'] = data_rawa['bm']/data_rawa['bm_ind'] + +# me_ia +df_temp = data_rawa.groupby(['datadate', 'ffi49'], as_index=False)['me'].mean() +df_temp = df_temp.rename(columns={'me': 'me_ind'}) +data_rawa = pd.merge(data_rawa, df_temp, how='left', on=['datadate', 'ffi49']) +data_rawa['me_ia'] = data_rawa['me']/data_rawa['me_ind'] + +# cfp +condlist = [data_rawa['dp'].isnull(), + data_rawa['ib'].isnull()] +choicelist = [data_rawa['ib']/data_rawa['me'], + np.nan] +data_rawa['cfp'] = np.select(condlist, choicelist, default=(data_rawa['ib']+data_rawa['dp'])/data_rawa['me']) + +# ep, checked from Hou and change 'ME' from compustat to crsp,checked +data_rawa['ep'] = data_rawa['ib']/data_rawa['me'] +#data_rawa['ep_n'] = data_rawa['ib'] + +# rsup +# data_rawa['sale_l1'] = data_rawa.groupby(['permno'])['sale'].shift(1) +data_rawa['rsup'] = (data_rawa['sale']-data_rawa['sale_l1'])/data_rawa['me'] + +# lev +data_rawa['lev'] = data_rawa['lt']/data_rawa['me'] + +# sp, checked +data_rawa['sp'] = data_rawa['sale']/data_rawa['me'] +#data_rawa['sp_n'] = data_rawa['sale'] + +# rdm +data_rawa['rdm'] = data_rawa['xrd']/data_rawa['me'] + +# adm hxz adm,checked 
+data_rawa['adm'] = data_rawa['xad']/data_rawa['me'] + +# dy +data_rawa['dy'] = data_rawa['dvt']/data_rawa['me'] + +# Cp +#data_rawa['cf'] = data_rawa['ib'] + data_rawa['dp'] +data_rawa['cp'] = data_rawa['cf'] / data_rawa['me'] + +# Ebp +#data_rawa['dvpa'] = np.where(data_rawa['dvpa'].isnull(), 0, data_rawa['dvpa']) +#data_rawa['tstkp'] = np.where(data_rawa['tstkp'].isnull(), 0, data_rawa['tstkp']) +#data_rawa['f_liab'] = data_rawa['dltt'] + data_rawa['dlc'] + data_rawa['pstk'] + data_rawa['dvpa'] - data_rawa['tstkp'] +#data_rawa['f_asse'] = data_rawa['che'] +# net debt : = financial liabilities - financial assets. +#data_rawa['n_debt'] = data_rawa['f_liab'] - data_rawa['f_asse'] +#data_rawa['ber'] = data_rawa['ceq'] + data_rawa['tstkp'] - data_rawa['dvpa'] +data_rawa['ebp'] = (data_rawa['n_debt']+data_rawa['ber']) / (data_rawa['n_debt']+data_rawa['me']) + +# Em +data_rawa['enteprs_v'] = data_rawa['me'] + data_rawa['dlc'] + data_rawa['dltt'] + data_rawa['pstkrv'] - data_rawa['che'] +data_rawa['em'] = data_rawa['enteprs_v'] / data_rawa['oibdp'] + +# Cei +data_rawa['lg_me'] = np.log(data_rawa['me']/data_rawa['me'].shift(6)) +data_rawa['lg_ret'] = np.log(data_rawa['ret']*data_rawa['ret'].shift(1)*data_rawa['ret'].shift(2)*data_rawa['ret'].shift(3)*data_rawa['ret'].shift(5)*data_rawa['ret'].shift(6)) +data_rawa['cei'] = data_rawa['lg_me'] - data_rawa['lg_ret'] + +#nop +data_rawa['net_p'] = data_rawa['dvc'] + data_rawa['prstkc'] + 2*data_rawa['pstkrv'] - data_rawa['sstk'] +data_rawa['nop'] = data_rawa['net_p'] / data_rawa['me'] +data_rawa['nop'] = np.where(data_rawa['nop']<=0, np.nan, data_rawa['nop'] ) + +#ocp +data_rawa['ocy'] = np.where(data_rawa['jdate'] < '1988-06-30', data_rawa['fopt'] - data_rawa['wcap'], data_rawa['fopt'] - data_rawa['oancf']) +data_rawa['ocp'] = data_rawa['ocy'] / data_rawa['me'] +data_rawa['ocp'] = np.where(data_rawa['ocp']<=0, np.nan, data_rawa['ocp'] ) + +#bm_t-5 (bm of year t-5) +data_rawa['bm5'] = 
data_rawa.groupby(['permno'])['bm'].shift(5) + +#rB (five year log book return) +#Reference: jf_06 page8 by KENT DANIEL +data_rawa['rB'] = data_rawa['bm'] - data_rawa['bm5'] + data_rawa['ret5'] + +#Regression and get ir +#First get unique datelist +datelist = data_rawa['jdate'].unique() +for date in datelist: + temp = data_rawa[data_rawa['jdate'] == date] + n_row = temp.shape[0] + index = temp.index + X = pd.DataFrame() + X['bm5'] = temp['bm5'] + X['rB'] = temp['rB'] + X['intercept'] = 1 + X = X[['intercept','rB','bm5']] + X = np.mat(X) + Y = np.mat(temp[['ret5']]) + #These are residuals on one date + res = (np.identity(n_row) - X.dot(X.T.dot(X).I).dot(X.T)).dot(Y) + #put residuals back into data_rawa + data_rawa.loc[index,'ir'] = res + +# Annual Accounting Variables +chars_a = data_rawa[['cusip', 'ncusip', 'gvkey', 'permno', 'exchcd', 'shrcd', 'datadate', 'jdate', + 'sic', 'retadj', 'acc', 'agr', 'bm', 'cfp', 'ep', 'ni', 'op', 'rsup', 'cash', 'chcsho', + 'rd', 'cashdebt', 'pctacc', 'gma', 'lev', 'rdm', 'adm', 'sgr', 'sp', 'invest', 'roe', + 'rd_sale', 'lgr', 'roa', 'depr', 'egr', 'chato', 'chtx', 'noa', 'rna', 'pm', 'ato', 'dy', + 'roic', 'chinv', 'pchsale_pchinvt', 'pchsale_pchrect', 'pchgm_pchsale', 'pchsale_pchxsga', + 'pchdepr', 'chadv', 'pchcapx', 'grcapx', 'grGW', 'currat', 'pchcurrat', 'quick', 'pchquick', + 'salecash', 'salerec', 'saleinv', 'pchsaleinv', 'realestate', 'obklg', 'chobklg', 'grltnoa', + 'conv', 'chdrc', 'rdbias', 'operprof', 'capxint', 'xadint', 'chpm', 'ala', 'alm', + 'mom1m', 'mom6m', 'mom12m', 'mom60m', 'mom36m', 'seas1a', 'me', 'hire', 'herf', 'bm_ia', + 'me_ia', 'bmj','cp', 'ebp', 'em', 'dp', 'aci', 'dpia', 'dBe', 'dfnl', 'dfin', 'dcoa', + 'dlno', 'dnoa', 'cla', 'cop', 'cto', 'dIi', 'dnco', 'dnca', 'ir', 'nop', 'ocp', + 'ia', 'ig','2ig','ivc','ndf','nsi','oa','poa','ta','ol','etr']] + +chars_a.reset_index(drop=True, inplace=True) +print(chars_a) +print('ME annual') +######################################## +# Quarterly # 
+######################################## +# bm +data_rawq['bm'] = data_rawq['beq']/data_rawq['me'] + +# cfp +data_rawq['cfp'] = np.where(data_rawq['dpq'].isnull(), + data_rawq['ibq4']/data_rawq['me'], + (data_rawq['ibq4']+data_rawq['dpq4'])/data_rawq['me']) + +# ep +data_rawq['ep'] = data_rawq['ibq4']/data_rawq['me'] + +# lev +data_rawq['lev'] = data_rawq['ltq']/data_rawq['me'] + +# rdm +data_rawq['rdm'] = data_rawq['xrdq4']/data_rawq['me'] + +# sp +data_rawq['sp'] = data_rawq['saleq4']/data_rawq['me'] + +# alm +data_rawq['alm'] = data_rawq['ala']/(data_rawq['atq']+data_rawq['me']-data_rawq['ceqq']) + +# rsup +# data_rawq['saleq_l4'] = data_rawq.groupby(['permno'])['saleq'].shift(4) +data_rawq['rsup'] = (data_rawq['saleq'] - data_rawq['saleq_l4'])/data_rawq['me'] + +# sgrvol +data_rawq['sgrvol'] = chars_std(0, 15, data_rawq, 'rsup') + +# Quarterly Accounting Variables +chars_q = data_rawq[['gvkey', 'permno', 'datadate', 'jdate', 'sic', 'exchcd', 'shrcd','retadj' ,'acc', 'bm', 'cfp', + 'ep', 'agr', 'ni', 'op', 'cash', 'chcsho', 'rd', 'cashdebt', 'pctacc', 'gma', 'lev', + 'rdm', 'sgr', 'sp', 'invest', 'rd_sale', 'lgr', 'roa', 'depr', 'egr', 'roe', + 'chato', 'chpm', 'chtx', 'noa', 'rna', 'pm', 'ato', 'stdcf', + 'grltnoa', 'ala', 'alm', 'rsup', 'stdacc', 'sgrvol', 'roavol', 'scf', 'cinvest', + 'mom1m', 'mom6m', 'mom12m', 'mom60m', 'mom36m', 'seas1a', 'me', 'pscore', 'nincr', + 'turn', 'dolvol', 'iaq', 'almq', 'olq', 'rds']] + +chars_q.reset_index(drop=True, inplace=True) +print(chars_q) +print('ME quarterly') +with open('chars_a_60.pkl', 'wb') as f: + pkl.dump(chars_a, f) +print('pkl a') +with open('chars_q_60.pkl', 'wb') as f: + pkl.dump(chars_q, f) +print('pkl q') +print('Finished') \ No newline at end of file diff --git a/char60/accounting_60.py b/char60/accounting_60.py new file mode 100755 index 0000000..d32b43c --- /dev/null +++ b/char60/accounting_60.py @@ -0,0 +1,1215 @@ +import pandas as pd +import numpy as np +import wrds +from pandas.tseries.offsets import 
import pickle as pkl
from functions import *

###################
# Connect to WRDS #
###################
conn = wrds.Connection()

#######################################################################################################################
# TTM functions                                                                                                       #
#######################################################################################################################


def ttm4(series, df):
    """
    Trailing four-quarter sum of a quarterly variable.

    :param series: variable's column name
    :param df: dataframe containing 'permno' and the variable
    :return: current value plus its three previous lags, computed per permno
    """
    lag = pd.DataFrame()
    for i in range(1, 4):
        lag['%(series)s%(lag)s' % {'series': series, 'lag': i}] = df.groupby('permno')['%s' % series].shift(i)
    result = df['%s' % series] + lag['%s1' % series] + lag['%s2' % series] + lag['%s3' % series]
    return result


def ttm12(series, df):
    """
    Trailing twelve-month sum of a monthly variable.

    :param series: variable's column name
    :param df: dataframe containing 'permno' and the variable
    :return: current value plus its eleven previous lags, computed per permno
    """
    lag = pd.DataFrame()
    for i in range(1, 12):
        lag['%(series)s%(lag)s' % {'series': series, 'lag': i}] = df.groupby('permno')['%s' % series].shift(i)
    result = df['%s' % series] + lag['%s1' % series] + lag['%s2' % series] + lag['%s3' % series] +\
        lag['%s4' % series] + lag['%s5' % series] + lag['%s6' % series] + lag['%s7' % series] +\
        lag['%s8' % series] + lag['%s9' % series] + lag['%s10' % series] + lag['%s11' % series]
    return result


#######################################################################################################################
# Compustat Block                                                                                                     #
#######################################################################################################################
comp = conn.raw_sql("""
    /*header info*/
    select c.gvkey, f.cusip, f.datadate, f.fyear, c.cik, substr(c.sic,1,2) as sic2, c.sic, c.naics,

    /*firm variables*/
    /*income statement*/
    f.sale, f.revt, f.cogs, f.xsga, f.dp, f.xrd, f.xad, f.ib, f.ebitda,
    f.ebit, f.nopi, f.spi, f.pi, f.txp, f.ni, f.txfed, f.txfo, f.txt, f.xint,

    /*CF statement and others*/
    f.capx, f.oancf, f.dvt, f.ob, f.gdwlia, f.gdwlip, f.gwo, f.mib, f.oiadp, f.ivao,

    /*assets*/
    f.rect, f.act, f.che, f.ppegt, f.invt, f.at, f.aco, f.intan, f.ao, f.ppent, f.gdwl, f.fatb, f.fatl,

    /*liabilities*/
    f.lct, f.dlc, f.dltt, f.lt, f.dm, f.dcvt, f.cshrc,
    f.dcpstk, f.pstk, f.ap, f.lco, f.lo, f.drc, f.drlt, f.txdi,

    /*equity and other*/
    f.ceq, f.scstkc, f.emp, f.csho, f.seq, f.txditc, f.pstkrv, f.pstkl, f.np, f.txdc, f.dpc, f.ajex,

    /*market*/
    abs(f.prcc_f) as prcc_f

    from comp.funda as f
    left join comp.company as c
    on f.gvkey = c.gvkey

    /*get consolidated, standardized, industrial format statements*/
    where f.indfmt = 'INDL'
    and f.datafmt = 'STD'
    and f.popsrc = 'D'
    and f.consol = 'C'
    and f.datadate >= '01/01/1959'
    """)

# convert datadate to date fmt
comp['datadate'] = pd.to_datetime(comp['datadate'])

# sort and clean up
comp = comp.sort_values(by=['gvkey', 'datadate']).drop_duplicates()

# clean up csho (0 shares outstanding is a data error, treat as missing)
comp['csho'] = np.where(comp['csho'] == 0, np.nan, comp['csho'])

# calculate Compustat market equity
comp['mve_f'] = comp['csho'] * comp['prcc_f']

# do some clean up. several variables have lots of missing values
# deferred revenue: use drc+drlt when both exist, else whichever is present
condlist = [comp['drc'].notna() & comp['drlt'].notna(),
            comp['drc'].notna() & comp['drlt'].isnull(),
            comp['drlt'].notna() & comp['drc'].isnull()]
choicelist = [comp['drc']+comp['drlt'],
              comp['drc'],
              comp['drlt']]
comp['dr'] = np.select(condlist, choicelist, default=np.nan)

# convertible debt
# FIX: '&' binds tighter than '>', so the original first condition evaluated
# (dcvt.isnull() & dcpstk.notna() & pstk.notna() & dcpstk) > pstk;
# the comparison must be parenthesized
condlist = [comp['dcvt'].isnull() & comp['dcpstk'].notna() & comp['pstk'].notna() & (comp['dcpstk'] > comp['pstk']),
            comp['dcvt'].isnull() & comp['dcpstk'].notna() & comp['pstk'].isnull()]
choicelist = [comp['dcpstk']-comp['pstk'],
              comp['dcpstk']]
comp['dc'] = np.select(condlist, choicelist, default=np.nan)
comp['dc'] = np.where(comp['dc'].isnull(), comp['dcvt'], comp['dc'])

comp['xint0'] = np.where(comp['xint'].isnull(), 0, comp['xint'])
# FIX: original was np.where(comp['xsga'].isnull, 0, 0) -- the method was not
# called (an always-truthy bound method) and both branches were 0, so xsga0
# was identically 0; impute missing xsga with 0 like xint0 above
comp['xsga0'] = np.where(comp['xsga'].isnull(), 0, comp['xsga'])

comp['ceq'] = np.where(comp['ceq'] == 0, np.nan, comp['ceq'])
comp['at'] = np.where(comp['at'] == 0, np.nan, comp['at'])
comp = comp.dropna(subset=['at'])

#######################################################################################################################
# CRSP Block                                                                                                          #
#######################################################################################################################
# Create a CRSP Subsample with Monthly Stock and Event Variables
# Restrictions will be applied later
# Select variables from the CRSP monthly stock and event datasets
crsp = conn.raw_sql("""
    select a.prc, a.ret, a.retx, a.shrout, a.vol, a.cfacpr, a.cfacshr, a.date, a.permno, a.permco,
    b.ticker, b.ncusip, b.shrcd, b.exchcd
    from crsp.msf as a
    left join crsp.msenames as b
    on a.permno=b.permno
    and b.namedt<=a.date
    and a.date<=b.nameendt
    where a.date >= '01/01/1959'
    and b.exchcd between 1 and 3
    """)

# change variable format to int
crsp[['permco', 'permno', 'shrcd', 'exchcd']] = crsp[['permco', 'permno', 'shrcd', 'exchcd']].astype(int)

# Line up date to be end of month
crsp['date'] = pd.to_datetime(crsp['date'])
crsp['monthend'] = crsp['date'] + MonthEnd(0)  # set all the date to the standard end date of month

crsp = crsp.dropna(subset=['prc'])
crsp['me'] = crsp['prc'].abs() * crsp['shrout']  # calculate market equity

# if Market Equity is Nan then let return equals to 0
crsp['ret'] = np.where(crsp['me'].isnull(), 0, crsp['ret'])
crsp['retx'] = np.where(crsp['me'].isnull(), 0, crsp['retx'])

# impute me: forward-fill within the same permno only
crsp = crsp.sort_values(by=['permno', 'date']).drop_duplicates()
crsp['me'] = np.where(crsp['permno'] == crsp['permno'].shift(1), crsp['me'].fillna(method='ffill'), crsp['me'])

# Aggregate Market Cap
'''
There are cases when the same firm (permco) has two or more securities (permno) at same date.
For the purpose of ME for the firm, we aggregated all ME for a given permco, date.
This aggregated ME will be assigned to the permno with the largest ME.
'''
# sum of me across different permno belonging to same permco a given date
crsp_summe = crsp.groupby(['monthend', 'permco'])['me'].sum().reset_index()
# largest mktcap within a permco/date
crsp_maxme = crsp.groupby(['monthend', 'permco'])['me'].max().reset_index()
# join by monthend/maxme to find the permno
crsp1 = pd.merge(crsp, crsp_maxme, how='inner', on=['monthend', 'permco', 'me'])
# drop me column and replace with the sum me
crsp1 = crsp1.drop(['me'], axis=1)
# join with sum of me to get the correct market cap info
crsp2 = pd.merge(crsp1, crsp_summe, how='inner', on=['monthend', 'permco'])
# sort by permno and date and also drop duplicates
crsp2 = crsp2.sort_values(by=['permno', 'monthend']).drop_duplicates()

#######################################################################################################################
# CCM Block                                                                                                           #
#######################################################################################################################
# merge CRSP and Compustat
# reference:
# https://wrds-www.wharton.upenn.edu/pages/support/applications/linking-databases/linking-crsp-and-compustat/
ccm = conn.raw_sql("""
    select gvkey, lpermno as permno, linktype, linkprim,
    linkdt, linkenddt
    from crsp.ccmxpf_linktable
    where substr(linktype,1,1)='L'
    and (linkprim ='C' or linkprim='P')
    """)

ccm['linkdt'] = pd.to_datetime(ccm['linkdt'])
ccm['linkenddt'] = pd.to_datetime(ccm['linkenddt'])

# if linkenddt is missing then set to today date
ccm['linkenddt'] = ccm['linkenddt'].fillna(pd.to_datetime('today'))

# merge ccm and comp
ccm1 = pd.merge(comp, ccm, how='left', on=['gvkey'])

# we can only get the accounting data after the firm public their report
# for annual data, we use 4, 5 or 6 months lagged data
ccm1['yearend'] = ccm1['datadate'] + YearEnd(0)
ccm1['jdate'] = ccm1['datadate'] + MonthEnd(4)

# set link date bounds
ccm2 = ccm1[(ccm1['jdate'] >= ccm1['linkdt']) & (ccm1['jdate'] <= ccm1['linkenddt'])]

# link comp and crsp
crsp2 = crsp2.rename(columns={'monthend': 'jdate'})
data_rawa = pd.merge(crsp2, ccm2, how='inner', on=['permno', 'jdate'])

# filter exchcd & shrcd (NYSE/AMEX/NASDAQ common shares only)
data_rawa = data_rawa[((data_rawa['exchcd'] == 1) | (data_rawa['exchcd'] == 2) | (data_rawa['exchcd'] == 3)) &
                      ((data_rawa['shrcd'] == 10) | (data_rawa['shrcd'] == 11))]

# process Market Equity
'''
Note: me is CRSP market equity, mve_f is Compustat market equity. Please choose the me below.
'''
data_rawa['me'] = data_rawa['me']/1000  # CRSP ME
# data_rawa['me'] = data_rawa['mve_f']  # Compustat ME

# there are some ME equal to zero since this company do not have price or shares data, we drop these observations
data_rawa['me'] = np.where(data_rawa['me'] == 0, np.nan, data_rawa['me'])
data_rawa = data_rawa.dropna(subset=['me'])

# count single stock years
# data_rawa['count'] = data_rawa.groupby(['gvkey']).cumcount()

# deal with the duplicates: keep the first (datadate, permno, linkprim) row,
# then the last row per (permno, yearend, datadate)
data_rawa.loc[data_rawa.groupby(['datadate', 'permno', 'linkprim'], as_index=False).nth([0]).index, 'temp'] = 1
data_rawa = data_rawa[data_rawa['temp'].notna()]
data_rawa.loc[data_rawa.groupby(['permno', 'yearend', 'datadate'], as_index=False).nth([-1]).index, 'temp'] = 1
data_rawa = data_rawa[data_rawa['temp'].notna()]

data_rawa = data_rawa.sort_values(by=['permno', 'jdate'])

#######################################################################################################################
# Annual Variables                                                                                                    #
#######################################################################################################################
# preferred stock: redemption value, else liquidation value, else par, else 0
data_rawa['ps'] = np.where(data_rawa['pstkrv'].isnull(), data_rawa['pstkl'], data_rawa['pstkrv'])
data_rawa['ps'] = np.where(data_rawa['ps'].isnull(), data_rawa['pstk'], data_rawa['ps'])
data_rawa['ps'] = np.where(data_rawa['ps'].isnull(), 0, data_rawa['ps'])

data_rawa['txditc'] = data_rawa['txditc'].fillna(0)

# book equity (positive values only)
data_rawa['be'] = data_rawa['seq'] + data_rawa['txditc'] - data_rawa['ps']
data_rawa['be'] = np.where(data_rawa['be'] > 0, data_rawa['be'], np.nan)

# acc
data_rawa['act_l1'] = data_rawa.groupby(['permno'])['act'].shift(1)
data_rawa['lct_l1'] = data_rawa.groupby(['permno'])['lct'].shift(1)
# FIX: lag np within permno; the original used an ungrouped shift in the
# default branch, leaking the previous firm's value at firm boundaries
data_rawa['np_l1'] = data_rawa.groupby(['permno'])['np'].shift(1)

condlist = [data_rawa['np'].isnull(),
            data_rawa['act'].isnull() | data_rawa['lct'].isnull()]
# FIX: scale the whole change in (act-lct) by 10*be; a misplaced parenthesis
# in the original divided only the lagged term by (10*be)
choicelist = [((data_rawa['act']-data_rawa['lct'])-(data_rawa['act_l1']-data_rawa['lct_l1']))/(10*data_rawa['be']),
              (data_rawa['ib']-data_rawa['oancf'])/(10*data_rawa['be'])]
data_rawa['acc'] = np.select(condlist,
                             choicelist,
                             default=((data_rawa['act']-data_rawa['lct']+data_rawa['np'])-
                                      (data_rawa['act_l1']-data_rawa['lct_l1']+data_rawa['np_l1']))/(10*data_rawa['be']))

# agr
data_rawa['at_l1'] = data_rawa.groupby(['permno'])['at'].shift(1)
data_rawa['agr'] = (data_rawa['at']-data_rawa['at_l1'])/data_rawa['at_l1']

# bm
# data_rawa['bm'] = data_rawa['be'] / data_rawa['me']

# cfp
# condlist = [data_rawa['dp'].isnull(),
#             data_rawa['ib'].isnull()]
# choicelist = [data_rawa['ib']/data_rawa['me'],
#               np.nan]
# data_rawa['cfp'] = np.select(condlist, choicelist, default=(data_rawa['ib']+data_rawa['dp'])/data_rawa['me'])

# ep
# data_rawa['ep'] = data_rawa['ib']/data_rawa['me']

# ni
data_rawa['csho_l1'] = data_rawa.groupby(['permno'])['csho'].shift(1)
data_rawa['ajex_l1'] = data_rawa.groupby(['permno'])['ajex'].shift(1)
# NOTE(review): this overwrites the Compustat net-income column 'ni' pulled in
# the query above with the share-issuance characteristic; later uses of 'ni'
# (e.g. roa below) therefore see the characteristic -- confirm intended
data_rawa['ni'] = np.where(data_rawa['gvkey'] != data_rawa['gvkey'].shift(1),
                           np.nan,
                           np.log(data_rawa['csho']*data_rawa['ajex']).replace(-np.inf, 0)-
                           np.log(data_rawa['csho_l1']*data_rawa['ajex_l1']).replace(-np.inf, 0))

# op
data_rawa['cogs0'] = np.where(data_rawa['cogs'].isnull(), 0, data_rawa['cogs'])
data_rawa['xint0'] = np.where(data_rawa['xint'].isnull(), 0, data_rawa['xint'])
data_rawa['xsga0'] = np.where(data_rawa['xsga'].isnull(), 0, data_rawa['xsga'])

condlist = [data_rawa['revt'].isnull(), data_rawa['be'].isnull()]
choicelist = [np.nan, np.nan]
data_rawa['op'] = np.select(condlist, choicelist,
                            default=(data_rawa['revt'] - data_rawa['cogs0'] - data_rawa['xsga0'] - data_rawa['xint0'])/data_rawa['be'])

# rsup
data_rawa['sale_l1'] = data_rawa.groupby(['permno'])['sale'].shift(1)
# data_rawa['rsup'] = (data_rawa['sale']-data_rawa['sale_l1'])/data_rawa['me']

# cash
data_rawa['cash'] = data_rawa['che']/data_rawa['at']

# lev
# data_rawa['lev'] = data_rawa['lt']/data_rawa['me']

# sp
# data_rawa['sp'] = data_rawa['sale']/data_rawa['me']

# rd_sale
data_rawa['rd_sale'] = data_rawa['xrd']/data_rawa['sale']

# rdm
# data_rawa['rdm'] = data_rawa['xrd']/data_rawa['me']

# adm hxz adm
# data_rawa['adm'] = data_rawa['xad']/data_rawa['me']

# gma
data_rawa['gma'] = (data_rawa['revt']-data_rawa['cogs'])/data_rawa['at_l1']

# chcsho
data_rawa['chcsho'] = (data_rawa['csho']/data_rawa['csho_l1'])-1

# lgr
data_rawa['lt_l1'] = data_rawa.groupby(['permno'])['lt'].shift(1)
data_rawa['lgr'] = (data_rawa['lt']/data_rawa['lt_l1'])-1

# pctacc
data_rawa['che_l1'] = data_rawa.groupby(['permno'])['che'].shift(1)
data_rawa['dlc_l1'] = data_rawa.groupby(['permno'])['dlc'].shift(1)
data_rawa['txp_l1'] = data_rawa.groupby(['permno'])['txp'].shift(1)

# balance-sheet accruals, used when oancf is missing
accruals_bs = ((data_rawa['act'] - data_rawa['act_l1']) - (data_rawa['che'] - data_rawa['che_l1'])) - \
              ((data_rawa['lct'] - data_rawa['lct_l1']) - (data_rawa['dlc'] - data_rawa['dlc_l1']) -
               (data_rawa['txp'] - data_rawa['txp_l1']) - data_rawa['dp'])
# FIX: np.select takes the FIRST true condition, so the combined
# "oancf missing and ib == 0" case must come before the plain "oancf missing"
# case (it was last and unreachable); the original also had an operator-
# precedence bug ('&' binds before '==') and unbalanced parentheses that
# applied the |ib| denominator to only part of the accrual expression
condlist = [data_rawa['oancf'].isnull() & (data_rawa['ib'] == 0),
            data_rawa['oancf'].isnull(),
            data_rawa['ib'] == 0]
choicelist = [accruals_bs/0.01,
              accruals_bs/data_rawa['ib'].abs(),
              (data_rawa['ib']-data_rawa['oancf'])/0.01]
data_rawa['pctacc'] = np.select(condlist, choicelist, default=(data_rawa['ib']-data_rawa['oancf'])/data_rawa['ib'].abs())

# sgr
data_rawa['sgr'] = (data_rawa['sale']/data_rawa['sale_l1'])-1

# chato
data_rawa['at_l2'] = data_rawa.groupby(['permno'])['at'].shift(2)
# FIX: the lagged turnover term should average the lagged assets
# (at_l1 + at_l2), not (at + at_l2); the quarterly version uses
# (atq_l4 + atq_l8) for the same term
data_rawa['chato'] = (data_rawa['sale']/((data_rawa['at']+data_rawa['at_l1'])/2))-\
                     (data_rawa['sale_l1']/((data_rawa['at_l1']+data_rawa['at_l2'])/2))

# chtx
data_rawa['txt_l1'] = data_rawa.groupby(['permno'])['txt'].shift(1)
data_rawa['chtx'] = (data_rawa['txt']-data_rawa['txt_l1'])/data_rawa['at_l1']

# noa
# FIX: net operating assets = (operating assets - operating liabilities),
# all scaled by lagged assets; the original's parentheses divided only the
# operating-liabilities term by at_l1
data_rawa['noa'] = ((data_rawa['at']-data_rawa['che']-data_rawa['ivao'].fillna(0))-
                    (data_rawa['at']-data_rawa['dlc'].fillna(0)-data_rawa['dltt'].fillna(0)-data_rawa['mib'].fillna(0)
                     -data_rawa['pstk'].fillna(0)-data_rawa['ceq']))/data_rawa['at_l1']

# rna
data_rawa['noa_l1'] = data_rawa.groupby(['permno'])['noa'].shift(1)
data_rawa['rna'] = data_rawa['oiadp']/data_rawa['noa_l1']

# pm
data_rawa['pm'] = data_rawa['oiadp']/data_rawa['sale']

# ato
data_rawa['ato'] = data_rawa['sale']/data_rawa['noa_l1']

# depr
data_rawa['depr'] = data_rawa['dp']/data_rawa['ppent']

# invest
data_rawa['ppent_l1'] = data_rawa.groupby(['permno'])['ppent'].shift(1)
data_rawa['invt_l1'] = data_rawa.groupby(['permno'])['invt'].shift(1)
# FIX: when ppegt is available the change must be ppegt - ppegt_l1; the
# original mixed ppegt with ppent_l1 (the quarterly version uses
# ppegtq - ppegtq_l4)
data_rawa['ppegt_l1'] = data_rawa.groupby(['permno'])['ppegt'].shift(1)

data_rawa['invest'] = np.where(data_rawa['ppegt'].isnull(),
                               ((data_rawa['ppent']-data_rawa['ppent_l1'])+
                                (data_rawa['invt']-data_rawa['invt_l1']))/data_rawa['at_l1'],
                               ((data_rawa['ppegt']-data_rawa['ppegt_l1'])+
                                (data_rawa['invt']-data_rawa['invt_l1']))/data_rawa['at_l1'])

# egr
data_rawa['ceq_l1'] = data_rawa.groupby(['permno'])['ceq'].shift(1)
data_rawa['egr'] = ((data_rawa['ceq']-data_rawa['ceq_l1'])/data_rawa['ceq_l1'])

# cashdebt
data_rawa['cashdebt'] = (data_rawa['ib']+data_rawa['dp'])/((data_rawa['lt']+data_rawa['lt_l1'])/2)

# rd
# if ((xrd/at)-(lag(xrd/lag(at))))/(lag(xrd/lag(at))) >.05 then rd=1 else rd=0
data_rawa['xrd/at_l1'] = data_rawa['xrd']/data_rawa['at_l1']
data_rawa['xrd/at_l1_l1'] = data_rawa.groupby(['permno'])['xrd/at_l1'].shift(1)
data_rawa['rd'] = np.where(((data_rawa['xrd']/data_rawa['at'])-
                            (data_rawa['xrd/at_l1_l1']))/data_rawa['xrd/at_l1_l1'] > 0.05, 1, 0)

# roa
# NOTE(review): 'ni' was overwritten above by the share-issuance
# characteristic, so this is NOT net income / average assets -- confirm
# which 'ni' is intended here
data_rawa['roa'] = data_rawa['ni']/((data_rawa['at']+data_rawa['at_l1'])/2)

# roe
data_rawa['roe'] = data_rawa['ib']/data_rawa['ceq_l1']

# dy
# data_rawa['dy'] = data_rawa['dvt']/data_rawa['me']

################## Added on 2020.07.28 ##################
# roic
data_rawa['roic'] = (data_rawa['ebit'] - data_rawa['nopi'])/(data_rawa['ceq'] + data_rawa['lt'] - data_rawa['che'])

# chinv
# NOTE(review): scales by (at + at_l2)/2 while sibling chars use
# (at + at_l1)/2 -- confirm the intended averaging window
data_rawa['chinv'] = (data_rawa['invt'] - data_rawa['invt_l1'])/((data_rawa['at'] + data_rawa['at_l2'])/2)

# pchsale_pchinvt
data_rawa['pchsale_pchinvt'] = ((data_rawa['sale'] - data_rawa['sale_l1'])/data_rawa['sale_l1'])\
    - ((data_rawa['invt']-data_rawa['invt_l1'])/data_rawa['invt_l1'])

# pchsale_pchrect
data_rawa['rect_l1'] = data_rawa.groupby(['permno'])['rect'].shift(1)
data_rawa['pchsale_pchrect'] = ((data_rawa['sale']-data_rawa['sale_l1'])/data_rawa['sale_l1'])\
    - ((data_rawa['rect']-data_rawa['rect_l1'])/data_rawa['rect_l1'])

# pchgm_pchsale
data_rawa['cogs_l1'] = data_rawa.groupby(['permno'])['cogs'].shift(1)
# FIX: the sales-growth term is scaled by lagged sales (sale_l1), matching
# every other pch* variable; the original divided by current sale
data_rawa['pchgm_pchsale'] = (((data_rawa['sale']-data_rawa['cogs'])
                               - (data_rawa['sale_l1']-data_rawa['cogs_l1']))/(data_rawa['sale_l1']-data_rawa['cogs_l1']))\
    - ((data_rawa['sale']-data_rawa['sale_l1'])/data_rawa['sale_l1'])

# pchsale_pchxsga
data_rawa['xsga_l1'] = data_rawa.groupby(['permno'])['xsga'].shift(1)
data_rawa['pchsale_pchxsga'] = ((data_rawa['sale']-data_rawa['sale_l1'])/data_rawa['sale_l1'])\
    - ((data_rawa['xsga']-data_rawa['xsga_l1'])/data_rawa['xsga_l1'])

# pchdepr
data_rawa['dp_l1'] = data_rawa.groupby(['permno'])['dp'].shift(1)
# FIX: percent change in depreciation rate divides by the LAGGED rate
# dp_l1/ppent_l1; the original denominator mixed dp_l1 with current ppent
data_rawa['pchdepr'] = ((data_rawa['dp']/data_rawa['ppent'])-(data_rawa['dp_l1']/data_rawa['ppent_l1']))\
    / (data_rawa['dp_l1']/data_rawa['ppent_l1'])

# chadv
data_rawa['xad_l1'] = data_rawa.groupby(['permno'])['xad'].shift(1)
data_rawa['chadv'] = np.log(data_rawa['xad'] + 1) - np.log(data_rawa['xad_l1'] + 1)

# pchcapx
data_rawa['capx_l1'] = data_rawa.groupby(['permno'])['capx'].shift(1)
data_rawa['pchcapx'] = (data_rawa['capx']-data_rawa['capx_l1'])/data_rawa['capx_l1']

# grcapx
data_rawa['capx_l2'] = data_rawa.groupby(['permno'])['capx'].shift(2)
data_rawa['grcapx'] = (data_rawa['capx']-data_rawa['capx_l2'])/data_rawa['capx_l2']

# grGW
data_rawa['gdwl_l1'] = data_rawa.groupby(['permno'])['gdwl'].shift(1)
data_rawa['grGW'] = (data_rawa['gdwl']-data_rawa['gdwl_l1'])/data_rawa['gdwl']
condlist = [(data_rawa['gdwl'] == 0) | (data_rawa['gdwl'].isnull()),
            (data_rawa['gdwl'].notna()) & (data_rawa['gdwl'] != 0) & (data_rawa['grGW'].isnull())]
choicelist = [0, 1]
data_rawa['grGW'] = np.select(condlist, choicelist, default=data_rawa['grGW'])

# currat
data_rawa['currat'] = data_rawa['act']/data_rawa['lct']

# pchcurrat
data_rawa['pchcurrat'] = ((data_rawa['act']/data_rawa['lct'])-(data_rawa['act_l1']/data_rawa['lct_l1']))\
    /(data_rawa['act_l1']/data_rawa['lct_l1'])

# quick
data_rawa['quick'] = (data_rawa['act']-data_rawa['invt'])/data_rawa['lct']

# pchquick
data_rawa['pchquick'] = ((data_rawa['act']-data_rawa['invt'])/data_rawa['lct']
                         - (data_rawa['act_l1']-data_rawa['invt_l1'])/data_rawa['lct_l1'])\
    /((data_rawa['act_l1']-data_rawa['invt_l1'])/data_rawa['lct_l1'])

# salecash
data_rawa['salecash'] = data_rawa['sale']/data_rawa['che']

# salerec
data_rawa['salerec'] = data_rawa['sale']/data_rawa['rect']

# saleinv
data_rawa['saleinv'] = data_rawa['sale']/data_rawa['invt']

# pchsaleinv
data_rawa['pchsaleinv'] = ((data_rawa['sale']/data_rawa['invt'])-(data_rawa['sale_l1']/data_rawa['invt_l1']))\
    /(data_rawa['sale_l1']/data_rawa['invt_l1'])

# realestate
data_rawa['realestate'] = (data_rawa['fatb']+data_rawa['fatl'])/data_rawa['ppegt']
data_rawa['realestate'] = np.where(data_rawa['ppegt'].isnull(),
                                   (data_rawa['fatb']+data_rawa['fatl'])/data_rawa['ppent'], data_rawa['realestate'])

# obklg
data_rawa['obklg'] = data_rawa['ob']/((data_rawa['at']+data_rawa['at_l1'])/2)

# chobklg
data_rawa['ob_l1'] = data_rawa.groupby(['permno'])['ob'].shift(1)
data_rawa['chobklg'] = (data_rawa['ob'] - data_rawa['ob_l1'])/((data_rawa['at']+data_rawa['at_l1'])/2)

# grltnoa
data_rawa['aco_l1'] = data_rawa.groupby(['permno'])['aco'].shift(1)
data_rawa['intan_l1'] = data_rawa.groupby(['permno'])['intan'].shift(1)
data_rawa['ao_l1'] = data_rawa.groupby(['permno'])['ao'].shift(1)
data_rawa['ap_l1'] = data_rawa.groupby(['permno'])['ap'].shift(1)
data_rawa['lco_l1'] = data_rawa.groupby(['permno'])['lco'].shift(1)
data_rawa['lo_l1'] = data_rawa.groupby(['permno'])['lo'].shift(1)
data_rawa['rect_l1'] = data_rawa.groupby(['permno'])['rect'].shift(1)

# growth in long-term net operating assets, scaled by average assets
data_rawa['grltnoa'] = ((data_rawa['rect']+data_rawa['invt']+data_rawa['ppent']+data_rawa['aco']+data_rawa['intan']+
                         data_rawa['ao']-data_rawa['ap']-data_rawa['lco']-data_rawa['lo'])
                        - (data_rawa['rect_l1']+data_rawa['invt_l1']+data_rawa['ppent_l1']+data_rawa['aco_l1']
                           + data_rawa['intan_l1']+data_rawa['ao_l1']-data_rawa['ap_l1']-data_rawa['lco_l1']
                           - data_rawa['lo_l1'])
                        - (data_rawa['rect']-data_rawa['rect_l1']+data_rawa['invt']-data_rawa['invt_l1']
                           + data_rawa['aco']-data_rawa['aco_l1']
                           - (data_rawa['ap']-data_rawa['ap_l1']+data_rawa['lco']-data_rawa['lco_l1'])-data_rawa['dp']))\
    /((data_rawa['at']+data_rawa['at_l1'])/2)

# conv
data_rawa['conv'] = data_rawa['dc']/data_rawa['dltt']

# chdrc
data_rawa['dr_l1'] = data_rawa.groupby(['permno'])['dr'].shift(1)
data_rawa['chdrc'] = (data_rawa['dr']-data_rawa['dr_l1'])/((data_rawa['at']+data_rawa['at_l1'])/2)

# rdbias
data_rawa['xrd_l1'] = data_rawa.groupby(['permno'])['xrd'].shift(1)
data_rawa['rdbias'] = (data_rawa['xrd']/data_rawa['xrd_l1'])-1-data_rawa['ib']/data_rawa['ceq_l1']

# operprof
data_rawa['operprof'] = (data_rawa['revt']-data_rawa['cogs']-data_rawa['xsga0']-data_rawa['xint0'])/data_rawa['ceq_l1']

# cfroa
data_rawa['cfroa'] = data_rawa['oancf']/((data_rawa['at']+data_rawa['at_l1'])/2)
data_rawa['cfroa'] = np.where(data_rawa['oancf'].isnull(),
                              (data_rawa['ib'] + data_rawa['dp'])/((data_rawa['at']+data_rawa['at_l1'])/2),
                              data_rawa['cfroa'])

# xrdint
data_rawa['xrdint'] = data_rawa['xrd']/((data_rawa['at']+data_rawa['at_l1'])/2)

# capxint
data_rawa['capxint'] = data_rawa['capx']/((data_rawa['at']+data_rawa['at_l1'])/2)

# xadint
data_rawa['xadint'] = data_rawa['xad']/((data_rawa['at']+data_rawa['at_l1'])/2)

# chpm
data_rawa['ib_l1'] = data_rawa.groupby(['permno'])['ib'].shift(1)
data_rawa['chpm'] = (data_rawa['ib']/data_rawa['sale'])-(data_rawa['ib_l1']/data_rawa['sale_l1'])

# ala (asset liquidity)
data_rawa['gdwl'] = np.where(data_rawa['gdwl'].isnull(), 0, data_rawa['gdwl'])
data_rawa['intan'] = np.where(data_rawa['intan'].isnull(), 0, data_rawa['intan'])
# NOTE(review): the quarterly version ADDS the 0.5 * tangible-fixed-asset
# term while this annual version subtracts it -- the two disagree; confirm
# the intended sign before relying on either
data_rawa['ala'] = data_rawa['che']+0.75*(data_rawa['act']-data_rawa['che'])-\
    0.5*(data_rawa['at']-data_rawa['act']-data_rawa['gdwl']-data_rawa['intan'])

# alm
data_rawa['alm'] = data_rawa['ala']/(data_rawa['at']+data_rawa['prcc_f']*data_rawa['csho']-data_rawa['ceq'])

# hire
data_rawa['emp_l1'] = data_rawa.groupby(['permno'])['emp'].shift(1)
data_rawa['hire'] = (data_rawa['emp'] - data_rawa['emp_l1'])/data_rawa['emp_l1']
data_rawa['hire'] = np.where((data_rawa['emp'].isnull()) | (data_rawa['emp_l1'].isnull()), 0, data_rawa['hire'])

# herf: industry sales concentration by Fama-French 49 industry
data_rawa['sic'] = data_rawa['sic'].astype(int)
data_rawa['ffi49'] = ffi49(data_rawa)
data_rawa['ffi49'] = data_rawa['ffi49'].fillna(49)
data_rawa['ffi49'] = data_rawa['ffi49'].astype(int)
df_temp = data_rawa.groupby(['datadate', 'ffi49'], as_index=False)['sale'].sum()
df_temp = df_temp.rename(columns={'sale': 'indsale'})
data_rawa = pd.merge(data_rawa, df_temp, how='left', on=['datadate', 'ffi49'])
data_rawa['herf'] = (data_rawa['sale']/data_rawa['indsale'])*(data_rawa['sale']/data_rawa['indsale'])
df_temp = data_rawa.groupby(['datadate', 'ffi49'], as_index=False)['herf'].sum()
data_rawa = data_rawa.drop(['herf'], axis=1)
data_rawa = pd.merge(data_rawa, df_temp, how='left', on=['datadate', 'ffi49'])

#######################################################################################################################
# Compustat Quarterly Raw Info                                                                                        #
#######################################################################################################################
comp = conn.raw_sql("""
    /*header info*/
    select c.gvkey, f.cusip, f.datadate, f.fyearq, substr(c.sic,1,2) as sic2, c.sic, f.fqtr, f.rdq,

    /*income statement*/
    f.ibq, f.saleq, f.txtq, f.revtq, f.cogsq, f.xsgaq, f.revty, f.cogsy, f.saley,

    /*balance sheet items*/
    f.atq, f.actq, f.cheq, f.lctq, f.dlcq, f.ppentq, f.ppegtq,

    /*others*/
    abs(f.prccq) as prccq, abs(f.prccq)*f.cshoq as mveq_f, f.ceqq, f.seqq, f.pstkq, f.ltq,
    f.pstkrq, f.gdwlq, f.intanq, f.mibq, f.oiadpq, f.ivaoq,

    /* v3 my formula add*/
    f.ajexq, f.cshoq, f.txditcq, f.npq, f.xrdy, f.xrdq, f.dpq, f.xintq, f.invtq, f.scstkcy, f.niq,
    f.oancfy, f.dlttq, f.rectq, f.acoq, f.apq, f.lcoq, f.loq, f.aoq

    from comp.fundq as f
    left join comp.company as c
    on f.gvkey = c.gvkey

    /*get consolidated, standardized, industrial format statements*/
    where f.indfmt = 'INDL'
    and f.datafmt = 'STD'
    and f.popsrc = 'D'
    and f.consol = 'C'
    and f.datadate >= '01/01/1959'
    """)

# comp['cusip6'] = comp['cusip'].str.strip().str[0:6]
comp = comp.dropna(subset=['ibq'])

# sort and clean up (0 shares/equity/assets are data errors, treat as missing)
comp = comp.sort_values(by=['gvkey', 'datadate']).drop_duplicates()
comp['cshoq'] = np.where(comp['cshoq'] == 0, np.nan, comp['cshoq'])
comp['ceqq'] = np.where(comp['ceqq'] == 0, np.nan, comp['ceqq'])
comp['atq'] = np.where(comp['atq'] == 0, np.nan, comp['atq'])
comp = comp.dropna(subset=['atq'])

# convert datadate to date fmt
comp['datadate'] = pd.to_datetime(comp['datadate'])

# merge ccm and comp
ccm1 = pd.merge(comp, ccm, how='left', on=['gvkey'])
ccm1['yearend'] = ccm1['datadate'] + YearEnd(0)
ccm1['jdate'] = ccm1['datadate'] + MonthEnd(3)  # we change quarterly lag here
# ccm1['jdate'] = ccm1['datadate']+MonthEnd(4)

# set link date bounds
ccm2 = ccm1[(ccm1['jdate'] >= ccm1['linkdt']) & (ccm1['jdate'] <= ccm1['linkenddt'])]

# merge ccm2 and crsp2
# crsp2['jdate'] = crsp2['monthend']
data_rawq = pd.merge(crsp2, ccm2, how='inner', on=['permno', 'jdate'])

# filter exchcd & shrcd (NYSE/AMEX/NASDAQ common shares only)
data_rawq = data_rawq[((data_rawq['exchcd'] == 1) | (data_rawq['exchcd'] == 2) | (data_rawq['exchcd'] == 3)) &
                      ((data_rawq['shrcd'] == 10) | (data_rawq['shrcd'] == 11))]

# process Market Equity
'''
Note: me is CRSP market equity, mveq_f is Compustat market equity. Please choose the me below.
'''
data_rawq['me'] = data_rawq['me']/1000  # CRSP ME
# data_rawq['me'] = data_rawq['mveq_f']  # Compustat ME

# there are some ME equal to zero since this company do not have price or shares data, we drop these observations
data_rawq['me'] = np.where(data_rawq['me'] == 0, np.nan, data_rawq['me'])
data_rawq = data_rawq.dropna(subset=['me'])

# count single stock years
# data_rawq['count'] = data_rawq.groupby(['gvkey']).cumcount()

# deal with the duplicates
data_rawq.loc[data_rawq.groupby(['datadate', 'permno', 'linkprim'], as_index=False).nth([0]).index, 'temp'] = 1
data_rawq = data_rawq[data_rawq['temp'].notna()]
data_rawq.loc[data_rawq.groupby(['permno', 'yearend', 'datadate'], as_index=False).nth([-1]).index, 'temp'] = 1
data_rawq = data_rawq[data_rawq['temp'].notna()]

data_rawq = data_rawq.sort_values(by=['permno', 'jdate'])

#######################################################################################################################
# Quarterly Variables                                                                                                 #
#######################################################################################################################
# prepare be (positive book equity only)
data_rawq['beq'] = np.where(data_rawq['seqq'] > 0, data_rawq['seqq']+data_rawq['txditcq']-data_rawq['pstkq'], np.nan)
data_rawq['beq'] = np.where(data_rawq['beq'] <= 0, np.nan, data_rawq['beq'])

# dy
# data_rawq['me_l1'] = data_rawq.groupby(['permno'])['me'].shift(1)
# data_rawq['retdy'] = data_rawq['ret'] - data_rawq['retx']
# data_rawq['mdivpay'] = data_rawq['retdy']*data_rawq['me_l1']
#
# data_rawq['dy'] = ttm12(series='mdivpay', df=data_rawq)/data_rawq['me']

# chtx
data_rawq['txtq_l4'] = data_rawq.groupby(['permno'])['txtq'].shift(4)
data_rawq['atq_l4'] = data_rawq.groupby(['permno'])['atq'].shift(4)
data_rawq['chtx'] = (data_rawq['txtq']-data_rawq['txtq_l4'])/data_rawq['atq_l4']

# roa
data_rawq['atq_l1'] = data_rawq.groupby(['permno'])['atq'].shift(1)
data_rawq['roa'] = data_rawq['ibq']/data_rawq['atq_l1']

# cash
data_rawq['cash'] = data_rawq['cheq']/data_rawq['atq']

# acc
data_rawq['actq_l4'] = data_rawq.groupby(['permno'])['actq'].shift(4)
data_rawq['lctq_l4'] = data_rawq.groupby(['permno'])['lctq'].shift(4)
data_rawq['npq_l4'] = data_rawq.groupby(['permno'])['npq'].shift(4)
condlist = [data_rawq['npq'].isnull(),
            data_rawq['actq'].isnull() | data_rawq['lctq'].isnull()]
choicelist = [((data_rawq['actq']-data_rawq['lctq'])-(data_rawq['actq_l4']-data_rawq['lctq_l4']))/(10*data_rawq['beq']),
              np.nan]
data_rawq['acc'] = np.select(condlist, choicelist,
                             default=((data_rawq['actq']-data_rawq['lctq']+data_rawq['npq'])-
                                      (data_rawq['actq_l4']-data_rawq['lctq_l4']+data_rawq['npq_l4']))/(10*data_rawq['beq']))

# bm
# data_rawq['bm'] = data_rawq['beq']/data_rawq['me']

# cfp
data_rawq['ibq4'] = ttm4('ibq', data_rawq)
data_rawq['dpq4'] = ttm4('dpq', data_rawq)
# data_rawq['cfp'] = np.where(data_rawq['dpq'].isnull(),
#                             data_rawq['ibq4']/data_rawq['me'],
#                             (data_rawq['ibq4']+data_rawq['dpq4'])/data_rawq['me'])

# ep
# data_rawq['ep'] = data_rawq['ibq4']/data_rawq['me']

# agr
data_rawq['agr'] = (data_rawq['atq']-data_rawq['atq_l4'])/data_rawq['atq_l4']

# ni
data_rawq['cshoq_l4'] = data_rawq.groupby(['permno'])['cshoq'].shift(4)
data_rawq['ajexq_l4'] = data_rawq.groupby(['permno'])['ajexq'].shift(4)
# FIX: guard the lagged log term against -inf as well, matching both the
# current-quarter term here and the annual version
data_rawq['ni'] = np.where(data_rawq['cshoq'].isnull(), np.nan,
                           np.log(data_rawq['cshoq']*data_rawq['ajexq']).replace(-np.inf, 0)-
                           np.log(data_rawq['cshoq_l4']*data_rawq['ajexq_l4']).replace(-np.inf, 0))

# op
data_rawq['xintq0'] = np.where(data_rawq['xintq'].isnull(), 0, data_rawq['xintq'])
data_rawq['xsgaq0'] = np.where(data_rawq['xsgaq'].isnull(), 0, data_rawq['xsgaq'])
data_rawq['beq_l4'] = data_rawq.groupby(['permno'])['beq'].shift(4)

data_rawq['op'] = (ttm4('revtq', data_rawq)-ttm4('cogsq', data_rawq)-ttm4('xsgaq0', data_rawq)-ttm4('xintq0', data_rawq))/data_rawq['beq_l4']

# chcsho
data_rawq['chcsho'] = (data_rawq['cshoq']/data_rawq['cshoq_l4'])-1

# cashdebt
data_rawq['ltq_l4'] = data_rawq.groupby(['permno'])['ltq'].shift(4)
data_rawq['cashdebt'] = (ttm4('ibq', data_rawq) + ttm4('dpq', data_rawq))/((data_rawq['ltq']+data_rawq['ltq_l4'])/2)

# rd
data_rawq['xrdq4'] = ttm4('xrdq', data_rawq)
data_rawq['xrdq4'] = np.where(data_rawq['xrdq4'].isnull(), data_rawq['xrdy'], data_rawq['xrdq4'])

data_rawq['xrdq4/atq_l4'] = data_rawq['xrdq4']/data_rawq['atq_l4']
data_rawq['xrdq4/atq_l4_l4'] = data_rawq.groupby(['permno'])['xrdq4/atq_l4'].shift(4)
data_rawq['rd'] = np.where(((data_rawq['xrdq4']/data_rawq['atq'])-data_rawq['xrdq4/atq_l4_l4'])/data_rawq['xrdq4/atq_l4_l4'] > 0.05, 1, 0)

# pctacc
condlist = [data_rawq['npq'].isnull(),
            data_rawq['actq'].isnull() | data_rawq['lctq'].isnull()]
choicelist = [((data_rawq['actq']-data_rawq['lctq'])-(data_rawq['actq_l4']-data_rawq['lctq_l4']))/abs(ttm4('ibq', data_rawq)),
              np.nan]
data_rawq['pctacc'] = np.select(condlist, choicelist,
                                default=((data_rawq['actq']-data_rawq['lctq']+data_rawq['npq'])-
                                         (data_rawq['actq_l4']-data_rawq['lctq_l4']+data_rawq['npq_l4']))/
                                abs(ttm4('ibq', data_rawq)))

# gma
data_rawq['revtq4'] = ttm4('revtq', data_rawq)
data_rawq['cogsq4'] = ttm4('cogsq', data_rawq)
data_rawq['gma'] = (data_rawq['revtq4']-data_rawq['cogsq4'])/data_rawq['atq_l4']

# lev
data_rawq['ltq']/data_rawq['me']
+
+# rdm
+# data_rawq['rdm'] = data_rawq['xrdq4']/data_rawq['me']
+
+# sgr
+data_rawq['saleq4'] = ttm4('saleq', data_rawq)
+data_rawq['saleq4'] = np.where(data_rawq['saleq4'].isnull(), data_rawq['saley'], data_rawq['saleq4'])
+
+data_rawq['saleq4_l4'] = data_rawq.groupby(['permno'])['saleq4'].shift(4)
+data_rawq['sgr'] = (data_rawq['saleq4']/data_rawq['saleq4_l4'])-1
+
+# sp
+# data_rawq['sp'] = data_rawq['saleq4']/data_rawq['me']
+
+# invest
+data_rawq['ppentq_l4'] = data_rawq.groupby(['permno'])['ppentq'].shift(4)
+data_rawq['invtq_l4'] = data_rawq.groupby(['permno'])['invtq'].shift(4)
+data_rawq['ppegtq_l4'] = data_rawq.groupby(['permno'])['ppegtq'].shift(4)
+
+data_rawq['invest'] = np.where(data_rawq['ppegtq'].isnull(), ((data_rawq['ppentq']-data_rawq['ppentq_l4'])+
+                                                             (data_rawq['invtq']-data_rawq['invtq_l4']))/data_rawq['atq_l4'],
+                               ((data_rawq['ppegtq']-data_rawq['ppegtq_l4'])+(data_rawq['invtq']-data_rawq['invtq_l4']))/data_rawq['atq_l4'])
+
+# rd_sale
+data_rawq['rd_sale'] = data_rawq['xrdq4']/data_rawq['saleq4']
+
+# lgr
+data_rawq['lgr'] = (data_rawq['ltq']/data_rawq['ltq_l4'])-1
+
+# depr
+data_rawq['depr'] = ttm4('dpq', data_rawq)/data_rawq['ppentq']
+
+# egr
+data_rawq['ceqq_l4'] = data_rawq.groupby(['permno'])['ceqq'].shift(4)
+data_rawq['egr'] = (data_rawq['ceqq']-data_rawq['ceqq_l4'])/data_rawq['ceqq_l4']
+
+# chpm
+data_rawq['ibq4_l1'] = data_rawq.groupby(['permno'])['ibq4'].shift(1)
+data_rawq['saleq4_l1'] = data_rawq.groupby(['permno'])['saleq4'].shift(1)
+
+data_rawq['chpm'] = (data_rawq['ibq4']/data_rawq['saleq4'])-(data_rawq['ibq4_l1']/data_rawq['saleq4_l1'])
+
+# chato
+data_rawq['atq_l8'] = data_rawq.groupby(['permno'])['atq'].shift(8)
+data_rawq['chato'] = (data_rawq['saleq4']/((data_rawq['atq']+data_rawq['atq_l4'])/2))-(data_rawq['saleq4_l4']/((data_rawq['atq_l4']+data_rawq['atq_l8'])/2))
+
+# noa
+# NOTE(review): treat missing balance-sheet items as 0 but KEEP reported
+# values; np.where(..., 0, 1) overwrote every non-missing value with 1
+# (compare the correct gdwlq/intanq fills in the 'ala' section below).
+data_rawq['ivaoq'] = np.where(data_rawq['ivaoq'].isnull(), 0, data_rawq['ivaoq'])
+data_rawq['dlcq'] = np.where(data_rawq['dlcq'].isnull(), 0, data_rawq['dlcq'])
+data_rawq['dlttq'] = np.where(data_rawq['dlttq'].isnull(), 0, data_rawq['dlttq'])
+data_rawq['mibq'] = np.where(data_rawq['mibq'].isnull(), 0, data_rawq['mibq'])
+data_rawq['pstkq'] = np.where(data_rawq['pstkq'].isnull(), 0, data_rawq['pstkq'])
+# NOTE(review): parenthesize (operating assets - operating liabilities)
+# before scaling; previously only the liability leg was divided by atq_l4.
+data_rawq['noa'] = ((data_rawq['atq']-data_rawq['cheq']-data_rawq['ivaoq'])-
+                    (data_rawq['atq']-data_rawq['dlcq']-data_rawq['dlttq']-data_rawq['mibq']-data_rawq['pstkq']-data_rawq['ceqq']))/data_rawq['atq_l4']
+
+# rna
+data_rawq['noa_l4'] = data_rawq.groupby(['permno'])['noa'].shift(4)
+data_rawq['rna'] = data_rawq['oiadpq']/data_rawq['noa_l4']
+
+# pm
+data_rawq['pm'] = data_rawq['oiadpq']/data_rawq['saleq']
+
+# ato
+data_rawq['ato'] = data_rawq['saleq']/data_rawq['noa_l4']
+
+# roe
+data_rawq['ceqq_l1'] = data_rawq.groupby(['permno'])['ceqq'].shift(1)
+data_rawq['roe'] = data_rawq['ibq']/data_rawq['ceqq_l1']
+
+################################## New Added ##################################
+
+# grltnoa
+data_rawq['rectq_l4'] = data_rawq.groupby(['permno'])['rectq'].shift(4)
+data_rawq['acoq_l4'] = data_rawq.groupby(['permno'])['acoq'].shift(4)
+data_rawq['apq_l4'] = data_rawq.groupby(['permno'])['apq'].shift(4)
+data_rawq['lcoq_l4'] = data_rawq.groupby(['permno'])['lcoq'].shift(4)
+data_rawq['loq_l4'] = data_rawq.groupby(['permno'])['loq'].shift(4)
+data_rawq['invtq_l4'] = data_rawq.groupby(['permno'])['invtq'].shift(4)
+data_rawq['ppentq_l4'] = data_rawq.groupby(['permno'])['ppentq'].shift(4)
+data_rawq['atq_l4'] = data_rawq.groupby(['permno'])['atq'].shift(4)
+
+# NOTE(review): "data_rawq['acoq_l4'])-" was lost in a garbled span, leaving
+# the expression with unbalanced parentheses; reinstated below.
+data_rawq['grltnoa'] = ((data_rawq['rectq']+data_rawq['invtq']+data_rawq['ppentq']+data_rawq['acoq']+data_rawq['intanq']+
+                         data_rawq['aoq']-data_rawq['apq']-data_rawq['lcoq']-data_rawq['loq'])-
+                        (data_rawq['rectq_l4']+data_rawq['invtq_l4']+data_rawq['ppentq_l4']+data_rawq['acoq_l4']-data_rawq['apq_l4']-data_rawq['lcoq_l4']-data_rawq['loq_l4'])-\
+                        (data_rawq['rectq']-data_rawq['rectq_l4']+data_rawq['invtq']-data_rawq['invtq_l4']+data_rawq['acoq']-
+                         data_rawq['acoq_l4']-
+                         (data_rawq['apq']-data_rawq['apq_l4']+data_rawq['lcoq']-data_rawq['lcoq_l4'])-
+                         ttm4('dpq', data_rawq)))/((data_rawq['atq']+data_rawq['atq_l4'])/2)
+
+# scal
+# condlist = [data_rawq['seqq'].isnull(),
+#             data_rawq['seqq'].isnull() & (data_rawq['ceqq'].isnull() | data_rawq['pstk'].isnull())]
+# choicelist = [data_rawq['ceqq']+data_rawq['pstk'],
+#               data_rawq['atq']-data_rawq['ltq']]
+# data_rawq['scal'] = np.select(condlist, choicelist, default=data_rawq['seqq'])
+
+# ala
+data_rawq['gdwlq'] = np.where(data_rawq['gdwlq'].isnull(), 0, data_rawq['gdwlq'])
+data_rawq['intanq'] = np.where(data_rawq['intanq'].isnull(), 0, data_rawq['intanq'])
+data_rawq['ala'] = data_rawq['cheq'] + 0.75*(data_rawq['actq']-data_rawq['cheq'])+\
+                   0.5*(data_rawq['atq']-data_rawq['actq']-data_rawq['gdwlq']-data_rawq['intanq'])
+
+# alm
+# data_rawq['alm'] = data_rawq['ala']/(data_rawq['atq']+data_rawq['me']-data_rawq['ceqq'])
+
+# rsup
+data_rawq['saleq_l4'] = data_rawq.groupby(['permno'])['saleq'].shift(4)
+# data_rawq['rsup'] = (data_rawq['saleq'] - data_rawq['saleq_l4'])/data_rawq['me']
+
+# stdsacc
+data_rawq['actq_l1'] = data_rawq.groupby(['permno'])['actq'].shift(1)
+data_rawq['cheq_l1'] = data_rawq.groupby(['permno'])['cheq'].shift(1)
+data_rawq['lctq_l1'] = data_rawq.groupby(['permno'])['lctq'].shift(1)
+data_rawq['dlcq_l1'] = data_rawq.groupby(['permno'])['dlcq'].shift(1)
+
+data_rawq['sacc'] = ((data_rawq['actq']-data_rawq['actq_l1'] - (data_rawq['cheq']-data_rawq['cheq_l1']))
+                     -((data_rawq['lctq']-data_rawq['lctq_l1'])-(data_rawq['dlcq']-data_rawq['dlcq_l1'])))/data_rawq['saleq']
+data_rawq['sacc'] = np.where(data_rawq['saleq']<=0, ((data_rawq['actq']-data_rawq['actq_l1'] - (data_rawq['cheq']-data_rawq['cheq_l1']))
+                                                    -((data_rawq['lctq']-data_rawq['lctq_l1'])-(data_rawq['dlcq']-data_rawq['dlcq_l1'])))/0.01, data_rawq['sacc'])
+
+
+def chars_std(start, end, df, chars):
+    """
+
+    :param start: Order of starting lag
+    :param end: Order of ending lag
+    :param df: Dataframe
+    :param 
chars: lag chars
+    :return: std of factor
+    """
+    lag = pd.DataFrame()
+    lag_list = []
+    for i in range(start, end):
+        lag['chars_l%s' % i] = df.groupby(['permno'])['%s' % chars].shift(i)
+        lag_list.append('chars_l%s' % i)
+    result = lag[lag_list].std(axis=1)
+    return result
+
+data_rawq['stdacc'] = chars_std(0, 16, data_rawq, 'sacc')
+
+# sgrvol
+# data_rawq['sgrvol'] = chars_std(0, 15, data_rawq, 'rsup')
+
+# roavol
+data_rawq['roavol'] = chars_std(0, 16, data_rawq, 'roa')
+
+# stdcf
+data_rawq['scf'] = (data_rawq['ibq']/data_rawq['saleq']) - data_rawq['sacc']
+# NOTE(review): default branch must keep the already-computed 'scf'; falling
+# back to 'sacc' discarded scf for every firm-quarter with saleq > 0
+# (compare the parallel 'sacc' guard above).
+data_rawq['scf'] = np.where(data_rawq['saleq']<=0, (data_rawq['ibq']/0.01) - data_rawq['sacc'], data_rawq['scf'])
+
+data_rawq['stdcf'] = chars_std(0, 16, data_rawq, 'scf')
+
+# cinvest
+data_rawq['ppentq_l1'] = data_rawq.groupby(['permno'])['ppentq'].shift(1)
+data_rawq['ppentq_l2'] = data_rawq.groupby(['permno'])['ppentq'].shift(2)
+data_rawq['ppentq_l3'] = data_rawq.groupby(['permno'])['ppentq'].shift(3)
+data_rawq['ppentq_l4'] = data_rawq.groupby(['permno'])['ppentq'].shift(4)
+data_rawq['saleq_l1'] = data_rawq.groupby(['permno'])['saleq'].shift(1)
+data_rawq['saleq_l2'] = data_rawq.groupby(['permno'])['saleq'].shift(2)
+data_rawq['saleq_l3'] = data_rawq.groupby(['permno'])['saleq'].shift(3)
+
+data_rawq['c_temp1'] = (data_rawq['ppentq_l1'] - data_rawq['ppentq_l2']) / data_rawq['saleq_l1']
+data_rawq['c_temp2'] = (data_rawq['ppentq_l2'] - data_rawq['ppentq_l3']) / data_rawq['saleq_l2']
+data_rawq['c_temp3'] = (data_rawq['ppentq_l3'] - data_rawq['ppentq_l4']) / data_rawq['saleq_l3']
+
+data_rawq['cinvest'] = ((data_rawq['ppentq'] - data_rawq['ppentq_l1']) / data_rawq['saleq'])\
+                       -(data_rawq[['c_temp1', 'c_temp2', 'c_temp3']].mean(axis=1))
+
+data_rawq['c_temp1'] = (data_rawq['ppentq_l1'] - data_rawq['ppentq_l2']) / 0.01
+data_rawq['c_temp2'] = (data_rawq['ppentq_l2'] - data_rawq['ppentq_l3']) / 0.01
+data_rawq['c_temp3'] = (data_rawq['ppentq_l3'] - data_rawq['ppentq_l4']) / 0.01
+
+data_rawq['cinvest'] = np.where(data_rawq['saleq']<=0, ((data_rawq['ppentq'] - data_rawq['ppentq_l1']) / 0.01) + -(data_rawq[['c_temp1', 'c_temp2', 'c_temp3']].mean(axis=1)), data_rawq['cinvest']) + +data_rawq = data_rawq.drop(['c_temp1', 'c_temp2', 'c_temp3'], axis=1) + +# nincr +data_rawq['ibq_l1'] = data_rawq.groupby(['permno'])['ibq'].shift(1) +data_rawq['ibq_l2'] = data_rawq.groupby(['permno'])['ibq'].shift(2) +data_rawq['ibq_l3'] = data_rawq.groupby(['permno'])['ibq'].shift(3) +data_rawq['ibq_l4'] = data_rawq.groupby(['permno'])['ibq'].shift(4) +data_rawq['ibq_l5'] = data_rawq.groupby(['permno'])['ibq'].shift(5) +data_rawq['ibq_l6'] = data_rawq.groupby(['permno'])['ibq'].shift(6) +data_rawq['ibq_l7'] = data_rawq.groupby(['permno'])['ibq'].shift(7) +data_rawq['ibq_l8'] = data_rawq.groupby(['permno'])['ibq'].shift(8) + +data_rawq['nincr_temp1'] = np.where(data_rawq['ibq'] > data_rawq['ibq_l1'], 1, 0) +data_rawq['nincr_temp2'] = np.where(data_rawq['ibq_l1'] > data_rawq['ibq_l2'], 1, 0) +data_rawq['nincr_temp3'] = np.where(data_rawq['ibq_l2'] > data_rawq['ibq_l3'], 1, 0) +data_rawq['nincr_temp4'] = np.where(data_rawq['ibq_l3'] > data_rawq['ibq_l4'], 1, 0) +data_rawq['nincr_temp5'] = np.where(data_rawq['ibq_l4'] > data_rawq['ibq_l5'], 1, 0) +data_rawq['nincr_temp6'] = np.where(data_rawq['ibq_l5'] > data_rawq['ibq_l6'], 1, 0) +data_rawq['nincr_temp7'] = np.where(data_rawq['ibq_l6'] > data_rawq['ibq_l7'], 1, 0) +data_rawq['nincr_temp8'] = np.where(data_rawq['ibq_l7'] > data_rawq['ibq_l8'], 1, 0) + +data_rawq['nincr'] = (data_rawq['nincr_temp1'] + + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']) + + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']) + + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']) + + (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']*data_rawq['nincr_temp5']) + + 
(data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']*data_rawq['nincr_temp5']*data_rawq['nincr_temp6']) +
+                      (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']*data_rawq['nincr_temp5']*data_rawq['nincr_temp6']*data_rawq['nincr_temp7']) +
+                      (data_rawq['nincr_temp1']*data_rawq['nincr_temp2']*data_rawq['nincr_temp3']*data_rawq['nincr_temp4']*data_rawq['nincr_temp5']*data_rawq['nincr_temp6']*data_rawq['nincr_temp7']*data_rawq['nincr_temp8']))
+
+data_rawq = data_rawq.drop(['ibq_l1', 'ibq_l2', 'ibq_l3', 'ibq_l4', 'ibq_l5', 'ibq_l6', 'ibq_l7', 'ibq_l8', 'nincr_temp1',
+                            'nincr_temp2', 'nincr_temp3', 'nincr_temp4', 'nincr_temp5', 'nincr_temp6', 'nincr_temp7',
+                            'nincr_temp8'], axis=1)
+
+# performance score
+data_rawq['niq4'] = ttm4(series='niq', df=data_rawq)
+data_rawq['niq4_l4'] = data_rawq.groupby(['permno'])['niq4'].shift(4)
+data_rawq['dlttq_l4'] = data_rawq.groupby(['permno'])['dlttq'].shift(4)
+data_rawq['p_temp1'] = np.where(data_rawq['niq4']>0, 1, 0)
+data_rawq['p_temp2'] = np.where(data_rawq['oancfy']>0, 1, 0)
+data_rawq['p_temp3'] = np.where(data_rawq['niq4']/data_rawq['atq']>data_rawq['niq4_l4']/data_rawq['atq_l4'], 1, 0)
+data_rawq['p_temp4'] = np.where(data_rawq['oancfy']>data_rawq['niq4'], 1, 0)
+# NOTE(review): reconstruction of a span garbled in transmission -- p_temp5
+# had lost its comparison and p_temp6 (plus the actq/lctq lags it needs) was
+# missing even though 'pscore' sums and drops it below. p_temp5 rewards a
+# falling leverage ratio, p_temp6 a rising current ratio (Piotroski-style).
+data_rawq['p_temp5'] = np.where(data_rawq['dlttq']/data_rawq['atq'] < data_rawq['dlttq_l4']/data_rawq['atq_l4'], 1, 0)
+data_rawq['actq_l4'] = data_rawq.groupby(['permno'])['actq'].shift(4)
+data_rawq['lctq_l4'] = data_rawq.groupby(['permno'])['lctq'].shift(4)
+data_rawq['p_temp6'] = np.where(data_rawq['actq']/data_rawq['lctq'] > data_rawq['actq_l4']/data_rawq['lctq_l4'], 1, 0)
+data_rawq['cogsq4_l4'] = data_rawq.groupby(['permno'])['cogsq4'].shift(4)
+# NOTE(review): gross margin needs (sale-cogs)/sale; the previous form
+# computed sale - cogs/sale because '/' binds tighter than '-'.
+data_rawq['p_temp7'] = np.where((data_rawq['saleq4']-data_rawq['cogsq4'])/data_rawq['saleq4']>(data_rawq['saleq4_l4']-data_rawq['cogsq4_l4'])/data_rawq['saleq4_l4'], 1, 0)
+data_rawq['p_temp8'] = np.where(data_rawq['saleq4']/data_rawq['atq']>data_rawq['saleq4_l4']/data_rawq['atq_l4'], 1, 0)
+data_rawq['p_temp9'] = np.where(data_rawq['scstkcy']==0, 1, 0)
+
+data_rawq['pscore'] = data_rawq['p_temp1']+data_rawq['p_temp2']+data_rawq['p_temp3']+data_rawq['p_temp4']\
+                      
+data_rawq['p_temp5']+data_rawq['p_temp6']+data_rawq['p_temp7']+data_rawq['p_temp8']\ + +data_rawq['p_temp9'] + +data_rawq = data_rawq.drop(['p_temp1', 'p_temp2', 'p_temp3', 'p_temp4', 'p_temp5', 'p_temp6', 'p_temp7', 'p_temp8', + 'p_temp9'], axis=1) + +####################################################################################################################### +# Momentum # +####################################################################################################################### +crsp_mom = conn.raw_sql(""" + select permno, date, ret, retx, prc, shrout, vol + from crsp.msf + where date >= '01/01/1959' + """) + +crsp_mom['permno'] = crsp_mom['permno'].astype(int) +crsp_mom['jdate'] = pd.to_datetime(crsp_mom['date']) + MonthEnd(0) +crsp_mom = crsp_mom.dropna(subset=['ret', 'retx', 'prc']) + +# add delisting return +dlret = conn.raw_sql(""" + select permno, dlret, dlstdt + from crsp.msedelist + """) + +dlret.permno = dlret.permno.astype(int) +dlret['dlstdt'] = pd.to_datetime(dlret['dlstdt']) +dlret['jdate'] = dlret['dlstdt'] + MonthEnd(0) + +# merge delisting return to crsp return +crsp_mom = pd.merge(crsp_mom, dlret, how='left', on=['permno', 'jdate']) +crsp_mom['dlret'] = crsp_mom['dlret'].fillna(0) +crsp_mom['ret'] = crsp_mom['ret'].fillna(0) +crsp_mom['retadj'] = (1 + crsp_mom['ret']) * (1 + crsp_mom['dlret']) - 1 +crsp_mom['me'] = crsp_mom['prc'].abs() * crsp_mom['shrout'] # calculate market equity + + +def mom(start, end, df): + """ + + :param start: Order of starting lag + :param end: Order of ending lag + :param df: Dataframe + :return: Momentum factor + """ + lag = pd.DataFrame() + result = 1 + for i in range(start, end): + lag['mom%s' % i] = df.groupby(['permno'])['ret'].shift(i) + result = result * (1+lag['mom%s' % i]) + result = result - 1 + return result + + +crsp_mom['mom60m'] = mom(12, 60, crsp_mom) +crsp_mom['mom12m'] = mom(1, 12, crsp_mom) +crsp_mom['mom1m'] = crsp_mom['ret'] +crsp_mom['mom6m'] = mom(1, 6, crsp_mom) 
+crsp_mom['mom36m'] = mom(1, 36, crsp_mom) +crsp_mom['seas1a'] = crsp_mom.groupby(['permno'])['ret'].shift(11) + +crsp_mom['vol_l1'] = crsp_mom.groupby(['permno'])['vol'].shift(1) +crsp_mom['vol_l2'] = crsp_mom.groupby(['permno'])['vol'].shift(2) +crsp_mom['vol_l3'] = crsp_mom.groupby(['permno'])['vol'].shift(3) +crsp_mom['prc_l2'] = crsp_mom.groupby(['permno'])['prc'].shift(2) +crsp_mom['dolvol'] = np.log(crsp_mom['vol_l2']*crsp_mom['prc_l2']).replace([np.inf, -np.inf], np.nan) +crsp_mom['turn'] = ((crsp_mom['vol_l1']+crsp_mom['vol_l2']+crsp_mom['vol_l3'])/3)/crsp_mom['shrout'] + +# dy +crsp_mom['me_l1'] = crsp_mom.groupby(['permno'])['me'].shift(1) +crsp_mom['retdy'] = crsp_mom['ret'] - crsp_mom['retx'] +crsp_mom['mdivpay'] = crsp_mom['retdy']*crsp_mom['me_l1'] + +crsp_mom['dy'] = ttm12(series='mdivpay', df=crsp_mom)/crsp_mom['me'] + +# def moms(start, end, df): +# """ +# +# :param start: Order of starting lag +# :param end: Order of ending lag +# :param df: Dataframe +# :return: Momentum factor +# """ +# lag = pd.DataFrame() +# result = 1 +# for i in range(start, end): +# lag['moms%s' % i] = df.groupby['permno']['ret'].shift(i) +# result = result + lag['moms%s' % i] +# result = result/11 +# return result +# +# +# crsp_mom['moms12m'] = moms(1, 12, crsp_mom) + +# populate the chars to monthly + +# data_rawa +data_rawa = data_rawa.drop(['date', 'ret', 'retx', 'me'], axis=1) +data_rawa = pd.merge(crsp_mom, data_rawa, how='left', on=['permno', 'jdate']) +data_rawa['datadate'] = data_rawa.groupby(['permno'])['datadate'].fillna(method='ffill') +data_rawa = data_rawa.groupby(['permno', 'datadate'], as_index=False).fillna(method='ffill') +data_rawa = data_rawa[((data_rawa['exchcd'] == 1) | (data_rawa['exchcd'] == 2) | (data_rawa['exchcd'] == 3)) & + ((data_rawa['shrcd'] == 10) | (data_rawa['shrcd'] == 11))] + +# data_rawq +data_rawq = data_rawq.drop(['date', 'ret', 'retx', 'me'], axis=1) +data_rawq = pd.merge(crsp_mom, data_rawq, how='left', on=['permno', 'jdate']) 
+data_rawq['datadate'] = data_rawq.groupby(['permno'])['datadate'].fillna(method='ffill') +data_rawq = data_rawq.groupby(['permno', 'datadate'], as_index=False).fillna(method='ffill') +data_rawq = data_rawq[((data_rawq['exchcd'] == 1) | (data_rawq['exchcd'] == 2) | (data_rawq['exchcd'] == 3)) & + ((data_rawq['shrcd'] == 10) | (data_rawq['shrcd'] == 11))] + +####################################################################################################################### +# Monthly ME # +####################################################################################################################### + +######################################## +# Annual # +######################################## + +# bm +data_rawa['bm'] = data_rawa['be'] / data_rawa['me'] + +# bm_ia +df_temp = data_rawa.groupby(['datadate', 'ffi49'], as_index=False)['bm'].mean() +df_temp = df_temp.rename(columns={'bm': 'bm_ind'}) +data_rawa = pd.merge(data_rawa, df_temp, how='left', on=['datadate', 'ffi49']) +data_rawa['bm_ia'] = data_rawa['bm']/data_rawa['bm_ind'] + +# me_ia +df_temp = data_rawa.groupby(['datadate', 'ffi49'], as_index=False)['me'].mean() +df_temp = df_temp.rename(columns={'me': 'me_ind'}) +data_rawa = pd.merge(data_rawa, df_temp, how='left', on=['datadate', 'ffi49']) +data_rawa['me_ia'] = data_rawa['me']/data_rawa['me_ind'] + +# cfp +condlist = [data_rawa['dp'].isnull(), + data_rawa['ib'].isnull()] +choicelist = [data_rawa['ib']/data_rawa['me'], + np.nan] +data_rawa['cfp'] = np.select(condlist, choicelist, default=(data_rawa['ib']+data_rawa['dp'])/data_rawa['me']) + +# ep +data_rawa['ep'] = data_rawa['ib']/data_rawa['me'] + +# rsup +# data_rawa['sale_l1'] = data_rawa.groupby(['permno'])['sale'].shift(1) +data_rawa['rsup'] = (data_rawa['sale']-data_rawa['sale_l1'])/data_rawa['me'] + +# lev +data_rawa['lev'] = data_rawa['lt']/data_rawa['me'] + +# sp +data_rawa['sp'] = data_rawa['sale']/data_rawa['me'] + +# rdm +data_rawa['rdm'] = data_rawa['xrd']/data_rawa['me'] + +# adm 
hxz adm +data_rawa['adm'] = data_rawa['xad']/data_rawa['me'] + +# dy +data_rawa['dy'] = data_rawa['dvt']/data_rawa['me'] + +# Annual Accounting Variables +chars_a = data_rawa[['cusip', 'ncusip', 'gvkey', 'permno', 'exchcd', 'shrcd', 'datadate', 'jdate', + 'sic', 'ret', 'retx', 'retadj', 'acc', 'agr', 'bm', 'cfp', 'ep', 'ni', 'op', + 'rsup', 'cash', 'chcsho', + 'rd', 'cashdebt', 'pctacc', 'gma', 'lev', 'rdm', 'adm', 'sgr', 'sp', 'invest', 'roe', + 'rd_sale', 'lgr', 'roa', 'depr', 'egr', 'chato', 'chtx', 'noa', 'rna', 'pm', 'ato', 'dy', + 'roic', 'chinv', 'pchsale_pchinvt', 'pchsale_pchrect', 'pchgm_pchsale', 'pchsale_pchxsga', + 'pchdepr', 'chadv', 'pchcapx', 'grcapx', 'grGW', 'currat', 'pchcurrat', 'quick', 'pchquick', + 'salecash', 'salerec', 'saleinv', 'pchsaleinv', 'realestate', 'obklg', 'chobklg', 'grltnoa', + 'conv', 'chdrc', 'rdbias', 'operprof', 'capxint', 'xadint', 'chpm', 'ala', 'alm', + 'mom1m', 'mom6m', 'mom12m', 'mom60m', 'mom36m', 'seas1a', 'me', 'hire', 'herf', 'bm_ia', + 'me_ia', 'turn', 'dolvol']] +chars_a.reset_index(drop=True, inplace=True) + +######################################## +# Quarterly # +######################################## +# bm +data_rawq['bm'] = data_rawq['beq']/data_rawq['me'] + +# cfp +data_rawq['cfp'] = np.where(data_rawq['dpq'].isnull(), + data_rawq['ibq4']/data_rawq['me'], + (data_rawq['ibq4']+data_rawq['dpq4'])/data_rawq['me']) + +# ep +data_rawq['ep'] = data_rawq['ibq4']/data_rawq['me'] + +# lev +data_rawq['lev'] = data_rawq['ltq']/data_rawq['me'] + +# rdm +data_rawq['rdm'] = data_rawq['xrdq4']/data_rawq['me'] + +# sp +data_rawq['sp'] = data_rawq['saleq4']/data_rawq['me'] + +# alm +data_rawq['alm'] = data_rawq['ala']/(data_rawq['atq']+data_rawq['me']-data_rawq['ceqq']) + +# rsup +# data_rawq['saleq_l4'] = data_rawq.groupby(['permno'])['saleq'].shift(4) +data_rawq['rsup'] = (data_rawq['saleq'] - data_rawq['saleq_l4'])/data_rawq['me'] + +# sgrvol +data_rawq['sgrvol'] = chars_std(0, 15, data_rawq, 'rsup') + +# Quarterly 
Accounting Variables +chars_q = data_rawq[['gvkey', 'permno', 'datadate', 'jdate', 'sic', 'exchcd', 'shrcd', + 'ret', 'retx', 'retadj', 'acc', 'bm', 'cfp', + 'ep', 'agr', 'ni', 'op', 'cash', 'chcsho', 'rd', 'cashdebt', 'pctacc', 'gma', 'lev', + 'rdm', 'sgr', 'sp', 'invest', 'rd_sale', 'lgr', 'roa', 'depr', 'egr', 'roe', + 'chato', 'chpm', 'chtx', 'noa', 'rna', 'pm', 'ato', 'stdcf', + 'grltnoa', 'ala', 'alm', 'rsup', 'stdacc', 'sgrvol', 'roavol', 'scf', 'cinvest', + 'mom1m', 'mom6m', 'mom12m', 'mom60m', 'mom36m', 'seas1a', 'me', 'pscore', 'nincr', + 'turn', 'dolvol']] +chars_q.reset_index(drop=True, inplace=True) + +with open('chars_a_60.pkl', 'wb') as f: + pkl.dump(chars_a, f) + +with open('chars_q_60.pkl', 'wb') as f: + pkl.dump(chars_q, f) diff --git a/char60/beta.py b/char60/beta.py new file mode 100755 index 0000000..ff5cca5 --- /dev/null +++ b/char60/beta.py @@ -0,0 +1,164 @@ +# Fama & French 3 factors residual variance +# Note: Please use the latest version of pandas, this version should support returning to pd.Series after rolling +# To get a faster speed, we split the big dataframe into small ones +# Then using different process to calculate the variance +# We use 20 process to calculate variance, you can change the number of process according to your CPU situation +# You can use the following code to check your CPU situation +# import multiprocessing +# multiprocessing.cpu_count() + +import pandas as pd +import numpy as np +import datetime as dt +import wrds +from dateutil.relativedelta import * +from pandas.tseries.offsets import * +import datetime +import pickle as pkl +import multiprocessing as mp + +################### +# Connect to WRDS # +################### +conn = wrds.Connection() + +# CRSP Block +crsp = conn.raw_sql(""" + select a.permno, a.date, a.ret, (a.ret - b.rf) as exret, b.mktrf, b.smb, b.hml + from crsp.dsf as a + left join ff.factors_daily as b + on a.date=b.date + where a.date > '01/01/1959' + """) + +# sort variables by permno and date 
+crsp = crsp.sort_values(by=['permno', 'date']) + +# change variable format to int +crsp['permno'] = crsp['permno'].astype(int) + +# Line up date to be end of month +crsp['date'] = pd.to_datetime(crsp['date']) + +# find the closest trading day to the end of the month +crsp['monthend'] = crsp['date'] + MonthEnd(0) +crsp['date_diff'] = crsp['monthend'] - crsp['date'] +date_temp = crsp.groupby(['permno', 'monthend'])['date_diff'].min() +date_temp = pd.DataFrame(date_temp) # convert Series to DataFrame +date_temp.reset_index(inplace=True) +date_temp.rename(columns={'date_diff': 'min_diff'}, inplace=True) +crsp = pd.merge(crsp, date_temp, how='left', on=['permno', 'monthend']) +crsp['sig'] = np.where(crsp['date_diff'] == crsp['min_diff'], 1, np.nan) + +# label every date of month end +crsp['month_count'] = crsp[crsp['sig'] == 1].groupby(['permno']).cumcount() + +# label numbers of months for a firm +month_num = crsp[crsp['sig'] == 1].groupby(['permno'])['month_count'].tail(1) +month_num = month_num.astype(int) +month_num = month_num.reset_index(drop=True) + +# mark the number of each month to each day of this month +crsp['month_count'] = crsp.groupby(['permno'])['month_count'].fillna(method='bfill') + +# crate a firm list +df_firm = crsp.drop_duplicates(['permno']) +df_firm = df_firm[['permno']] +df_firm['permno'] = df_firm['permno'].astype(int) +df_firm = df_firm.reset_index(drop=True) +df_firm = df_firm.reset_index() +df_firm = df_firm.rename(columns={'index': 'count'}) +df_firm['month_num'] = month_num + +###################### +# Calculate the beta # +###################### + + +def get_beta(df, firm_list): + """ + + :param df: stock dataframe + :param firm_list: list of firms matching stock dataframe + :return: dataframe with variance of residual + """ + for firm, count, prog in zip(firm_list['permno'], firm_list['month_num'], range(firm_list['permno'].count()+1)): + prog = prog + 1 + print('processing permno %s' % firm, '/', 'finished', '%.2f%%' % 
((prog/firm_list['permno'].count())*100)) + for i in range(count + 1): + # if you want to change the rolling window, please change here: i - 2 means 3 months is a window. + temp = df[(df['permno'] == firm) & (i - 2 <= df['month_count']) & (df['month_count'] <= i)] + # if observations in last 3 months are less 21, we drop the rvar of this month + if temp['permno'].count() < 21: + pass + else: + rolling_window = temp['permno'].count() + index = temp.tail(1).index + X = np.mat(temp[['mktrf']]) + Y = np.mat(temp[['exret']]) + ones = np.mat(np.ones(rolling_window)).T + M = np.identity(rolling_window) - ones.dot((ones.T.dot(ones)).I).dot(ones.T) + beta = (X.T.dot(M).dot(X)).I.dot((X.T.dot(M).dot(Y))) + df.loc[index, 'beta'] = beta + return df + + +def sub_df(start, end, step): + """ + + :param start: the quantile to start cutting, usually it should be 0 + :param end: the quantile to end cutting, usually it should be 1 + :param step: quantile step + :return: a dictionary including all the 'firm_list' dataframe and 'stock data' dataframe + """ + # we use dict to store different sub dataframe + temp = {} + for i, h in zip(np.arange(start, end, step), range(int((end-start)/step))): + print('processing splitting dataframe:', round(i, 2), 'to', round(i + step, 2)) + if i == 0: # to get the left point + temp['firm' + str(h)] = df_firm[df_firm['count'] <= df_firm['count'].quantile(i + step)] + temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', + on='permno').dropna(subset=['count']) + else: + temp['firm' + str(h)] = df_firm[(df_firm['count'].quantile(i) < df_firm['count']) & ( + df_firm['count'] <= df_firm['count'].quantile(i + step))] + temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', + on='permno').dropna(subset=['count']) + return temp + + +def main(start, end, step): + """ + + :param start: the quantile to start cutting, usually it should be 0 + :param end: the quantile to end cutting, usually it should be 1 + :param step: 
quantile step + :return: a dataframe with calculated variance of residual + """ + df = sub_df(start, end, step) + pool = mp.Pool() + p_dict = {} + for i in range(int((end-start)/step)): + p_dict['p' + str(i)] = pool.apply_async(get_beta, (df['crsp%s' % i], df['firm%s' % i],)) + pool.close() + pool.join() + result = pd.DataFrame() + print('processing pd.concat') + for h in range(int((end-start)/step)): + result = pd.concat([result, p_dict['p%s' % h].get()]) + return result + + +# calculate variance of residual through rolling window +# Note: please split dataframe according to your CPU situation. For example, we split dataframe to (1-0)/0.05 = 20 sub +# dataframes here, so the function will use 20 cores to calculate variance of residual. +if __name__ == '__main__': + crsp = main(0, 1, 0.05) + +# process dataframe +crsp = crsp.dropna(subset=['beta']) # drop NA due to rolling +crsp = crsp.reset_index(drop=True) +crsp = crsp[['permno', 'date', 'beta']] + +with open('beta.pkl', 'wb') as f: + pkl.dump(crsp, f) \ No newline at end of file diff --git a/char60/bid_ask_spread.py b/char60/bid_ask_spread.py new file mode 100755 index 0000000..5281099 --- /dev/null +++ b/char60/bid_ask_spread.py @@ -0,0 +1,160 @@ +# Fama & French 3 factors residual variance +# Note: Please use the latest version of pandas, this version should support returning to pd.Series after rolling +# To get a faster speed, we split the big dataframe into small ones +# Then using different process to calculate the variance +# We use 20 process to calculate variance, you can change the number of process according to your CPU situation +# You can use the following code to check your CPU situation +# import multiprocessing +# multiprocessing.cpu_count() + +import pandas as pd +import numpy as np +import datetime as dt +import wrds +from dateutil.relativedelta import * +from pandas.tseries.offsets import * +import datetime +import pickle as pkl +import multiprocessing as mp + +################### +# Connect to 
WRDS # +################### +conn = wrds.Connection() + +# CRSP Block +crsp = conn.raw_sql(""" + select a.permno, a.date, a.ret, (a.ret - b.rf) as exret, a.askhi, a.bidlo + from crsp.dsf as a + left join ff.factors_daily as b + on a.date=b.date + where a.date > '01/01/1959' + """) + +# sort variables by permno and date +crsp = crsp.sort_values(by=['permno', 'date']) + +# change variable format to int +crsp['permno'] = crsp['permno'].astype(int) + +# Line up date to be end of month +crsp['date'] = pd.to_datetime(crsp['date']) + +# find the closest trading day to the end of the month +crsp['monthend'] = crsp['date'] + MonthEnd(0) +crsp['date_diff'] = crsp['monthend'] - crsp['date'] +date_temp = crsp.groupby(['permno', 'monthend'])['date_diff'].min() +date_temp = pd.DataFrame(date_temp) # convert Series to DataFrame +date_temp.reset_index(inplace=True) +date_temp.rename(columns={'date_diff': 'min_diff'}, inplace=True) +crsp = pd.merge(crsp, date_temp, how='left', on=['permno', 'monthend']) +crsp['sig'] = np.where(crsp['date_diff'] == crsp['min_diff'], 1, np.nan) + +# label every date of month end +crsp['month_count'] = crsp[crsp['sig'] == 1].groupby(['permno']).cumcount() + +# label numbers of months for a firm +month_num = crsp[crsp['sig'] == 1].groupby(['permno'])['month_count'].tail(1) +month_num = month_num.astype(int) +month_num = month_num.reset_index(drop=True) + +# mark the number of each month to each day of this month +crsp['month_count'] = crsp.groupby(['permno'])['month_count'].fillna(method='bfill') + +# crate a firm list +df_firm = crsp.drop_duplicates(['permno']) +df_firm = df_firm[['permno']] +df_firm['permno'] = df_firm['permno'].astype(int) +df_firm = df_firm.reset_index(drop=True) +df_firm = df_firm.reset_index() +df_firm = df_firm.rename(columns={'index': 'count'}) +df_firm['month_num'] = month_num + +###################### +# Calculate residual # +###################### + + +def get_baspread(df, firm_list): + """ + + :param df: stock dataframe + 
:param firm_list: list of firms matching stock dataframe + :return: dataframe with variance of residual + """ + for firm, count, prog in zip(firm_list['permno'], firm_list['month_num'], range(firm_list['permno'].count()+1)): + prog = prog + 1 + print('processing permno %s' % firm, '/', 'finished', '%.2f%%' % ((prog/firm_list['permno'].count())*100)) + for i in range(count + 1): + # if you want to change the rolling window, please change here: i - 2 means 3 months is a window. + temp = df[(df['permno'] == firm) & (i - 2 <= df['month_count']) & (df['month_count'] <= i)] + if temp['permno'].count() < 21: + pass + else: + index = temp.tail(1).index + X = pd.DataFrame() + X[['askhi', 'bidlo']] = temp[['askhi', 'bidlo']] + bid = (X['askhi'] - X['bidlo'])/((X['askhi'] + X['bidlo'])/2).mean() + df.loc[index, 'baspread'] = bid + return df + + +def sub_df(start, end, step): + """ + + :param start: the quantile to start cutting, usually it should be 0 + :param end: the quantile to end cutting, usually it should be 1 + :param step: quantile step + :return: a dictionary including all the 'firm_list' dataframe and 'stock data' dataframe + """ + # we use dict to store different sub dataframe + temp = {} + for i, h in zip(np.arange(start, end, step), range(int((end-start)/step))): + print('processing splitting dataframe:', round(i, 2), 'to', round(i + step, 2)) + if i == 0: # to get the left point + temp['firm' + str(h)] = df_firm[df_firm['count'] <= df_firm['count'].quantile(i + step)] + temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', + on='permno').dropna(subset=['count']) + else: + temp['firm' + str(h)] = df_firm[(df_firm['count'].quantile(i) < df_firm['count']) & ( + df_firm['count'] <= df_firm['count'].quantile(i + step))] + temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', + on='permno').dropna(subset=['count']) + return temp + + +def main(start, end, step): + """ + + :param start: the quantile to start cutting, usually it 
should be 0 + :param end: the quantile to end cutting, usually it should be 1 + :param step: quantile step + :return: a dataframe with calculated variance of residual + """ + df = sub_df(start, end, step) + pool = mp.Pool() + p_dict = {} + for i in range(int((end-start)/step)): + p_dict['p' + str(i)] = pool.apply_async(get_baspread, (df['crsp%s' % i], df['firm%s' % i],)) + pool.close() + pool.join() + result = pd.DataFrame() + print('processing pd.concat') + for h in range(int((end-start)/step)): + result = pd.concat([result, p_dict['p%s' % h].get()]) + return result + + +# calculate variance of residual through rolling window +# Note: please split dataframe according to your CPU situation. For example, we split dataframe to (1-0)/0.05 = 20 sub +# dataframes here, so the function will use 20 cores to calculate variance of residual. +if __name__ == '__main__': + crsp = main(0, 1, 0.05) + +# process dataframe +crsp = crsp.dropna(subset=['baspread']) # drop NA due to rolling +crsp = crsp.reset_index(drop=True) +crsp = crsp[['permno', 'date', 'baspread']] + +with open('baspread.pkl', 'wb') as f: + pkl.dump(crsp, f) \ No newline at end of file diff --git a/char60/functions.py b/char60/functions.py new file mode 100755 index 0000000..34cd370 --- /dev/null +++ b/char60/functions.py @@ -0,0 +1,452 @@ +import pandas as pd +import pickle as pkl +import numpy as np +from tqdm import tqdm +import re + +def ffi49(df): + condlist = [((100 <= df['sic']) & (df['sic'] <= 199)) | ((200 <= df['sic']) & (df['sic'] <= 299)) | + ((700 <= df['sic']) & (df['sic'] <= 799)) | ((910 <= df['sic']) & (df['sic'] <= 919)) | + ((2048 <= df['sic']) & (df['sic'] <= 2048)), + ((2000 <= df['sic']) & (df['sic'] <= 2009)) | ((2010 <= df['sic']) & (df['sic'] <= 2019)) | + ((2020 <= df['sic']) & (df['sic'] <= 2029)) | ((2030 <= df['sic']) & (df['sic'] <= 2039)) | + ((2040 <= df['sic']) & (df['sic'] <= 2046)) | ((2050 <= df['sic']) & (df['sic'] <= 2059)) | + ((2060 <= df['sic']) & (df['sic'] <= 2063)) | 
((2070 <= df['sic']) & (df['sic'] <= 2079)) | + ((2090 <= df['sic']) & (df['sic'] <= 2092)) | ((2095 <= df['sic']) & (df['sic'] <= 2095)) | + ((2098 <= df['sic']) & (df['sic'] <= 2099)), + ((2064 <= df['sic']) & (df['sic'] <= 2068)) | ((2086 <= df['sic']) & (df['sic'] <= 2086)) | + ((2087 <= df['sic']) & (df['sic'] <= 2087)) | ((2096 <= df['sic']) & (df['sic'] <= 2096)) | + ((2097 <= df['sic']) & (df['sic'] <= 2097)), + ((2080 <= df['sic']) & (df['sic'] <= 2080)) | ((2082 <= df['sic']) & (df['sic'] <= 2082)) | + ((2083 <= df['sic']) & (df['sic'] <= 2083)) | ((2084 <= df['sic']) & (df['sic'] <= 2084)) | + ((2085 <= df['sic']) & (df['sic'] <= 2085)), + ((2100 <= df['sic']) & (df['sic'] <= 2199)), + ((920 <= df['sic']) & (df['sic'] <= 999)) | ((3650 <= df['sic']) & (df['sic'] <= 3651)) | + ((3652 <= df['sic']) & (df['sic'] <= 3652)) | ((3732 <= df['sic']) & (df['sic'] <= 3732)) | + ((3930 <= df['sic']) & (df['sic'] <= 3931)) | ((3940 <= df['sic']) & (df['sic'] <= 3949)), + ((7800 <= df['sic']) & (df['sic'] <= 7829)) | ((7830 <= df['sic']) & (df['sic'] <= 7833)) | + ((7840 <= df['sic']) & (df['sic'] <= 7841)) | ((7900 <= df['sic']) & (df['sic'] <= 7900)) | + ((7910 <= df['sic']) & (df['sic'] <= 7911)) | ((7920 <= df['sic']) & (df['sic'] <= 7929)) | + ((7930 <= df['sic']) & (df['sic'] <= 7933)) | ((7940 <= df['sic']) & (df['sic'] <= 7949)) | + ((7980 <= df['sic']) & (df['sic'] <= 7980)) | ((7990 <= df['sic']) & (df['sic'] <= 7999)), + ((2700 <= df['sic']) & (df['sic'] <= 2709)) | ((2710 <= df['sic']) & (df['sic'] <= 2719)) | + ((2720 <= df['sic']) & (df['sic'] <= 2729)) | ((2730 <= df['sic']) & (df['sic'] <= 2739)) | + ((2740 <= df['sic']) & (df['sic'] <= 2749)) | ((2770 <= df['sic']) & (df['sic'] <= 2771)) | + ((2780 <= df['sic']) & (df['sic'] <= 2789)) | ((2790 <= df['sic']) & (df['sic'] <= 2799)), + ((2047 <= df['sic']) & (df['sic'] <= 2047)) | ((2391 <= df['sic']) & (df['sic'] <= 2392)) | + ((2510 <= df['sic']) & (df['sic'] <= 2519)) | ((2590 <= df['sic']) & 
(df['sic'] <= 2599)) | + ((2840 <= df['sic']) & (df['sic'] <= 2843)) | ((2844 <= df['sic']) & (df['sic'] <= 2844)) | + ((3160 <= df['sic']) & (df['sic'] <= 3161)) | ((3170 <= df['sic']) & (df['sic'] <= 3171)) | + ((3172 <= df['sic']) & (df['sic'] <= 3172)) | ((3190 <= df['sic']) & (df['sic'] <= 3199)) | + ((3229 <= df['sic']) & (df['sic'] <= 3229)) | ((3260 <= df['sic']) & (df['sic'] <= 3260)) | + ((3262 <= df['sic']) & (df['sic'] <= 3263)) | ((3269 <= df['sic']) & (df['sic'] <= 3269)) | + ((3230 <= df['sic']) & (df['sic'] <= 3231)) | ((3630 <= df['sic']) & (df['sic'] <= 3639)) | + ((3750 <= df['sic']) & (df['sic'] <= 3751)) | ((3800 <= df['sic']) & (df['sic'] <= 3800)) | + ((3860 <= df['sic']) & (df['sic'] <= 3861)) | ((3870 <= df['sic']) & (df['sic'] <= 3873)) | + ((3910 <= df['sic']) & (df['sic'] <= 3911)) | ((3914 <= df['sic']) & (df['sic'] <= 3914)) | + ((3915 <= df['sic']) & (df['sic'] <= 3915)) | ((3960 <= df['sic']) & (df['sic'] <= 3962)) | + ((3991 <= df['sic']) & (df['sic'] <= 3991)) | ((3995 <= df['sic']) & (df['sic'] <= 3995)), + ((2300 <= df['sic']) & (df['sic'] <= 2390)) | ((3020 <= df['sic']) & (df['sic'] <= 3021)) | + ((3100 <= df['sic']) & (df['sic'] <= 3111)) | ((3130 <= df['sic']) & (df['sic'] <= 3131)) | + ((3140 <= df['sic']) & (df['sic'] <= 3149)) | ((3150 <= df['sic']) & (df['sic'] <= 3151)) | + ((3963 <= df['sic']) & (df['sic'] <= 3965)), + ((8000 <= df['sic']) & (df['sic'] <= 8099)), + ((3693 <= df['sic']) & (df['sic'] <= 3693)) | ((3840 <= df['sic']) & (df['sic'] <= 3849)) | + ((3850 <= df['sic']) & (df['sic'] <= 3851)), + ((2830 <= df['sic']) & (df['sic'] <= 2830)) | ((2831 <= df['sic']) & (df['sic'] <= 2831)) | + ((2833 <= df['sic']) & (df['sic'] <= 2833)) | ((2834 <= df['sic']) & (df['sic'] <= 2834)) | + ((2835 <= df['sic']) & (df['sic'] <= 2835)) | ((2836 <= df['sic']) & (df['sic'] <= 2836)), + ((2800 <= df['sic']) & (df['sic'] <= 2809)) | ((2810 <= df['sic']) & (df['sic'] <= 2819)) | + ((2820 <= df['sic']) & (df['sic'] <= 2829)) | 
((2850 <= df['sic']) & (df['sic'] <= 2859)) | + ((2860 <= df['sic']) & (df['sic'] <= 2869)) | ((2870 <= df['sic']) & (df['sic'] <= 2879)) | + ((2890 <= df['sic']) & (df['sic'] <= 2899)), + ((3031 <= df['sic']) & (df['sic'] <= 3031)) | ((3041 <= df['sic']) & (df['sic'] <= 3041)) | + ((3050 <= df['sic']) & (df['sic'] <= 3053)) | ((3060 <= df['sic']) & (df['sic'] <= 3069)) | + ((3070 <= df['sic']) & (df['sic'] <= 3079)) | ((3080 <= df['sic']) & (df['sic'] <= 3089)) | + ((3090 <= df['sic']) & (df['sic'] <= 3099)), + ((2200 <= df['sic']) & (df['sic'] <= 2269)) | ((2270 <= df['sic']) & (df['sic'] <= 2279)) | + ((2280 <= df['sic']) & (df['sic'] <= 2284)) | ((2290 <= df['sic']) & (df['sic'] <= 2295)) | + ((2297 <= df['sic']) & (df['sic'] <= 2297)) | ((2298 <= df['sic']) & (df['sic'] <= 2298)) | + ((2299 <= df['sic']) & (df['sic'] <= 2299)) | ((2393 <= df['sic']) & (df['sic'] <= 2395)) | + ((2397 <= df['sic']) & (df['sic'] <= 2399)), + ((800 <= df['sic']) & (df['sic'] <= 899)) | ((2400 <= df['sic']) & (df['sic'] <= 2439)) | + ((2450 <= df['sic']) & (df['sic'] <= 2459)) | ((2490 <= df['sic']) & (df['sic'] <= 2499)) | + ((2660 <= df['sic']) & (df['sic'] <= 2661)) | ((2950 <= df['sic']) & (df['sic'] <= 2952)) | + ((3200 <= df['sic']) & (df['sic'] <= 3200)) | ((3210 <= df['sic']) & (df['sic'] <= 3211)) | + ((3240 <= df['sic']) & (df['sic'] <= 3241)) | ((3250 <= df['sic']) & (df['sic'] <= 3259)) | + ((3261 <= df['sic']) & (df['sic'] <= 3261)) | ((3264 <= df['sic']) & (df['sic'] <= 3264)) | + ((3270 <= df['sic']) & (df['sic'] <= 3275)) | ((3280 <= df['sic']) & (df['sic'] <= 3281)) | + ((3290 <= df['sic']) & (df['sic'] <= 3293)) | ((3295 <= df['sic']) & (df['sic'] <= 3299)) | + ((3420 <= df['sic']) & (df['sic'] <= 3429)) | ((3430 <= df['sic']) & (df['sic'] <= 3433)) | + ((3440 <= df['sic']) & (df['sic'] <= 3441)) | ((3442 <= df['sic']) & (df['sic'] <= 3442)) | + ((3446 <= df['sic']) & (df['sic'] <= 3446)) | ((3448 <= df['sic']) & (df['sic'] <= 3448)) | + ((3449 <= df['sic']) & 
(df['sic'] <= 3449)) | ((3450 <= df['sic']) & (df['sic'] <= 3451)) | + ((3452 <= df['sic']) & (df['sic'] <= 3452)) | ((3490 <= df['sic']) & (df['sic'] <= 3499)) | + ((3996 <= df['sic']) & (df['sic'] <= 3996)), + ((1500 <= df['sic']) & (df['sic'] <= 1511)) | ((1520 <= df['sic']) & (df['sic'] <= 1529)) | + ((1530 <= df['sic']) & (df['sic'] <= 1539)) | ((1540 <= df['sic']) & (df['sic'] <= 1549)) | + ((1600 <= df['sic']) & (df['sic'] <= 1699)) | ((1700 <= df['sic']) & (df['sic'] <= 1799)), + ((3300 <= df['sic']) & (df['sic'] <= 3300)) | ((3310 <= df['sic']) & (df['sic'] <= 3317)) | + ((3320 <= df['sic']) & (df['sic'] <= 3325)) | ((3330 <= df['sic']) & (df['sic'] <= 3339)) | + ((3340 <= df['sic']) & (df['sic'] <= 3341)) | ((3350 <= df['sic']) & (df['sic'] <= 3357)) | + ((3360 <= df['sic']) & (df['sic'] <= 3369)) | ((3370 <= df['sic']) & (df['sic'] <= 3379)) | + ((3390 <= df['sic']) & (df['sic'] <= 3399)), + ((3400 <= df['sic']) & (df['sic'] <= 3400)) | ((3443 <= df['sic']) & (df['sic'] <= 3443)) | + ((3444 <= df['sic']) & (df['sic'] <= 3444)) | ((3460 <= df['sic']) & (df['sic'] <= 3469)) | + ((3470 <= df['sic']) & (df['sic'] <= 3479)), + ((3510 <= df['sic']) & (df['sic'] <= 3519)) | ((3520 <= df['sic']) & (df['sic'] <= 3529)) | + ((3530 <= df['sic']) & (df['sic'] <= 3530)) | ((3531 <= df['sic']) & (df['sic'] <= 3531)) | + ((3532 <= df['sic']) & (df['sic'] <= 3532)) | ((3533 <= df['sic']) & (df['sic'] <= 3533)) | + ((3534 <= df['sic']) & (df['sic'] <= 3534)) | ((3535 <= df['sic']) & (df['sic'] <= 3535)) | + ((3536 <= df['sic']) & (df['sic'] <= 3536)) | ((3538 <= df['sic']) & (df['sic'] <= 3538)) | + ((3540 <= df['sic']) & (df['sic'] <= 3549)) | ((3550 <= df['sic']) & (df['sic'] <= 3559)) | + ((3560 <= df['sic']) & (df['sic'] <= 3569)) | ((3580 <= df['sic']) & (df['sic'] <= 3580)) | + ((3581 <= df['sic']) & (df['sic'] <= 3581)) | ((3582 <= df['sic']) & (df['sic'] <= 3582)) | + ((3585 <= df['sic']) & (df['sic'] <= 3585)) | ((3586 <= df['sic']) & (df['sic'] <= 3586)) | + 
((3589 <= df['sic']) & (df['sic'] <= 3589)) | ((3590 <= df['sic']) & (df['sic'] <= 3599)), + ((3600 <= df['sic']) & (df['sic'] <= 3600)) | ((3610 <= df['sic']) & (df['sic'] <= 3613)) | + ((3620 <= df['sic']) & (df['sic'] <= 3621)) | ((3623 <= df['sic']) & (df['sic'] <= 3629)) | + ((3640 <= df['sic']) & (df['sic'] <= 3644)) | ((3645 <= df['sic']) & (df['sic'] <= 3645)) | + ((3646 <= df['sic']) & (df['sic'] <= 3646)) | ((3648 <= df['sic']) & (df['sic'] <= 3649)) | + ((3660 <= df['sic']) & (df['sic'] <= 3660)) | ((3690 <= df['sic']) & (df['sic'] <= 3690)) | + ((3691 <= df['sic']) & (df['sic'] <= 3692)) | ((3699 <= df['sic']) & (df['sic'] <= 3699)), + ((2296 <= df['sic']) & (df['sic'] <= 2296)) | ((2396 <= df['sic']) & (df['sic'] <= 2396)) | + ((3010 <= df['sic']) & (df['sic'] <= 3011)) | ((3537 <= df['sic']) & (df['sic'] <= 3537)) | + ((3647 <= df['sic']) & (df['sic'] <= 3647)) | ((3694 <= df['sic']) & (df['sic'] <= 3694)) | + ((3700 <= df['sic']) & (df['sic'] <= 3700)) | ((3710 <= df['sic']) & (df['sic'] <= 3710)) | + ((3711 <= df['sic']) & (df['sic'] <= 3711)) | ((3713 <= df['sic']) & (df['sic'] <= 3713)) | + ((3714 <= df['sic']) & (df['sic'] <= 3714)) | ((3715 <= df['sic']) & (df['sic'] <= 3715)) | + ((3716 <= df['sic']) & (df['sic'] <= 3716)) | ((3792 <= df['sic']) & (df['sic'] <= 3792)) | + ((3790 <= df['sic']) & (df['sic'] <= 3791)) | ((3799 <= df['sic']) & (df['sic'] <= 3799)), + ((3720 <= df['sic']) & (df['sic'] <= 3720)) | ((3721 <= df['sic']) & (df['sic'] <= 3721)) | + ((3723 <= df['sic']) & (df['sic'] <= 3724)) | ((3725 <= df['sic']) & (df['sic'] <= 3725)) | + ((3728 <= df['sic']) & (df['sic'] <= 3729)), + ((3730 <= df['sic']) & (df['sic'] <= 3731)) | ((3740 <= df['sic']) & (df['sic'] <= 3743)), + ((3760 <= df['sic']) & (df['sic'] <= 3769)) | ((3795 <= df['sic']) & (df['sic'] <= 3795)) | + ((3480 <= df['sic']) & (df['sic'] <= 3489)), + ((1040 <= df['sic']) & (df['sic'] <= 1049)), + ((1000 <= df['sic']) & (df['sic'] <= 1009)) | ((1010 <= df['sic']) & 
(df['sic'] <= 1019)) | + ((1020 <= df['sic']) & (df['sic'] <= 1029)) | ((1030 <= df['sic']) & (df['sic'] <= 1039)) | + ((1050 <= df['sic']) & (df['sic'] <= 1059)) | ((1060 <= df['sic']) & (df['sic'] <= 1069)) | + ((1070 <= df['sic']) & (df['sic'] <= 1079)) | ((1080 <= df['sic']) & (df['sic'] <= 1089)) | + ((1090 <= df['sic']) & (df['sic'] <= 1099)) | ((1100 <= df['sic']) & (df['sic'] <= 1119)) | + ((1400 <= df['sic']) & (df['sic'] <= 1499)), + ((1200 <= df['sic']) & (df['sic'] <= 1299)), + ((1300 <= df['sic']) & (df['sic'] <= 1300)) | ((1310 <= df['sic']) & (df['sic'] <= 1319)) | + ((1320 <= df['sic']) & (df['sic'] <= 1329)) | ((1330 <= df['sic']) & (df['sic'] <= 1339)) | + ((1370 <= df['sic']) & (df['sic'] <= 1379)) | ((1380 <= df['sic']) & (df['sic'] <= 1380)) | + ((1381 <= df['sic']) & (df['sic'] <= 1381)) | ((1382 <= df['sic']) & (df['sic'] <= 1382)) | + ((1389 <= df['sic']) & (df['sic'] <= 1389)) | ((2900 <= df['sic']) & (df['sic'] <= 2912)) | + ((2990 <= df['sic']) & (df['sic'] <= 2999)), + ((4900 <= df['sic']) & (df['sic'] <= 4900)) | ((4910 <= df['sic']) & (df['sic'] <= 4911)) | + ((4920 <= df['sic']) & (df['sic'] <= 4922)) | ((4923 <= df['sic']) & (df['sic'] <= 4923)) | + ((4924 <= df['sic']) & (df['sic'] <= 4925)) | ((4930 <= df['sic']) & (df['sic'] <= 4931)) | + ((4932 <= df['sic']) & (df['sic'] <= 4932)) | ((4939 <= df['sic']) & (df['sic'] <= 4939)) | + ((4940 <= df['sic']) & (df['sic'] <= 4942)), + ((4800 <= df['sic']) & (df['sic'] <= 4800)) | ((4810 <= df['sic']) & (df['sic'] <= 4813)) | + ((4820 <= df['sic']) & (df['sic'] <= 4822)) | ((4830 <= df['sic']) & (df['sic'] <= 4839)) | + ((4840 <= df['sic']) & (df['sic'] <= 4841)) | ((4880 <= df['sic']) & (df['sic'] <= 4889)) | + ((4890 <= df['sic']) & (df['sic'] <= 4890)) | ((4891 <= df['sic']) & (df['sic'] <= 4891)) | + ((4892 <= df['sic']) & (df['sic'] <= 4892)) | ((4899 <= df['sic']) & (df['sic'] <= 4899)), + ((7020 <= df['sic']) & (df['sic'] <= 7021)) | ((7030 <= df['sic']) & (df['sic'] <= 7033)) | + 
((7200 <= df['sic']) & (df['sic'] <= 7200)) | ((7210 <= df['sic']) & (df['sic'] <= 7212)) | + ((7214 <= df['sic']) & (df['sic'] <= 7214)) | ((7215 <= df['sic']) & (df['sic'] <= 7216)) | + ((7217 <= df['sic']) & (df['sic'] <= 7217)) | ((7219 <= df['sic']) & (df['sic'] <= 7219)) | + ((7220 <= df['sic']) & (df['sic'] <= 7221)) | ((7230 <= df['sic']) & (df['sic'] <= 7231)) | + ((7240 <= df['sic']) & (df['sic'] <= 7241)) | ((7250 <= df['sic']) & (df['sic'] <= 7251)) | + ((7260 <= df['sic']) & (df['sic'] <= 7269)) | ((7270 <= df['sic']) & (df['sic'] <= 7290)) | + ((7291 <= df['sic']) & (df['sic'] <= 7291)) | ((7292 <= df['sic']) & (df['sic'] <= 7299)) | + ((7395 <= df['sic']) & (df['sic'] <= 7395)) | ((7500 <= df['sic']) & (df['sic'] <= 7500)) | + ((7520 <= df['sic']) & (df['sic'] <= 7529)) | ((7530 <= df['sic']) & (df['sic'] <= 7539)) | + ((7540 <= df['sic']) & (df['sic'] <= 7549)) | ((7600 <= df['sic']) & (df['sic'] <= 7600)) | + ((7620 <= df['sic']) & (df['sic'] <= 7620)) | ((7622 <= df['sic']) & (df['sic'] <= 7622)) | + ((7623 <= df['sic']) & (df['sic'] <= 7623)) | ((7629 <= df['sic']) & (df['sic'] <= 7629)) | + ((7630 <= df['sic']) & (df['sic'] <= 7631)) | ((7640 <= df['sic']) & (df['sic'] <= 7641)) | + ((7690 <= df['sic']) & (df['sic'] <= 7699)) | ((8100 <= df['sic']) & (df['sic'] <= 8199)) | + ((8200 <= df['sic']) & (df['sic'] <= 8299)) | ((8300 <= df['sic']) & (df['sic'] <= 8399)) | + ((8400 <= df['sic']) & (df['sic'] <= 8499)) | ((8600 <= df['sic']) & (df['sic'] <= 8699)) | + ((8800 <= df['sic']) & (df['sic'] <= 8899)) | ((7510 <= df['sic']) & (df['sic'] <= 7515)), + ((2750 <= df['sic']) & (df['sic'] <= 2759)) | ((3993 <= df['sic']) & (df['sic'] <= 3993)) | + ((7218 <= df['sic']) & (df['sic'] <= 7218)) | ((7300 <= df['sic']) & (df['sic'] <= 7300)) | + ((7310 <= df['sic']) & (df['sic'] <= 7319)) | ((7320 <= df['sic']) & (df['sic'] <= 7329)) | + ((7330 <= df['sic']) & (df['sic'] <= 7339)) | ((7340 <= df['sic']) & (df['sic'] <= 7342)) | + ((7349 <= df['sic']) & 
(df['sic'] <= 7349)) | ((7350 <= df['sic']) & (df['sic'] <= 7351)) | + ((7352 <= df['sic']) & (df['sic'] <= 7352)) | ((7353 <= df['sic']) & (df['sic'] <= 7353)) | + ((7359 <= df['sic']) & (df['sic'] <= 7359)) | ((7360 <= df['sic']) & (df['sic'] <= 7369)) | + ((7374 <= df['sic']) & (df['sic'] <= 7374)) | ((7376 <= df['sic']) & (df['sic'] <= 7376)) | + ((7377 <= df['sic']) & (df['sic'] <= 7377)) | ((7378 <= df['sic']) & (df['sic'] <= 7378)) | + ((7379 <= df['sic']) & (df['sic'] <= 7379)) | ((7380 <= df['sic']) & (df['sic'] <= 7380)) | + ((7381 <= df['sic']) & (df['sic'] <= 7382)) | ((7383 <= df['sic']) & (df['sic'] <= 7383)) | + ((7384 <= df['sic']) & (df['sic'] <= 7384)) | ((7385 <= df['sic']) & (df['sic'] <= 7385)) | + ((7389 <= df['sic']) & (df['sic'] <= 7390)) | ((7391 <= df['sic']) & (df['sic'] <= 7391)) | + ((7392 <= df['sic']) & (df['sic'] <= 7392)) | ((7393 <= df['sic']) & (df['sic'] <= 7393)) | + ((7394 <= df['sic']) & (df['sic'] <= 7394)) | ((7396 <= df['sic']) & (df['sic'] <= 7396)) | + ((7397 <= df['sic']) & (df['sic'] <= 7397)) | ((7399 <= df['sic']) & (df['sic'] <= 7399)) | + ((7519 <= df['sic']) & (df['sic'] <= 7519)) | ((8700 <= df['sic']) & (df['sic'] <= 8700)) | + ((8710 <= df['sic']) & (df['sic'] <= 8713)) | ((8720 <= df['sic']) & (df['sic'] <= 8721)) | + ((8730 <= df['sic']) & (df['sic'] <= 8734)) | ((8740 <= df['sic']) & (df['sic'] <= 8748)) | + ((8900 <= df['sic']) & (df['sic'] <= 8910)) | ((8911 <= df['sic']) & (df['sic'] <= 8911)) | + ((8920 <= df['sic']) & (df['sic'] <= 8999)) | ((4220 <= df['sic']) & (df['sic'] <= 4229)), + ((3570 <= df['sic']) & (df['sic'] <= 3579)) | ((3680 <= df['sic']) & (df['sic'] <= 3680)) | + ((3681 <= df['sic']) & (df['sic'] <= 3681)) | ((3682 <= df['sic']) & (df['sic'] <= 3682)) | + ((3683 <= df['sic']) & (df['sic'] <= 3683)) | ((3684 <= df['sic']) & (df['sic'] <= 3684)) | + ((3685 <= df['sic']) & (df['sic'] <= 3685)) | ((3686 <= df['sic']) & (df['sic'] <= 3686)) | + ((3687 <= df['sic']) & (df['sic'] <= 3687)) | 
((3688 <= df['sic']) & (df['sic'] <= 3688)) | + ((3689 <= df['sic']) & (df['sic'] <= 3689)) | ((3695 <= df['sic']) & (df['sic'] <= 3695)), + ((7370 <= df['sic']) & (df['sic'] <= 7372)) | ((7375 <= df['sic']) & (df['sic'] <= 7375)) | + ((7373 <= df['sic']) & (df['sic'] <= 7373)), + ((3622 <= df['sic']) & (df['sic'] <= 3622)) | ((3661 <= df['sic']) & (df['sic'] <= 3661)) | + ((3662 <= df['sic']) & (df['sic'] <= 3662)) | ((3663 <= df['sic']) & (df['sic'] <= 3663)) | + ((3664 <= df['sic']) & (df['sic'] <= 3664)) | ((3665 <= df['sic']) & (df['sic'] <= 3665)) | + ((3666 <= df['sic']) & (df['sic'] <= 3666)) | ((3669 <= df['sic']) & (df['sic'] <= 3669)) | + ((3670 <= df['sic']) & (df['sic'] <= 3679)) | ((3810 <= df['sic']) & (df['sic'] <= 3810)) | + ((3812 <= df['sic']) & (df['sic'] <= 3812)), + ((3811 <= df['sic']) & (df['sic'] <= 3811)) | ((3820 <= df['sic']) & (df['sic'] <= 3820)) | + ((3821 <= df['sic']) & (df['sic'] <= 3821)) | ((3822 <= df['sic']) & (df['sic'] <= 3822)) | + ((3823 <= df['sic']) & (df['sic'] <= 3823)) | ((3824 <= df['sic']) & (df['sic'] <= 3824)) | + ((3825 <= df['sic']) & (df['sic'] <= 3825)) | ((3826 <= df['sic']) & (df['sic'] <= 3826)) | + ((3827 <= df['sic']) & (df['sic'] <= 3827)) | ((3829 <= df['sic']) & (df['sic'] <= 3829)) | + ((3830 <= df['sic']) & (df['sic'] <= 3839)), + ((2520 <= df['sic']) & (df['sic'] <= 2549)) | ((2600 <= df['sic']) & (df['sic'] <= 2639)) | + ((2670 <= df['sic']) & (df['sic'] <= 2699)) | ((2760 <= df['sic']) & (df['sic'] <= 2761)) | + ((3950 <= df['sic']) & (df['sic'] <= 3955)), + ((2440 <= df['sic']) & (df['sic'] <= 2449)) | ((2640 <= df['sic']) & (df['sic'] <= 2659)) | + ((3220 <= df['sic']) & (df['sic'] <= 3221)) | ((3410 <= df['sic']) & (df['sic'] <= 3412)), + ((4000 <= df['sic']) & (df['sic'] <= 4013)) | ((4040 <= df['sic']) & (df['sic'] <= 4049)) | + ((4100 <= df['sic']) & (df['sic'] <= 4100)) | ((4110 <= df['sic']) & (df['sic'] <= 4119)) | + ((4120 <= df['sic']) & (df['sic'] <= 4121)) | ((4130 <= df['sic']) & 
(df['sic'] <= 4131)) | + ((4140 <= df['sic']) & (df['sic'] <= 4142)) | ((4150 <= df['sic']) & (df['sic'] <= 4151)) | + ((4170 <= df['sic']) & (df['sic'] <= 4173)) | ((4190 <= df['sic']) & (df['sic'] <= 4199)) | + ((4200 <= df['sic']) & (df['sic'] <= 4200)) | ((4210 <= df['sic']) & (df['sic'] <= 4219)) | + ((4230 <= df['sic']) & (df['sic'] <= 4231)) | ((4240 <= df['sic']) & (df['sic'] <= 4249)) | + ((4400 <= df['sic']) & (df['sic'] <= 4499)) | ((4500 <= df['sic']) & (df['sic'] <= 4599)) | + ((4600 <= df['sic']) & (df['sic'] <= 4699)) | ((4700 <= df['sic']) & (df['sic'] <= 4700)) | + ((4710 <= df['sic']) & (df['sic'] <= 4712)) | ((4720 <= df['sic']) & (df['sic'] <= 4729)) | + ((4730 <= df['sic']) & (df['sic'] <= 4739)) | ((4740 <= df['sic']) & (df['sic'] <= 4749)) | + ((4780 <= df['sic']) & (df['sic'] <= 4780)) | ((4782 <= df['sic']) & (df['sic'] <= 4782)) | + ((4783 <= df['sic']) & (df['sic'] <= 4783)) | ((4784 <= df['sic']) & (df['sic'] <= 4784)) | + ((4785 <= df['sic']) & (df['sic'] <= 4785)) | ((4789 <= df['sic']) & (df['sic'] <= 4789)), + ((5000 <= df['sic']) & (df['sic'] <= 5000)) | ((5010 <= df['sic']) & (df['sic'] <= 5015)) | + ((5020 <= df['sic']) & (df['sic'] <= 5023)) | ((5030 <= df['sic']) & (df['sic'] <= 5039)) | + ((5040 <= df['sic']) & (df['sic'] <= 5042)) | ((5043 <= df['sic']) & (df['sic'] <= 5043)) | + ((5044 <= df['sic']) & (df['sic'] <= 5044)) | ((5045 <= df['sic']) & (df['sic'] <= 5045)) | + ((5046 <= df['sic']) & (df['sic'] <= 5046)) | ((5047 <= df['sic']) & (df['sic'] <= 5047)) | + ((5048 <= df['sic']) & (df['sic'] <= 5048)) | ((5049 <= df['sic']) & (df['sic'] <= 5049)) | + ((5050 <= df['sic']) & (df['sic'] <= 5059)) | ((5060 <= df['sic']) & (df['sic'] <= 5060)) | + ((5063 <= df['sic']) & (df['sic'] <= 5063)) | ((5064 <= df['sic']) & (df['sic'] <= 5064)) | + ((5065 <= df['sic']) & (df['sic'] <= 5065)) | ((5070 <= df['sic']) & (df['sic'] <= 5078)) | + ((5080 <= df['sic']) & (df['sic'] <= 5080)) | ((5081 <= df['sic']) & (df['sic'] <= 5081)) | + 
((5082 <= df['sic']) & (df['sic'] <= 5082)) | ((5083 <= df['sic']) & (df['sic'] <= 5083)) | + ((5084 <= df['sic']) & (df['sic'] <= 5084)) | ((5085 <= df['sic']) & (df['sic'] <= 5085)) | + ((5086 <= df['sic']) & (df['sic'] <= 5087)) | ((5088 <= df['sic']) & (df['sic'] <= 5088)) | + ((5090 <= df['sic']) & (df['sic'] <= 5090)) | ((5091 <= df['sic']) & (df['sic'] <= 5092)) | + ((5093 <= df['sic']) & (df['sic'] <= 5093)) | ((5094 <= df['sic']) & (df['sic'] <= 5094)) | + ((5099 <= df['sic']) & (df['sic'] <= 5099)) | ((5100 <= df['sic']) & (df['sic'] <= 5100)) | + ((5110 <= df['sic']) & (df['sic'] <= 5113)) | ((5120 <= df['sic']) & (df['sic'] <= 5122)) | + ((5130 <= df['sic']) & (df['sic'] <= 5139)) | ((5140 <= df['sic']) & (df['sic'] <= 5149)) | + ((5150 <= df['sic']) & (df['sic'] <= 5159)) | ((5160 <= df['sic']) & (df['sic'] <= 5169)) | + ((5170 <= df['sic']) & (df['sic'] <= 5172)) | ((5180 <= df['sic']) & (df['sic'] <= 5182)) | + ((5190 <= df['sic']) & (df['sic'] <= 5199)), + ((5200 <= df['sic']) & (df['sic'] <= 5200)) | ((5210 <= df['sic']) & (df['sic'] <= 5219)) | + ((5220 <= df['sic']) & (df['sic'] <= 5229)) | ((5230 <= df['sic']) & (df['sic'] <= 5231)) | + ((5250 <= df['sic']) & (df['sic'] <= 5251)) | ((5260 <= df['sic']) & (df['sic'] <= 5261)) | + ((5270 <= df['sic']) & (df['sic'] <= 5271)) | ((5300 <= df['sic']) & (df['sic'] <= 5300)) | + ((5310 <= df['sic']) & (df['sic'] <= 5311)) | ((5320 <= df['sic']) & (df['sic'] <= 5320)) | + ((5330 <= df['sic']) & (df['sic'] <= 5331)) | ((5334 <= df['sic']) & (df['sic'] <= 5334)) | + ((5340 <= df['sic']) & (df['sic'] <= 5349)) | ((5390 <= df['sic']) & (df['sic'] <= 5399)) | + ((5400 <= df['sic']) & (df['sic'] <= 5400)) | ((5410 <= df['sic']) & (df['sic'] <= 5411)) | + ((5412 <= df['sic']) & (df['sic'] <= 5412)) | ((5420 <= df['sic']) & (df['sic'] <= 5429)) | + ((5430 <= df['sic']) & (df['sic'] <= 5439)) | ((5440 <= df['sic']) & (df['sic'] <= 5449)) | + ((5450 <= df['sic']) & (df['sic'] <= 5459)) | ((5460 <= df['sic']) & 
(df['sic'] <= 5469)) | + ((5490 <= df['sic']) & (df['sic'] <= 5499)) | ((5500 <= df['sic']) & (df['sic'] <= 5500)) | + ((5510 <= df['sic']) & (df['sic'] <= 5529)) | ((5530 <= df['sic']) & (df['sic'] <= 5539)) | + ((5540 <= df['sic']) & (df['sic'] <= 5549)) | ((5550 <= df['sic']) & (df['sic'] <= 5559)) | + ((5560 <= df['sic']) & (df['sic'] <= 5569)) | ((5570 <= df['sic']) & (df['sic'] <= 5579)) | + ((5590 <= df['sic']) & (df['sic'] <= 5599)) | ((5600 <= df['sic']) & (df['sic'] <= 5699)) | + ((5700 <= df['sic']) & (df['sic'] <= 5700)) | ((5710 <= df['sic']) & (df['sic'] <= 5719)) | + ((5720 <= df['sic']) & (df['sic'] <= 5722)) | ((5730 <= df['sic']) & (df['sic'] <= 5733)) | + ((5734 <= df['sic']) & (df['sic'] <= 5734)) | ((5735 <= df['sic']) & (df['sic'] <= 5735)) | + ((5736 <= df['sic']) & (df['sic'] <= 5736)) | ((5750 <= df['sic']) & (df['sic'] <= 5799)) | + ((5900 <= df['sic']) & (df['sic'] <= 5900)) | ((5910 <= df['sic']) & (df['sic'] <= 5912)) | + ((5920 <= df['sic']) & (df['sic'] <= 5929)) | ((5930 <= df['sic']) & (df['sic'] <= 5932)) | + ((5940 <= df['sic']) & (df['sic'] <= 5940)) | ((5941 <= df['sic']) & (df['sic'] <= 5941)) | + ((5942 <= df['sic']) & (df['sic'] <= 5942)) | ((5943 <= df['sic']) & (df['sic'] <= 5943)) | + ((5944 <= df['sic']) & (df['sic'] <= 5944)) | ((5945 <= df['sic']) & (df['sic'] <= 5945)) | + ((5946 <= df['sic']) & (df['sic'] <= 5946)) | ((5947 <= df['sic']) & (df['sic'] <= 5947)) | + ((5948 <= df['sic']) & (df['sic'] <= 5948)) | ((5949 <= df['sic']) & (df['sic'] <= 5949)) | + ((5950 <= df['sic']) & (df['sic'] <= 5959)) | ((5960 <= df['sic']) & (df['sic'] <= 5969)) | + ((5970 <= df['sic']) & (df['sic'] <= 5979)) | ((5980 <= df['sic']) & (df['sic'] <= 5989)) | + ((5990 <= df['sic']) & (df['sic'] <= 5990)) | ((5992 <= df['sic']) & (df['sic'] <= 5992)) | + ((5993 <= df['sic']) & (df['sic'] <= 5993)) | ((5994 <= df['sic']) & (df['sic'] <= 5994)) | + ((5995 <= df['sic']) & (df['sic'] <= 5995)) | ((5999 <= df['sic']) & (df['sic'] <= 5999)), + 
((5800 <= df['sic']) & (df['sic'] <= 5819)) | ((5820 <= df['sic']) & (df['sic'] <= 5829)) | + ((5890 <= df['sic']) & (df['sic'] <= 5899)) | ((7000 <= df['sic']) & (df['sic'] <= 7000)) | + ((7010 <= df['sic']) & (df['sic'] <= 7019)) | ((7040 <= df['sic']) & (df['sic'] <= 7049)) | + ((7213 <= df['sic']) & (df['sic'] <= 7213)), + ((6000 <= df['sic']) & (df['sic'] <= 6000)) | ((6010 <= df['sic']) & (df['sic'] <= 6019)) | + ((6020 <= df['sic']) & (df['sic'] <= 6020)) | ((6021 <= df['sic']) & (df['sic'] <= 6021)) | + ((6022 <= df['sic']) & (df['sic'] <= 6022)) | ((6023 <= df['sic']) & (df['sic'] <= 6024)) | + ((6025 <= df['sic']) & (df['sic'] <= 6025)) | ((6026 <= df['sic']) & (df['sic'] <= 6026)) | + ((6027 <= df['sic']) & (df['sic'] <= 6027)) | ((6028 <= df['sic']) & (df['sic'] <= 6029)) | + ((6030 <= df['sic']) & (df['sic'] <= 6036)) | ((6040 <= df['sic']) & (df['sic'] <= 6059)) | + ((6060 <= df['sic']) & (df['sic'] <= 6062)) | ((6080 <= df['sic']) & (df['sic'] <= 6082)) | + ((6090 <= df['sic']) & (df['sic'] <= 6099)) | ((6100 <= df['sic']) & (df['sic'] <= 6100)) | + ((6110 <= df['sic']) & (df['sic'] <= 6111)) | ((6112 <= df['sic']) & (df['sic'] <= 6113)) | + ((6120 <= df['sic']) & (df['sic'] <= 6129)) | ((6130 <= df['sic']) & (df['sic'] <= 6139)) | + ((6140 <= df['sic']) & (df['sic'] <= 6149)) | ((6150 <= df['sic']) & (df['sic'] <= 6159)) | + ((6160 <= df['sic']) & (df['sic'] <= 6169)) | ((6170 <= df['sic']) & (df['sic'] <= 6179)) | + ((6190 <= df['sic']) & (df['sic'] <= 6199)), + ((6300 <= df['sic']) & (df['sic'] <= 6300)) | ((6310 <= df['sic']) & (df['sic'] <= 6319)) | + ((6320 <= df['sic']) & (df['sic'] <= 6329)) | ((6330 <= df['sic']) & (df['sic'] <= 6331)) | + ((6350 <= df['sic']) & (df['sic'] <= 6351)) | ((6360 <= df['sic']) & (df['sic'] <= 6361)) | + ((6370 <= df['sic']) & (df['sic'] <= 6379)) | ((6390 <= df['sic']) & (df['sic'] <= 6399)) | + ((6400 <= df['sic']) & (df['sic'] <= 6411)), + ((6500 <= df['sic']) & (df['sic'] <= 6500)) | ((6510 <= df['sic']) & 
def fillna_atq(df_q, df_a):
    """Fill missing quarterly characteristics with their annual counterparts.

    For every column that has at least one NaN in ``df_q`` and also exists in
    ``df_a``, merge the annual value on (permno, date) and use it wherever the
    quarterly value is missing.  Momentum columns (``mom*``) are skipped, since
    momentum characteristics are identical in the annual and quarterly files.

    :param df_q: quarterly characteristics, keyed by ['permno', 'date']
    :param df_a: annual characteristics, keyed by ['permno', 'date']
    :return: ``df_q`` with NaNs replaced by annual values where available
    """
    # find columns that contain NaN in df_q and also exist in df_a
    q_na_cols = df_q.columns[df_q.isna().any()].tolist()
    candidates = set(q_na_cols) & set(df_a.columns.values.tolist())
    # momentum chars are the same in annual and quarterly data -- do not fill
    na_columns = [c for c in candidates if re.match(r'mom.', c) is None]
    # pull the annual values plus the merge keys
    annual = df_a[na_columns].copy()
    annual[['permno', 'date']] = df_a[['permno', 'date']].copy()
    # suffix annual columns as '<char>_a' so the merge cannot collide
    annual = annual.rename(columns={c: '%s_a' % c for c in na_columns})
    annual = annual.reset_index(drop=True)
    # use annual chars to fill quarterly NaNs; the quarterly value wins
    df_q = pd.merge(df_q, annual, how='left', on=['permno', 'date'])
    for c in na_columns:
        df_q[c] = np.where(df_q[c].isnull(), df_q['%s_a' % c], df_q[c])
        df_q = df_q.drop(['%s_a' % c], axis=1)
    return df_q


def fillna_ind(df, method, ffi):
    """Fill missing characteristics with their industry mean/median.

    Missing values are replaced by the cross-sectional mean or median of the
    same Fama-French industry ('ffi<n>') in the same month.

    :param df: characteristics panel with 'date' and 'ffi<n>' columns
    :param method: 'mean' or 'median'
    :param ffi: number of Fama-French industries (e.g. 49); selects the
        'ffi<n>' grouping column
    :raises ValueError: if ``method`` is neither 'mean' nor 'median'
    :return: ``df`` with industry-level fills applied
    """
    if method not in ('mean', 'median'):
        # the original silently fell through ('else: None') and later crashed
        # with a NameError; fail fast with a clear message instead
        raise ValueError("method must be 'mean' or 'median', got %r" % (method,))
    group_col = 'ffi%s' % ffi
    na_columns = df.columns[df.isna().any()].tolist()
    if not na_columns:
        # nothing to fill (the original crashed on this input)
        return df
    fills = []
    for col in na_columns:
        grouped = getattr(df.groupby(['date', group_col])[col], method)()
        fills.append(grouped.rename('%s_%s' % (col, method)))
    # the groupby keys come back as a (date, ffi) MultiIndex; reset_index
    # turns them straight into columns -- no string-parsing of the index
    df_fill = pd.concat(fills, axis=1).reset_index()
    df_fill['date'] = pd.to_datetime(df_fill['date'])
    # BUG FIX vs. original: the grouping column was hard-coded as 'ffi49';
    # use the requested industry classification instead
    df_fill[group_col] = df_fill[group_col].astype(int)
    # fill NaNs from the per-industry statistic, then drop the helper columns
    df = pd.merge(df, df_fill, how='left', on=['date', group_col])
    for col in na_columns:
        df[col] = df[col].fillna(df['%s_%s' % (col, method)])
        df = df.drop(['%s_%s' % (col, method)], axis=1)
    return df


def fillna_all(df, method):
    """Fill missing characteristics with the month-wide mean/median.

    Like ``fillna_ind`` but pools all firms in the same month instead of
    grouping by industry.

    :param df: characteristics panel with a 'date' column
    :param method: 'mean' or 'median'
    :raises ValueError: if ``method`` is neither 'mean' nor 'median'
    :return: ``df`` with month-level fills applied
    """
    if method not in ('mean', 'median'):
        # fail fast instead of the original's silent 'else: None' fallthrough
        raise ValueError("method must be 'mean' or 'median', got %r" % (method,))
    na_columns = df.columns[df.isna().any()].tolist()
    if not na_columns:
        return df
    fills = []
    for col in na_columns:
        grouped = getattr(df.groupby(['date'])[col], method)()
        fills.append(grouped.rename('%s_%s' % (col, method)))
    # reset_index materializes the 'date' key directly as a column
    df_fill = pd.concat(fills, axis=1).reset_index()
    df_fill['date'] = pd.to_datetime(df_fill['date'])
    df = pd.merge(df, df_fill, how='left', on='date')
    for col in na_columns:
        df[col] = df[col].fillna(df['%s_%s' % (col, method)])
        df = df.drop(['%s_%s' % (col, method)], axis=1)
    return df


def standardize(df):
    """Rank-standardize every characteristic into [-1, 1] month by month.

    Each characteristic is dense-ranked within its month and mapped linearly
    onto [-1, 1]; remaining NaNs (missing observations, or months with a
    single distinct value) become 0.  Identifier/return columns are left
    untouched.

    :param df: merged characteristics panel
    :return: panel with each characteristic column replaced by 'rank_<char>'
    """
    # information columns that must not be standardized
    # (original listed 'date' twice -- deduplicated here)
    info_cols = ['permno', 'date', 'datadate', 'gvkey', 'sic', 'count', 'exchcd',
                 'shrcd', 'ffi49', 'ret', 'retadj', 'retx', 'lag_me']
    col_names = list(set(df.columns.values.tolist()).difference(set(info_cols)))
    # NOTE: the original wrapped this loop in tqdm (third-party progress bar);
    # the print below keeps the progress feedback without that dependency
    for col_name in col_names:
        print('processing %s' % col_name)
        # number of distinct non-missing values per month -> rank denominator
        unique_count = df.dropna(subset=[col_name]).groupby(['date'])[col_name].unique().apply(len)
        unique_count = pd.DataFrame(unique_count).reset_index()
        unique_count.columns = ['date', 'count']
        df = pd.merge(df, unique_count, how='left', on=['date'])
        # dense rank r in [1, count], mapped linearly onto [-1, 1]
        df['%s_rank' % col_name] = df.groupby(['date'])[col_name].rank(method='dense')
        df['rank_%s' % col_name] = (df['%s_rank' % col_name] - 1) / (df['count'] - 1) * 2 - 1
        df = df.drop(['%s_rank' % col_name, col_name, 'count'], axis=1)
    df = df.fillna(0)
    return df
= 0 (best link) to Score = 6 (worst link) +# +# More explanation on score system: +# - 0: BEST match: using (cusip, cusip dates and company names) +# or (exchange ticker, company names and 6-digit cusip) +# - 1: Cusips and cusip dates match but company names do not match +# - 2: Cusips and company names match but cusip dates do not match +# - 3: Cusips match but cusip dates and company names do not match +# - 4: tickers and 6-digit cusips match but company names do not match +# - 5: tickers and company names match but 6-digit cusips do not match +# - 6: tickers match but company names and 6-digit cusips do not match + +################### +# Connect to WRDS # +################### +conn = wrds.Connection() + +######################### +# Step 1: Link by CUSIP # +######################### + +# 1.1 IBES: Get the list of IBES Tickers for US firms in IBES +_ibes1 = conn.raw_sql(""" + select ticker, cusip, cname, sdates from ibes.id + where usfirm=1 and cusip != '' + """) + +# Create first and last 'start dates' for a given cusip +# Use agg min and max to find the first and last date per group +# then rename to fdate and ldate respectively + +_ibes1_date = _ibes1.groupby(['ticker','cusip']).sdates.agg(['min', 'max'])\ +.reset_index().rename(columns={'min':'fdate', 'max':'ldate'}) + +# merge fdate ldate back to _ibes1 data +_ibes2 = pd.merge(_ibes1, _ibes1_date,how='left', on =['ticker','cusip']) +_ibes2 = _ibes2.sort_values(by=['ticker','cusip','sdates']) + +# keep only the most recent company name +# determined by having sdates = ldate +_ibes2 = _ibes2.loc[_ibes2.sdates == _ibes2.ldate].drop(['sdates'], axis=1) + +# 1.2 CRSP: Get all permno-ncusip combinations +_crsp1 = conn.raw_sql(""" + select permno, ncusip, comnam, namedt, nameenddt + from crsp.stocknames + where ncusip != '' + """) + +# first namedt +_crsp1_fnamedt = _crsp1.groupby(['permno','ncusip']).namedt.min().reset_index() + +# last nameenddt +_crsp1_lnameenddt = 
_crsp1.groupby(['permno','ncusip']).nameenddt.max().reset_index() + +# merge both +_crsp1_dtrange = pd.merge(_crsp1_fnamedt, _crsp1_lnameenddt, \ + on = ['permno','ncusip'], how='inner') + +# replace namedt and nameenddt with the version from the dtrange +_crsp1 = _crsp1.drop(['namedt'],axis=1).rename(columns={'nameenddt':'enddt'}) +_crsp2 = pd.merge(_crsp1, _crsp1_dtrange, on =['permno','ncusip'], how='inner') + +# keep only most recent company name +_crsp2 = _crsp2.loc[_crsp2.enddt ==_crsp2.nameenddt].drop(['enddt'], axis=1) + +# 1.3 Create CUSIP Link Table + +# Link by full cusip, company names and dates +_link1_1 = pd.merge(_ibes2, _crsp2, how='inner', left_on='cusip', right_on='ncusip')\ +.sort_values(['ticker','permno','ldate']) + +# Keep link with most recent company name +_link1_1_tmp = _link1_1.groupby(['ticker','permno']).ldate.max().reset_index() +_link1_2 = pd.merge(_link1_1, _link1_1_tmp, how='inner', on =['ticker', 'permno', 'ldate']) + + +# Calculate name matching ratio using FuzzyWuzzy + +# Note: fuzz ratio = 100 -> match perfectly +# fuzz ratio = 0 -> do not match at all + +# Comment: token_set_ratio is more flexible in matching the strings: +# fuzz.token_set_ratio('AMAZON.COM INC', 'AMAZON COM INC') +# returns value of 100 + +# fuzz.ratio('AMAZON.COM INC', 'AMAZON COM INC') +# returns value of 93 + +_link1_2['name_ratio'] = _link1_2.apply(lambda x: fuzz.token_set_ratio(x.comnam, x.cname), axis=1) + +# Note on parameters: +# The following parameters are chosen to mimic the SAS macro %iclink +# In %iclink, name_dist < 30 is assigned score = 0 +# where name_dist=30 is roughly 90% percentile in total distribution +# and higher name_dist means more different names. 
+# In name_ratio, I mimic this by choosing 10% percentile as cutoff to assign +# score = 0 + +# 10% percentile of the company name distance +name_ratio_p10 = _link1_2.name_ratio.quantile(0.10) + +# Function to assign score for companies matched by: +# full cusip and passing name_ratio +# or meeting date range requirement + +def score1(row): + if (row['fdate']<=row['nameenddt']) & (row['ldate']>=row['namedt']) & (row['name_ratio'] >= name_ratio_p10): + score = 0 + elif (row['fdate']<=row['nameenddt']) & (row['ldate']>=row['namedt']): + score = 1 + elif row['name_ratio'] >= name_ratio_p10: + score = 2 + else: + score = 3 + return score + +# assign size portfolio +_link1_2['score']=_link1_2.apply(score1, axis=1) +_link1_2 = _link1_2[['ticker','permno','cname','comnam','name_ratio','score']] +_link1_2 = _link1_2.drop_duplicates() + +########################## +# Step 2: Link by TICKER # +########################## + +# Find links for the remaining unmatched cases using Exchange Ticker + +# Identify remaining unmatched cases +_nomatch1 = pd.merge(_ibes2[['ticker']], _link1_2[['permno','ticker']], on='ticker', how='left') +_nomatch1 = _nomatch1.loc[_nomatch1.permno.isnull()].drop(['permno'], axis=1).drop_duplicates() + +# Add IBES identifying information + +ibesid = conn.raw_sql(""" select ticker, cname, oftic, sdates, cusip from ibes.id """) +ibesid = ibesid.loc[ibesid.oftic.notna()] + +_nomatch2 = pd.merge(_nomatch1, ibesid, how='inner', on=['ticker']) + +# Create first and last 'start dates' for Exchange Tickers +# Label date range variables and keep only most recent company name + +_nomatch3 = _nomatch2.groupby(['ticker', 'oftic']).sdates.agg(['min', 'max'])\ +.reset_index().rename(columns={'min':'fdate', 'max':'ldate'}) + +_nomatch3 = pd.merge(_nomatch2, _nomatch3, how='left', on=['ticker','oftic']) + +_nomatch3 = _nomatch3.loc[_nomatch3.sdates == _nomatch3.ldate] + +# Get entire list of CRSP stocks with Exchange Ticker information + +_crsp_n1 = conn.raw_sql(""" 
select ticker, comnam, permno, ncusip, namedt, nameenddt + from crsp.stocknames """) + +_crsp_n1 = _crsp_n1.loc[_crsp_n1.ticker.notna()].sort_values(by=['permno','ticker','namedt']) + +# Arrange effective dates for link by Exchange Ticker + +_crsp_n1_namedt = _crsp_n1.groupby(['permno','ticker']).namedt.min().reset_index().rename(columns={'min':'namedt'}) +_crsp_n1_nameenddt = _crsp_n1.groupby(['permno','ticker']).nameenddt.max().reset_index().rename(columns={'max':'nameenddt'}) + +_crsp_n1_dt = pd.merge(_crsp_n1_namedt, _crsp_n1_nameenddt, how = 'inner', on=['permno','ticker']) + +_crsp_n1 = _crsp_n1.rename(columns={'namedt': 'namedt_ind', 'nameenddt':'nameenddt_ind'}) + +_crsp_n2 = pd.merge(_crsp_n1, _crsp_n1_dt, how ='left', on = ['permno','ticker']) + +_crsp_n2 = _crsp_n2.rename(columns={'ticker':'crsp_ticker'}) +_crsp_n2 = _crsp_n2.loc[_crsp_n2.nameenddt_ind == _crsp_n2.nameenddt].drop(['namedt_ind', 'nameenddt_ind'], axis=1) + +# Merge remaining unmatched cases using Exchange Ticker +# Note: Use ticker date ranges as exchange tickers are reused overtime + +_link2_1 = pd.merge(_nomatch3, _crsp_n2, how='inner', left_on=['oftic'], right_on=['crsp_ticker']) +_link2_1 = _link2_1.loc[(_link2_1.ldate>=_link2_1.namedt) & (_link2_1.fdate<=_link2_1.nameenddt)] + + +# Score using company name using 6-digit CUSIP and company name spelling distance +_link2_1['name_ratio'] = _link2_1.apply(lambda x: fuzz.token_set_ratio(x.comnam, x.cname), axis=1) + +_link2_2 = _link2_1 +_link2_2['cusip6'] = _link2_2.apply(lambda x: x.cusip[:6], axis=1) +_link2_2['ncusip6'] = _link2_2.apply(lambda x: x.ncusip[:6], axis=1) + +# Score using company name using 6-digit CUSIP and company name spelling distance + +def score2(row): + if (row['cusip6']==row['ncusip6']) & (row['name_ratio'] >= name_ratio_p10): + score = 0 + elif (row['cusip6']==row['ncusip6']): + score = 4 + elif row['name_ratio'] >= name_ratio_p10: + score = 5 + else: + score = 6 + return score + +# assign size portfolio 
+_link2_2['score']=_link2_2.apply(score2, axis=1)
+
+# Some companies may have more than one TICKER-PERMNO link
+# so re-sort and keep the case (PERMNO & Company name from CRSP)
+# that gives the lowest score for each IBES TICKER
+
+_link2_2 = _link2_2[['ticker','permno','cname','comnam', 'name_ratio', 'score']].sort_values(by=['ticker','score'])
+_link2_2_score = _link2_2.groupby(['ticker']).score.min().reset_index()
+
+_link2_3 = pd.merge(_link2_2, _link2_2_score, how='inner', on=['ticker', 'score'])
+_link2_3 = _link2_3[['ticker','permno','cname','comnam','score']].drop_duplicates()
+
+#####################################
+# Step 3: Finalize Links and Scores #
+#####################################
+# Combine the output from both linking procedures. Store the output data for future usage
+
+iclink = _link1_2.append(_link2_3)
+
+# Storing iclink for other program usage
+import pickle as pkl
+
+with open('iclink.pkl', 'wb') as f:
+    pkl.dump(iclink, f)
\ No newline at end of file
diff --git a/char60/ill.py b/char60/ill.py
new file mode 100755
index 0000000..df1f871
--- /dev/null
+++ b/char60/ill.py
@@ -0,0 +1,158 @@
+# Illiquidity (ill): Amihud-style price-impact measure from CRSP daily data
+# Note: Please use the latest version of pandas, this version should support returning to pd.Series after rolling
+# To get a faster speed, we split the big dataframe into small ones
+# Then using different process to calculate the illiquidity measure
+# We use 20 process to calculate illiquidity, you can change the number of process according to your CPU situation
+# You can use the following code to check your CPU situation
+# import multiprocessing
+# multiprocessing.cpu_count()
+
+import pandas as pd
+import numpy as np
+import datetime as dt
+import wrds
+from dateutil.relativedelta import *
+from pandas.tseries.offsets import *
+import datetime
+import pickle as pkl
+import multiprocessing as mp
+
+###################
+# Connect to WRDS #
+###################
+conn = wrds.Connection()
+
+# CRSP Block
+crsp = 
conn.raw_sql(""" + select a.permno, a.date, a.ret, a.vol, a.prc + from crsp.dsf as a + where a.date > '01/01/1959' + """) + +# sort variables by permno and date +crsp = crsp.sort_values(by=['permno', 'date']) + +# change variable format to int +crsp['permno'] = crsp['permno'].astype(int) + +# Line up date to be end of month +crsp['date'] = pd.to_datetime(crsp['date']) + +# find the closest trading day to the end of the month +crsp['monthend'] = crsp['date'] + MonthEnd(0) +crsp['date_diff'] = crsp['monthend'] - crsp['date'] +date_temp = crsp.groupby(['permno', 'monthend'])['date_diff'].min() +date_temp = pd.DataFrame(date_temp) # convert Series to DataFrame +date_temp.reset_index(inplace=True) +date_temp.rename(columns={'date_diff': 'min_diff'}, inplace=True) +crsp = pd.merge(crsp, date_temp, how='left', on=['permno', 'monthend']) +crsp['sig'] = np.where(crsp['date_diff'] == crsp['min_diff'], 1, np.nan) + +# label every date of month end +crsp['month_count'] = crsp[crsp['sig'] == 1].groupby(['permno']).cumcount() + +# label numbers of months for a firm +month_num = crsp[crsp['sig'] == 1].groupby(['permno'])['month_count'].tail(1) +month_num = month_num.astype(int) +month_num = month_num.reset_index(drop=True) + +# mark the number of each month to each day of this month +crsp['month_count'] = crsp.groupby(['permno'])['month_count'].fillna(method='bfill') + +# crate a firm list +df_firm = crsp.drop_duplicates(['permno']) +df_firm = df_firm[['permno']] +df_firm['permno'] = df_firm['permno'].astype(int) +df_firm = df_firm.reset_index(drop=True) +df_firm = df_firm.reset_index() +df_firm = df_firm.rename(columns={'index': 'count'}) +df_firm['month_num'] = month_num + +###################### +# Calculate residual # +###################### + + +def get_baspread(df, firm_list): + """ + + :param df: stock dataframe + :param firm_list: list of firms matching stock dataframe + :return: dataframe with variance of residual + """ + for firm, count, prog in zip(firm_list['permno'], 
firm_list['month_num'], range(firm_list['permno'].count()+1)):
+        prog = prog + 1
+        print('processing permno %s' % firm, '/', 'finished', '%.2f%%' % ((prog/firm_list['permno'].count())*100))
+        for i in range(count + 1):
+            # if you want to change the rolling window, please change here: i - 2 means 3 months is a window.
+            temp = df[(df['permno'] == firm) & (i - 2 <= df['month_count']) & (df['month_count'] <= i)]
+            if temp['permno'].count() < 21:
+                pass
+            else:
+                index = temp.tail(1).index
+                X = pd.DataFrame()
+                X[['vol', 'prc', 'ret']] = temp[['vol', 'prc', 'ret']]
+                ill = (abs(X['ret']) / (abs(X['prc'])*X['vol'])).mean()  # Amihud: |ret| / dollar volume (fixed precedence)
+                df.loc[index, 'ill'] = ill
+    return df
+
+
+def sub_df(start, end, step):
+    """
+
+    :param start: the quantile to start cutting, usually it should be 0
+    :param end: the quantile to end cutting, usually it should be 1
+    :param step: quantile step
+    :return: a dictionary including all the 'firm_list' dataframe and 'stock data' dataframe
+    """
+    # we use dict to store different sub dataframe
+    temp = {}
+    for i, h in zip(np.arange(start, end, step), range(int((end-start)/step))):
+        print('processing splitting dataframe:', round(i, 2), 'to', round(i + step, 2))
+        if i == 0:  # to get the left point
+            temp['firm' + str(h)] = df_firm[df_firm['count'] <= df_firm['count'].quantile(i + step)]
+            temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left',
+                                             on='permno').dropna(subset=['count'])
+        else:
+            temp['firm' + str(h)] = df_firm[(df_firm['count'].quantile(i) < df_firm['count']) & (
+                    df_firm['count'] <= df_firm['count'].quantile(i + step))]
+            temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left',
+                                             on='permno').dropna(subset=['count'])
+    return temp
+
+
+def main(start, end, step):
+    """
+
+    :param start: the quantile to start cutting, usually it should be 0
+    :param end: the quantile to end cutting, usually it should be 1
+    :param step: quantile step
+    :return: a dataframe with the calculated illiquidity measure
+    """
+    df 
= sub_df(start, end, step) + pool = mp.Pool() + p_dict = {} + for i in range(int((end-start)/step)): + p_dict['p' + str(i)] = pool.apply_async(get_baspread, (df['crsp%s' % i], df['firm%s' % i],)) + pool.close() + pool.join() + result = pd.DataFrame() + print('processing pd.concat') + for h in range(int((end-start)/step)): + result = pd.concat([result, p_dict['p%s' % h].get()]) + return result + + +# calculate variance of residual through rolling window +# Note: please split dataframe according to your CPU situation. For example, we split dataframe to (1-0)/0.05 = 20 sub +# dataframes here, so the function will use 20 cores to calculate variance of residual. +if __name__ == '__main__': + crsp = main(0, 1, 0.05) + +# process dataframe +crsp = crsp.dropna(subset=['ill']) # drop NA due to rolling +crsp = crsp.reset_index(drop=True) +crsp = crsp[['permno', 'date', 'ill']] + +with open('ill.pkl', 'wb') as f: + pkl.dump(crsp, f) \ No newline at end of file diff --git a/char60/impute_rank_output_bchmk_60.py b/char60/impute_rank_output_bchmk_60.py new file mode 100755 index 0000000..dd7a242 --- /dev/null +++ b/char60/impute_rank_output_bchmk_60.py @@ -0,0 +1,164 @@ +import pandas as pd +import pickle as pkl +import numpy as np +from tqdm import tqdm +from functions import * + +#################### +# All Stocks # +#################### +with open('chars_q_raw.pkl', 'rb') as f: + chars_q = pkl.load(f) + +chars_q = chars_q.dropna(subset=['permno']) +chars_q[['permno', 'gvkey']] = chars_q[['permno', 'gvkey']].astype(int) +chars_q['jdate'] = pd.to_datetime(chars_q['jdate']) +chars_q = chars_q.drop_duplicates(['permno', 'jdate']) + +with open('chars_a_raw.pkl', 'rb') as f: + chars_a = pkl.load(f) + +chars_a = chars_a.dropna(subset=['permno']) +chars_a[['permno', 'gvkey']] = chars_a[['permno', 'gvkey']].astype(int) +chars_a['jdate'] = pd.to_datetime(chars_a['jdate']) +chars_a = chars_a.drop_duplicates(['permno', 'jdate']) + +# information list +obs_var_list = ['gvkey', 'permno', 
'jdate', 'sic', 'ret', 'retx', 'retadj', 'exchcd', 'shrcd'] +# characteristics with quarterly and annual frequency at the same time +accounting_var_list = ['datadate', 'acc', 'bm', 'agr', 'alm', 'ato', 'cash', 'cashdebt', 'cfp', 'chcsho', 'chpm', + 'chtx', 'depr', 'ep', 'gma', 'grltnoa', 'lev', 'lgr', 'ni', 'noa', 'op', 'pctacc', 'pm', + 'rd_sale', 'rdm', 'rna', 'roa', 'roe', 'rsup', 'sgr', 'sp'] +a_var_list = ['a_'+i for i in accounting_var_list] +q_var_list = ['q_'+i for i in accounting_var_list] +# annual frequency only list +a_only_list = ['adm', 'bm_ia', 'herf', 'hire', 'me_ia'] +# quarterly frequency only list +q_only_list = ['abr', 'sue', 'cinvest', 'nincr', 'pscore', + # 'turn', 'dolvol' + ] +# monthly frequency only list +m_var_list = ['baspread', 'beta', 'ill', 'maxret', 'mom12m', 'mom1m', 'mom36m', 'mom60m', 'mom6m', 're', 'rvar_capm', + 'rvar_ff3', 'rvar_mean', 'seas1a', 'std_dolvol', 'std_turn', 'zerotrade', 'me', 'dy', + 'turn', 'dolvol' # need to rerun the accounting to put them in to char_a + ] + +df_a = chars_a[obs_var_list + accounting_var_list + a_only_list + m_var_list] +df_a.columns = obs_var_list + a_var_list + a_only_list + m_var_list +df_a = df_a.sort_values(obs_var_list) + +df_q = chars_q[obs_var_list + accounting_var_list + q_only_list] +df_q.columns = obs_var_list + q_var_list + q_only_list +# drop the same information columns for merging +df_q = df_q.drop(['sic', 'ret', 'retx', 'retadj', 'exchcd', 'shrcd'], axis=1) + +df = df_a.merge(df_q, how='left', on=['gvkey', 'jdate', 'permno']) + +# first element in accounting_var_list is datadate +for i in tqdm(accounting_var_list[1:]): + print('processing %s' % i) + a = 'a_'+i + q = 'q_'+i + t1 = 'tmp1_'+i + t2 = 'tmp2_'+i + t3 = 'tmp3_'+i + t4 = 'tmp4_'+i + t5 = 'tmp5_'+i + + # tmp1: if the annual variable is available + df[t1] = np.where(df[a].isna(), False, True) + # tmp2: if the quarterly variable is available + df[t2] = np.where(df[q].isna(), False, True) + # tmp3: both + df[t3] = df[t1] & 
df[t2] + # tmp4: latest one + df[t4] = np.where(df['q_datadate'] < df['a_datadate'], df[a], df[q]) + # available one + df[t5] = np.where(df[t1], df[a], df[q]) + # final + df[i] = np.where(df[t3], df[t4], df[t5]) + df = df.drop([a, q, t1, t2, t3, t4, t5], axis=1) + +# drop the datadate of different frequency +df = df.drop(['a_datadate', 'q_datadate'], axis=1) + +# drop optional variables, you can adjust it by your selection +df = df.drop(['ret', 'retx'], axis=1) +df = df.rename(columns={'retadj': 'ret'}) # retadj is return adjusted by dividend +df['ret'] = df.groupby(['permno'])['ret'].shift(-1) # we shift return in t period to t+1 for prediction +df['date'] = df.groupby(['permno'])['jdate'].shift(-1) # date is return date, jdate is predictor date +df = df.drop(['jdate'], axis=1) # now we only keep the date of return +df = df.dropna(subset=['ret']).reset_index(drop=True) + +# save raw data +with open('chars60_raw_no_impute.pkl', 'wb') as f: + pkl.dump(df, f, protocol=4) + +# impute missing values, you can choose different func form functions.py, such as ffi49/ffi10 +df_impute = df.copy() +df_impute['sic'] = df_impute['sic'].astype(int) +df_impute['date'] = pd.to_datetime(df_impute['date']) + +df_impute['ffi49'] = ffi49(df_impute) +df_impute['ffi49'] = df_impute['ffi49'].fillna(49) # we treat na in ffi49 as 'other' +df_impute['ffi49'] = df_impute['ffi49'].astype(int) + +# there are two ways to impute: industrial median or mean +df_impute = fillna_ind(df_impute, method='median', ffi=49) + +df_impute = fillna_all(df_impute, method='median') +df_impute['re'] = df_impute['re'].fillna(0) # re use IBES database, there are lots of missing data + +df_impute['year'] = df_impute['date'].dt.year +df_impute = df_impute[df_impute['year'] >= 1972] +df_impute = df_impute.drop(['year'], axis=1) + +with open('chars60_raw_imputed.pkl', 'wb') as f: + pkl.dump(df_impute, f, protocol=4) + +# standardize raw data +df_rank = df.copy() +df_rank['lag_me'] = df_rank['me'] +df_rank = 
standardize(df_rank) +df_rank['year'] = df_rank['date'].dt.year +df_rank = df_rank[df_rank['year'] >= 1972] +df_rank = df_rank.drop(['year'], axis=1) +df_rank['log_me'] = np.log(df_rank['lag_me']) + +with open('chars60_rank_no_impute.pkl', 'wb') as f: + pkl.dump(df_rank, f, protocol=4) + +# standardize imputed data +df_rank = df_impute.copy() +df_rank['lag_me'] = df_rank['me'] +df_rank = standardize(df_rank) +df_rank['year'] = df_rank['date'].dt.year +df_rank = df_rank[df_rank['year'] >= 1972] +df_rank = df_rank.drop(['year'], axis=1) +df_rank['log_me'] = np.log(df_rank['lag_me']) + +with open('chars60_rank_imputed.pkl', 'wb') as f: + pkl.dump(df_rank, f, protocol=4) + + +#################### +# SP1500 # +#################### +with open('/home/jianxinma/chars/data/sp1500_impute_benchmark.pkl', 'rb') as f: + sp1500_index = pkl.load(f) + +sp1500_index = sp1500_index[['gvkey', 'date']] + +sp1500_impute = pd.merge(sp1500_index, df_impute, how='left', on=['gvkey', 'date']) + +# for test +# test = sp1500_rank.groupby(['jdate'])['gvkey'].nunique() + +with open('sp1500_impute_60.pkl', 'wb') as f: + pkl.dump(sp1500_impute, f, protocol=4) + +# standardize characteristics +sp1500_rank = pd.merge(sp1500_index, df_rank, how='left', on=['gvkey', 'date']) + +with open('sp1500_rank_60.pkl', 'wb') as f: + pkl.dump(sp1500_rank, f, protocol=4) diff --git a/char60/maxret_d.py b/char60/maxret_d.py new file mode 100755 index 0000000..69d208f --- /dev/null +++ b/char60/maxret_d.py @@ -0,0 +1,158 @@ +# Fama & French 3 factors residual variance +# Note: Please use the latest version of pandas, this version should support returning to pd.Series after rolling +# To get a faster speed, we split the big dataframe into small ones +# Then using different process to calculate the variance +# We use 20 process to calculate variance, you can change the number of process according to your CPU situation +# You can use the following code to check your CPU situation +# import multiprocessing +# 
multiprocessing.cpu_count() + +import pandas as pd +import numpy as np +import datetime as dt +import wrds +from dateutil.relativedelta import * +from pandas.tseries.offsets import * +import datetime +import pickle as pkl +import multiprocessing as mp + +################### +# Connect to WRDS # +################### +conn = wrds.Connection() + +# CRSP Block +crsp = conn.raw_sql(""" + select a.permno, a.date, a.ret + from crsp.dsf as a + where a.date > '01/01/1959' + """) + +# sort variables by permno and date +crsp = crsp.sort_values(by=['permno', 'date']) + +# change variable format to int +crsp['permno'] = crsp['permno'].astype(int) + +# Line up date to be end of month +crsp['date'] = pd.to_datetime(crsp['date']) + +# find the closest trading day to the end of the month +crsp['monthend'] = crsp['date'] + MonthEnd(0) +crsp['date_diff'] = crsp['monthend'] - crsp['date'] +date_temp = crsp.groupby(['permno', 'monthend'])['date_diff'].min() +date_temp = pd.DataFrame(date_temp) # convert Series to DataFrame +date_temp.reset_index(inplace=True) +date_temp.rename(columns={'date_diff': 'min_diff'}, inplace=True) +crsp = pd.merge(crsp, date_temp, how='left', on=['permno', 'monthend']) +crsp['sig'] = np.where(crsp['date_diff'] == crsp['min_diff'], 1, np.nan) + +# label every date of month end +crsp['month_count'] = crsp[crsp['sig'] == 1].groupby(['permno']).cumcount() + +# label numbers of months for a firm +month_num = crsp[crsp['sig'] == 1].groupby(['permno'])['month_count'].tail(1) +month_num = month_num.astype(int) +month_num = month_num.reset_index(drop=True) + +# mark the number of each month to each day of this month +crsp['month_count'] = crsp.groupby(['permno'])['month_count'].fillna(method='bfill') + +# crate a firm list +df_firm = crsp.drop_duplicates(['permno']) +df_firm = df_firm[['permno']] +df_firm['permno'] = df_firm['permno'].astype(int) +df_firm = df_firm.reset_index(drop=True) +df_firm = df_firm.reset_index() +df_firm = df_firm.rename(columns={'index': 
'count'}) +df_firm['month_num'] = month_num + +###################### +# Calculate residual # +###################### + + +def get_baspread(df, firm_list): + """ + + :param df: stock dataframe + :param firm_list: list of firms matching stock dataframe + :return: dataframe with variance of residual + """ + for firm, count, prog in zip(firm_list['permno'], firm_list['month_num'], range(firm_list['permno'].count()+1)): + prog = prog + 1 + print('processing permno %s' % firm, '/', 'finished', '%.2f%%' % ((prog/firm_list['permno'].count())*100)) + for i in range(count + 1): + # if you want to change the rolling window, please change here: i - 2 means 3 months is a window. + temp = df[(df['permno'] == firm) & (i - 2 <= df['month_count']) & (df['month_count'] <= i)] + if temp['permno'].count() < 21: + pass + else: + index = temp.tail(1).index + X = pd.DataFrame() + X[['ret']] = temp[['ret']] + maxret = X['ret'].max() + df.loc[index, 'maxret'] = maxret + return df + + +def sub_df(start, end, step): + """ + + :param start: the quantile to start cutting, usually it should be 0 + :param end: the quantile to end cutting, usually it should be 1 + :param step: quantile step + :return: a dictionary including all the 'firm_list' dataframe and 'stock data' dataframe + """ + # we use dict to store different sub dataframe + temp = {} + for i, h in zip(np.arange(start, end, step), range(int((end-start)/step))): + print('processing splitting dataframe:', round(i, 2), 'to', round(i + step, 2)) + if i == 0: # to get the left point + temp['firm' + str(h)] = df_firm[df_firm['count'] <= df_firm['count'].quantile(i + step)] + temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', + on='permno').dropna(subset=['count']) + else: + temp['firm' + str(h)] = df_firm[(df_firm['count'].quantile(i) < df_firm['count']) & ( + df_firm['count'] <= df_firm['count'].quantile(i + step))] + temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', + 
on='permno').dropna(subset=['count']) + return temp + + +def main(start, end, step): + """ + + :param start: the quantile to start cutting, usually it should be 0 + :param end: the quantile to end cutting, usually it should be 1 + :param step: quantile step + :return: a dataframe with calculated variance of residual + """ + df = sub_df(start, end, step) + pool = mp.Pool() + p_dict = {} + for i in range(int((end-start)/step)): + p_dict['p' + str(i)] = pool.apply_async(get_baspread, (df['crsp%s' % i], df['firm%s' % i],)) + pool.close() + pool.join() + result = pd.DataFrame() + print('processing pd.concat') + for h in range(int((end-start)/step)): + result = pd.concat([result, p_dict['p%s' % h].get()]) + return result + + +# calculate variance of residual through rolling window +# Note: please split dataframe according to your CPU situation. For example, we split dataframe to (1-0)/0.05 = 20 sub +# dataframes here, so the function will use 20 cores to calculate variance of residual. +if __name__ == '__main__': + crsp = main(0, 1, 0.05) + +# process dataframe +crsp = crsp.dropna(subset=['maxret']) # drop NA due to rolling +crsp = crsp.reset_index(drop=True) +crsp = crsp[['permno', 'date', 'maxret']] + +with open('maxret.pkl', 'wb') as f: + pkl.dump(crsp, f) \ No newline at end of file diff --git a/char60/merge_chars_60.py b/char60/merge_chars_60.py new file mode 100755 index 0000000..763e9e6 --- /dev/null +++ b/char60/merge_chars_60.py @@ -0,0 +1,294 @@ +# Since some firms only have annual recording before 80s, we need to use annual data as merging benchmark in case +# there are some recordings are missing + +import pandas as pd +import pickle as pkl +from pandas.tseries.offsets import * + +with open('chars_a_60.pkl', 'rb') as f: + chars_a = pkl.load(f) + +chars_a = chars_a.dropna(subset=['permno']) +chars_a[['permno', 'gvkey']] = chars_a[['permno', 'gvkey']].astype(int) +chars_a['jdate'] = pd.to_datetime(chars_a['jdate']) +chars_a = chars_a.drop_duplicates(['permno', 
'jdate']) + +with open('/home/jianxinma/chars/data/beta.pkl', 'rb') as f: + beta = pkl.load(f) + +beta['permno'] = beta['permno'].astype(int) +beta['jdate'] = pd.to_datetime(beta['date']) + MonthEnd(0) +beta = beta[['permno', 'jdate', 'beta']] +beta = beta.drop_duplicates(['permno', 'jdate']) + +chars_a = pd.merge(chars_a, beta, how='left', on=['permno', 'jdate']) + +with open('/home/jianxinma/chars/data/rvar_capm.pkl', 'rb') as f: + rvar_capm = pkl.load(f) + +rvar_capm['permno'] = rvar_capm['permno'].astype(int) +rvar_capm['jdate'] = pd.to_datetime(rvar_capm['date']) + MonthEnd(0) +rvar_capm = rvar_capm[['permno', 'jdate', 'rvar_capm']] +rvar_capm = rvar_capm.drop_duplicates(['permno', 'jdate']) + +chars_a = pd.merge(chars_a, rvar_capm, how='left', on=['permno', 'jdate']) + +with open('/home/jianxinma/chars/data/rvar_mean.pkl', 'rb') as f: + rvar_mean = pkl.load(f) + +rvar_mean['permno'] = rvar_mean['permno'].astype(int) +rvar_mean['jdate'] = pd.to_datetime(rvar_mean['date']) + MonthEnd(0) +rvar_mean = rvar_mean[['permno', 'jdate', 'rvar_mean']] +rvar_mean = rvar_mean.drop_duplicates(['permno', 'jdate']) + +chars_a = pd.merge(chars_a, rvar_mean, how='left', on=['permno', 'jdate']) + +with open('/home/jianxinma/chars/data/rvar_ff3.pkl', 'rb') as f: + rvar_ff3 = pkl.load(f) + +rvar_ff3['permno'] = rvar_ff3['permno'].astype(int) +rvar_ff3['jdate'] = pd.to_datetime(rvar_ff3['date']) + MonthEnd(0) +rvar_ff3 = rvar_ff3[['permno', 'jdate', 'rvar_ff3']] +rvar_ff3 = rvar_ff3.drop_duplicates(['permno', 'jdate']) + +chars_a = pd.merge(chars_a, rvar_ff3, how='left', on=['permno', 'jdate']) + +with open('/home/jianxinma/chars/data/sue.pkl', 'rb') as f: + sue = pkl.load(f) + +sue['permno'] = sue['permno'].astype(int) +sue['jdate'] = pd.to_datetime(sue['date']) + MonthEnd(0) +sue = sue[['permno', 'jdate', 'sue']] +sue = sue.drop_duplicates(['permno', 'jdate']) + +chars_a = pd.merge(chars_a, sue, how='left', on=['permno', 'jdate']) + +with 
open('/home/jianxinma/chars/data/re.pkl', 'rb') as f: + re = pkl.load(f) + +re['permno'] = re['permno'].astype(int) +re['jdate'] = pd.to_datetime(re['date']) + MonthEnd(0) +re = re[['permno', 'jdate', 're']] +re = re.drop_duplicates(['permno', 'jdate']) + +chars_a = pd.merge(chars_a, re, how='left', on=['permno', 'jdate']) + +with open('/home/jianxinma/chars/data/abr.pkl', 'rb') as f: + abr = pkl.load(f) + +abr['permno'] = abr['permno'].astype(int) +abr['jdate'] = pd.to_datetime(abr['date']) + MonthEnd(0) +abr = abr[['permno', 'jdate', 'abr']] +abr = abr.drop_duplicates(['permno', 'jdate']) + +chars_a = pd.merge(chars_a, abr, how='left', on=['permno', 'jdate']) + +with open('baspread.pkl', 'rb') as f: + baspread = pkl.load(f) + +baspread['permno'] = baspread['permno'].astype(int) +baspread['jdate'] = pd.to_datetime(baspread['date']) + MonthEnd(0) +baspread = baspread[['permno', 'jdate', 'baspread']] +baspread = baspread.drop_duplicates(['permno', 'jdate']) + +chars_a = pd.merge(chars_a, baspread, how='left', on=['permno', 'jdate']) + +with open('maxret.pkl', 'rb') as f: + maxret = pkl.load(f) + +maxret['permno'] = maxret['permno'].astype(int) +maxret['jdate'] = pd.to_datetime(maxret['date']) + MonthEnd(0) +maxret = maxret[['permno', 'jdate', 'maxret']] +maxret = maxret.drop_duplicates(['permno', 'jdate']) + +chars_a = pd.merge(chars_a, maxret, how='left', on=['permno', 'jdate']) + +with open('std_dolvol.pkl', 'rb') as f: + std_dolvol = pkl.load(f) + +std_dolvol['permno'] = std_dolvol['permno'].astype(int) +std_dolvol['jdate'] = pd.to_datetime(std_dolvol['date']) + MonthEnd(0) +std_dolvol = std_dolvol[['permno', 'jdate', 'std_dolvol']] +std_dolvol = std_dolvol.drop_duplicates(['permno', 'jdate']) + +chars_a = pd.merge(chars_a, std_dolvol, how='left', on=['permno', 'jdate']) + +with open('ill.pkl', 'rb') as f: + ill = pkl.load(f) + +ill['permno'] = ill['permno'].astype(int) +ill['jdate'] = pd.to_datetime(ill['date']) + MonthEnd(0) +ill = ill[['permno', 'jdate', 
'ill']] +ill = ill.drop_duplicates(['permno', 'jdate']) + +chars_a = pd.merge(chars_a, ill, how='left', on=['permno', 'jdate']) + +with open('std_turn.pkl', 'rb') as f: + std_turn = pkl.load(f) + +std_turn['permno'] = std_turn['permno'].astype(int) +std_turn['jdate'] = pd.to_datetime(std_turn['date']) + MonthEnd(0) +std_turn = std_turn[['permno', 'jdate', 'std_turn']] +std_turn = std_turn.drop_duplicates(['permno', 'jdate']) + +chars_a = pd.merge(chars_a, std_turn, how='left', on=['permno', 'jdate']) + +with open('zerotrade.pkl', 'rb') as f: + zerotrade = pkl.load(f) + +zerotrade['permno'] = zerotrade['permno'].astype(int) +zerotrade['jdate'] = pd.to_datetime(zerotrade['date']) + MonthEnd(0) +zerotrade = zerotrade[['permno', 'jdate', 'zerotrade']] +zerotrade = zerotrade.drop_duplicates(['permno', 'jdate']) + +chars_a = pd.merge(chars_a, zerotrade, how='left', on=['permno', 'jdate']) + +# save data +with open('chars_a_raw.pkl', 'wb') as f: + pkl.dump(chars_a, f, protocol=4) + +######################################################################################################################## +# In order to keep the naming tidy, we need to make another chars_q_raw, which is just a temporary dataframe # +######################################################################################################################## + +with open('chars_q_60.pkl', 'rb') as f: + chars_q = pkl.load(f) + +chars_q = chars_q.dropna(subset=['permno']) +chars_q[['permno', 'gvkey']] = chars_q[['permno', 'gvkey']].astype(int) +chars_q['jdate'] = pd.to_datetime(chars_q['jdate']) +chars_q = chars_q.drop_duplicates(['permno', 'jdate']) + +with open('/home/jianxinma/chars/data/beta.pkl', 'rb') as f: + beta = pkl.load(f) + +beta['permno'] = beta['permno'].astype(int) +beta['jdate'] = pd.to_datetime(beta['date']) + MonthEnd(0) +beta = beta[['permno', 'jdate', 'beta']] +beta = beta.drop_duplicates(['permno', 'jdate']) + +chars_q = pd.merge(chars_q, beta, how='left', on=['permno', 'jdate']) 
+ +with open('/home/jianxinma/chars/data/rvar_capm.pkl', 'rb') as f: + rvar_capm = pkl.load(f) + +rvar_capm['permno'] = rvar_capm['permno'].astype(int) +rvar_capm['jdate'] = pd.to_datetime(rvar_capm['date']) + MonthEnd(0) +rvar_capm = rvar_capm[['permno', 'jdate', 'rvar_capm']] +rvar_capm = rvar_capm.drop_duplicates(['permno', 'jdate']) + +chars_q = pd.merge(chars_q, rvar_capm, how='left', on=['permno', 'jdate']) + +with open('/home/jianxinma/chars/data/rvar_mean.pkl', 'rb') as f: + rvar_mean = pkl.load(f) + +rvar_mean['permno'] = rvar_mean['permno'].astype(int) +rvar_mean['jdate'] = pd.to_datetime(rvar_mean['date']) + MonthEnd(0) +rvar_mean = rvar_mean[['permno', 'jdate', 'rvar_mean']] +rvar_mean = rvar_mean.drop_duplicates(['permno', 'jdate']) + +chars_q = pd.merge(chars_q, rvar_mean, how='left', on=['permno', 'jdate']) + +with open('/home/jianxinma/chars/data/rvar_ff3.pkl', 'rb') as f: + rvar_ff3 = pkl.load(f) + +rvar_ff3['permno'] = rvar_ff3['permno'].astype(int) +rvar_ff3['jdate'] = pd.to_datetime(rvar_ff3['date']) + MonthEnd(0) +rvar_ff3 = rvar_ff3[['permno', 'jdate', 'rvar_ff3']] +rvar_ff3 = rvar_ff3.drop_duplicates(['permno', 'jdate']) + +chars_q = pd.merge(chars_q, rvar_ff3, how='left', on=['permno', 'jdate']) + +with open('/home/jianxinma/chars/data/sue.pkl', 'rb') as f: + sue = pkl.load(f) + +sue['permno'] = sue['permno'].astype(int) +sue['jdate'] = pd.to_datetime(sue['date']) + MonthEnd(0) +sue = sue[['permno', 'jdate', 'sue']] +sue = sue.drop_duplicates(['permno', 'jdate']) + +chars_q = pd.merge(chars_q, sue, how='left', on=['permno', 'jdate']) + +with open('/home/jianxinma/chars/data/re.pkl', 'rb') as f: + re = pkl.load(f) + +re['permno'] = re['permno'].astype(int) +re['jdate'] = pd.to_datetime(re['date']) + MonthEnd(0) +re = re[['permno', 'jdate', 're']] +re = re.drop_duplicates(['permno', 'jdate']) + +chars_q = pd.merge(chars_q, re, how='left', on=['permno', 'jdate']) + +with open('/home/jianxinma/chars/data/abr.pkl', 'rb') as f: + abr = pkl.load(f) 
+ +abr['permno'] = abr['permno'].astype(int) +abr['jdate'] = pd.to_datetime(abr['date']) + MonthEnd(0) +abr = abr[['permno', 'jdate', 'abr']] +abr = abr.drop_duplicates(['permno', 'jdate']) + +chars_q = pd.merge(chars_q, abr, how='left', on=['permno', 'jdate']) + +with open('baspread.pkl', 'rb') as f: + baspread = pkl.load(f) + +baspread['permno'] = baspread['permno'].astype(int) +baspread['jdate'] = pd.to_datetime(baspread['date']) + MonthEnd(0) +baspread = baspread[['permno', 'jdate', 'baspread']] +baspread = baspread.drop_duplicates(['permno', 'jdate']) + +chars_q = pd.merge(chars_q, baspread, how='left', on=['permno', 'jdate']) + +with open('maxret.pkl', 'rb') as f: + maxret = pkl.load(f) + +maxret['permno'] = maxret['permno'].astype(int) +maxret['jdate'] = pd.to_datetime(maxret['date']) + MonthEnd(0) +maxret = maxret[['permno', 'jdate', 'maxret']] +maxret = maxret.drop_duplicates(['permno', 'jdate']) + +chars_q = pd.merge(chars_q, maxret, how='left', on=['permno', 'jdate']) + +with open('std_dolvol.pkl', 'rb') as f: + std_dolvol = pkl.load(f) + +std_dolvol['permno'] = std_dolvol['permno'].astype(int) +std_dolvol['jdate'] = pd.to_datetime(std_dolvol['date']) + MonthEnd(0) +std_dolvol = std_dolvol[['permno', 'jdate', 'std_dolvol']] +std_dolvol = std_dolvol.drop_duplicates(['permno', 'jdate']) + +chars_q = pd.merge(chars_q, std_dolvol, how='left', on=['permno', 'jdate']) + +with open('ill.pkl', 'rb') as f: + ill = pkl.load(f) + +ill['permno'] = ill['permno'].astype(int) +ill['jdate'] = pd.to_datetime(ill['date']) + MonthEnd(0) +ill = ill[['permno', 'jdate', 'ill']] +ill = ill.drop_duplicates(['permno', 'jdate']) + +chars_q = pd.merge(chars_q, ill, how='left', on=['permno', 'jdate']) + +with open('std_turn.pkl', 'rb') as f: + std_turn = pkl.load(f) + +std_turn['permno'] = std_turn['permno'].astype(int) +std_turn['jdate'] = pd.to_datetime(std_turn['date']) + MonthEnd(0) +std_turn = std_turn[['permno', 'jdate', 'std_turn']] +std_turn = 
std_turn.drop_duplicates(['permno', 'jdate']) + +chars_q = pd.merge(chars_q, std_turn, how='left', on=['permno', 'jdate']) + +with open('zerotrade.pkl', 'rb') as f: + zerotrade = pkl.load(f) + +zerotrade['permno'] = zerotrade['permno'].astype(int) +zerotrade['jdate'] = pd.to_datetime(zerotrade['date']) + MonthEnd(0) +zerotrade = zerotrade[['permno', 'jdate', 'zerotrade']] +zerotrade = zerotrade.drop_duplicates(['permno', 'jdate']) + +chars_q = pd.merge(chars_q, zerotrade, how='left', on=['permno', 'jdate']) + +# save data +with open('chars_q_raw.pkl', 'wb') as f: + pkl.dump(chars_q, f, protocol=4) \ No newline at end of file diff --git a/char60/pkl_to_csv.py b/char60/pkl_to_csv.py new file mode 100755 index 0000000..74cefea --- /dev/null +++ b/char60/pkl_to_csv.py @@ -0,0 +1,29 @@ +import pickle as pkl +import pandas as pd + +with open('/Users/eric/Downloads/chars_rank_60.pkl', 'rb') as f: + chars = pkl.load(f) + +print(chars.columns.values) + +chars['jdate'] = pd.to_datetime(chars['jdate']) +chars['year'] = chars['jdate'].dt.year +chars_1970s = chars[chars['year'] < 1980] +chars_1980s = chars[(chars['year'] >= 1980) & (chars['year'] < 1990)] +chars_1990s = chars[(chars['year'] >= 1990) & (chars['year'] < 2000)] +chars_2000s = chars[(chars['year'] >= 2000) & (chars['year'] < 2010)] +chars_2010s = chars[(chars['year'] >= 2010) & (chars['year'] < 2020)] + +# raw +# chars_1970s.to_csv('chars60_raw_1970s.csv', index=0) +# chars_1980s.to_csv('chars60_raw_1980s.csv', index=0) +# chars_1990s.to_csv('chars60_raw_1990s.csv', index=0) +# chars_2000s.to_csv('chars60_raw_2000s.csv', index=0) +# chars_2010s.to_csv('chars60_raw_2010s.csv', index=0) + +# rank +chars_1970s.to_csv('chars60_rank_1970s.csv', index=0) +chars_1980s.to_csv('chars60_rank_1980s.csv', index=0) +chars_1990s.to_csv('chars60_rank_1990s.csv', index=0) +chars_2000s.to_csv('chars60_rank_2000s.csv', index=0) +chars_2010s.to_csv('chars60_rank_2010s.csv', index=0) \ No newline at end of file diff --git 
a/char60/re.py b/char60/re.py new file mode 100755 index 0000000..7dab02f --- /dev/null +++ b/char60/re.py @@ -0,0 +1,120 @@ +# Calculate HSZ Replicating Anomalies +# RE: Revisions in analysts’ earnings forecasts + +import pandas as pd +import numpy as np +import datetime as dt +import wrds +from dateutil.relativedelta import * +from pandas.tseries.offsets import * +from pandasql import * +import pickle as pkl + +################### +# Connect to WRDS # +################### +conn = wrds.Connection() + +######################################################################### +# Merging IBES and CRSP by using ICLINK table. Merging last month price # +######################################################################### + +with open('iclink.pkl', 'rb')as f: + iclink = pkl.load(f) + +ibes = conn.raw_sql(""" + select + ticker, statpers, meanest, fpedats, anndats_act, curr_act, fpi, medest + from ibes.statsum_epsus + where + /* filtering IBES */ + statpers=0 + and CURCODE='USD' + and fpi in ('1','2')""") + +# filtering IBES +ibes = ibes[(ibes['medest'].notna()) & (ibes['fpedats'].notna())] +ibes = ibes[(ibes['curr_act']=='USD') | (ibes['curr_act'].isnull())] +ibes['statpers'] = pd.to_datetime(ibes['statpers']) +ibes['merge_date'] = ibes['statpers']+MonthEnd(0) + +crsp_msf = conn.raw_sql(""" + select permno, date, prc, cfacpr + from crsp.msf + """) + +crsp_msf['date'] = pd.to_datetime(crsp_msf['date']) +crsp_msf['date'] = crsp_msf['date']+MonthEnd(0) +crsp_msf['merge_date'] = crsp_msf['date']+MonthEnd(1) + +ibes_iclink = pd.merge(ibes, iclink, how='left', on='ticker') +ibes_crsp = pd.merge(ibes_iclink, crsp_msf, how='inner', on=['permno', 'merge_date']) +ibes_crsp.sort_values(by=['ticker', 'fpedats', 'statpers'], inplace=True) +ibes_crsp.reset_index(inplace=True, drop=True) + +############################### +# Merging last month forecast # +############################### +ibes_crsp['statpers_last_month'] = np.where((ibes_crsp['ticker'] == 
ibes_crsp['ticker'].shift(1)) & + (ibes_crsp['permno'] == ibes_crsp['permno'].shift(1)) & + (ibes_crsp['fpedats'] == ibes_crsp['fpedats'].shift(1)), + ibes_crsp['statpers'].shift(1).astype(str), np.nan) + +ibes_crsp['meanest_last_month'] = np.where((ibes_crsp['ticker'] == ibes_crsp['ticker'].shift(1)) & + (ibes_crsp['permno'] == ibes_crsp['permno'].shift(1)) & + (ibes_crsp['fpedats'] == ibes_crsp['fpedats'].shift(1)), + ibes_crsp['meanest'].shift(1), np.nan) + +ibes_crsp.sort_values(by=['ticker', 'permno', 'fpedats', 'statpers'], inplace=True) +ibes_crsp.reset_index(inplace=True, drop=True) + +########################### +# Drop empty "last month" # +# Calculate HXZ RE # +########################### + +ibes_crsp = ibes_crsp[ibes_crsp['statpers_last_month'].notna()] +ibes_crsp['prc_adj'] = ibes_crsp['prc']/ibes_crsp['cfacpr'] +ibes_crsp = ibes_crsp[ibes_crsp['prc_adj']>0] +ibes_crsp['monthly_revision'] = (ibes_crsp['meanest'] - ibes_crsp['meanest_last_month'])/ibes_crsp['prc_adj'] + +ibes_crsp['permno'] = ibes_crsp['permno'].astype(int) +ibes_crsp['permno'] = ibes_crsp['permno'].astype(str) +ibes_crsp['fpedats'] = ibes_crsp['fpedats'].astype(str) +ibes_crsp['permno_fpedats'] = ibes_crsp['permno'].str.cat(ibes_crsp['fpedats'], sep='-') + +ibes_crsp = ibes_crsp.drop_duplicates(['permno_fpedats', 'statpers']) +ibes_crsp['count'] = ibes_crsp.groupby('permno_fpedats').cumcount() + 1 + +######################## +# Calculate RE (CJL) # +######################## + +ibes_crsp['monthly_revision_l1'] = ibes_crsp.groupby(['permno'])['monthly_revision'].shift(1) +ibes_crsp['monthly_revision_l2'] = ibes_crsp.groupby(['permno'])['monthly_revision'].shift(2) +ibes_crsp['monthly_revision_l3'] = ibes_crsp.groupby(['permno'])['monthly_revision'].shift(3) +ibes_crsp['monthly_revision_l4'] = ibes_crsp.groupby(['permno'])['monthly_revision'].shift(4) +ibes_crsp['monthly_revision_l5'] = ibes_crsp.groupby(['permno'])['monthly_revision'].shift(5) +ibes_crsp['monthly_revision_l6'] = 
ibes_crsp.groupby(['permno'])['monthly_revision'].shift(6) + +condlist = [ibes_crsp['count']==4, + ibes_crsp['count']==5, + ibes_crsp['count']==6, + ibes_crsp['count']>=7] +choicelist = [(ibes_crsp['monthly_revision_l1'] + ibes_crsp['monthly_revision_l2'] + ibes_crsp['monthly_revision_l3'])/3, + (ibes_crsp['monthly_revision_l1'] + ibes_crsp['monthly_revision_l2'] + ibes_crsp['monthly_revision_l3'] + ibes_crsp['monthly_revision_l4'])/4, + (ibes_crsp['monthly_revision_l1'] + ibes_crsp['monthly_revision_l2'] + ibes_crsp['monthly_revision_l3'] + ibes_crsp['monthly_revision_l4'] + ibes_crsp['monthly_revision_l5'])/5, + (ibes_crsp['monthly_revision_l1'] + ibes_crsp['monthly_revision_l2'] + ibes_crsp['monthly_revision_l3'] + ibes_crsp['monthly_revision_l4'] + ibes_crsp['monthly_revision_l5'] + ibes_crsp['monthly_revision_l6'])/6] +ibes_crsp['re'] = np.select(condlist, choicelist, default=np.nan) + +ibes_crsp = ibes_crsp[ibes_crsp['count']>=4] +ibes_crsp = ibes_crsp.sort_values(by=['ticker', 'statpers', 'fpedats']) +ibes_crsp = ibes_crsp.drop_duplicates(['ticker', 'statpers']) + +ibes_crsp = ibes_crsp[['ticker', 'statpers', 'fpedats', 'anndats_act', 'curr_act', 'permno', 're']] +ibes_crsp.rename(columns={'statpers': 'date'}, inplace=True) + +with open('re.pkl', 'wb') as f: + pkl.dump(ibes_crsp, f) \ No newline at end of file diff --git a/char60/rvar_capm.py b/char60/rvar_capm.py new file mode 100755 index 0000000..fa3a01c --- /dev/null +++ b/char60/rvar_capm.py @@ -0,0 +1,168 @@ +# CAPM residual variance +# Note: Please use the latest version of pandas, this version should support returning to pd.Series after rolling +# To get a faster speed, we split the big dataframe into small ones +# Then using different process to calculate the variance +# We use 20 process to calculate variance, you can change the number of process according to your CPU situation +# You can use the following code to check your CPU situation +# import multiprocessing +# multiprocessing.cpu_count() + 
+import pandas as pd +import numpy as np +import datetime as dt +import wrds +from dateutil.relativedelta import * +from pandas.tseries.offsets import * +import datetime +import pickle as pkl +import multiprocessing as mp + +################### +# Connect to WRDS # +################### +conn = wrds.Connection() + +# CRSP Block +crsp = conn.raw_sql(""" + select a.permno, a.date, a.ret, (a.ret - b.rf) as exret, b.mktrf + from crsp.dsf as a + left join ff.factors_daily as b + on a.date=b.date + where a.date >= '01/01/1959' + """) + +# sort variables by permno and date +crsp = crsp.sort_values(by=['permno', 'date']) + +# change variable format to int +crsp['permno'] = crsp['permno'].astype(int) + +# Line up date to be end of month +crsp['date'] = pd.to_datetime(crsp['date']) + +# find the closest trading day to the end of the month +crsp['monthend'] = crsp['date'] + MonthEnd(0) +crsp['date_diff'] = crsp['monthend'] - crsp['date'] +date_temp = crsp.groupby(['permno', 'monthend'])['date_diff'].min() +date_temp = pd.DataFrame(date_temp) # convert Series to DataFrame +date_temp.reset_index(inplace=True) +date_temp.rename(columns={'date_diff': 'min_diff'}, inplace=True) +crsp = pd.merge(crsp, date_temp, how='left', on=['permno', 'monthend']) +crsp['sig'] = np.where(crsp['date_diff'] == crsp['min_diff'], 1, np.nan) + +# label every date of month end +crsp['month_count'] = crsp[crsp['sig'] == 1].groupby(['permno']).cumcount() + +# label numbers of months for a firm +month_num = crsp[crsp['sig'] == 1].groupby(['permno'])['month_count'].tail(1) +month_num = month_num.astype(int) +month_num = month_num.reset_index(drop=True) + +# mark the number of each month to each day of this month +crsp['month_count'] = crsp.groupby(['permno'])['month_count'].fillna(method='bfill') + +# crate a firm list +df_firm = crsp.drop_duplicates(['permno']) +df_firm = df_firm[['permno']] +df_firm['permno'] = df_firm['permno'].astype(int) +df_firm = df_firm.reset_index(drop=True) +df_firm = 
df_firm.reset_index() +df_firm = df_firm.rename(columns={'index': 'count'}) +df_firm['month_num'] = month_num + +###################### +# Calculate residual # +###################### + + +def get_res_var(df, firm_list): + """ + + :param df: stock dataframe + :param firm_list: list of firms matching stock dataframe + :return: dataframe with variance of residual + """ + for firm, count, prog in zip(firm_list['permno'], firm_list['month_num'], range(firm_list['permno'].count()+1)): + prog = prog + 1 + print('processing permno %s' % firm, '/', 'finished', '%.2f%%' % ((prog/firm_list['permno'].count())*100)) + for i in range(count + 1): + # if you want to change the rolling window, please change here: i - 2 means 3 months is a window. + temp = df[(df['permno'] == firm) & (i - 2 <= df['month_count']) & (df['month_count'] <= i)] + # if observations in last 3 months are less 21, we drop the rvar of this month + if temp['permno'].count() < 21: + pass + else: + rolling_window = temp['permno'].count() + index = temp.tail(1).index + X = pd.DataFrame() + X[['mktrf']] = temp[['mktrf']] + X['intercept'] = 1 + X = X[['intercept', 'mktrf']] + X = np.mat(X) + Y = np.mat(temp[['exret']]) + res = (np.identity(rolling_window) - X.dot(X.T.dot(X).I).dot(X.T)).dot(Y) + res_var = res.var(ddof=1) + df.loc[index, 'rvar'] = res_var + return df + + +def sub_df(start, end, step): + """ + + :param start: the quantile to start cutting, usually it should be 0 + :param end: the quantile to end cutting, usually it should be 1 + :param step: quantile step + :return: a dictionary including all the 'firm_list' dataframe and 'stock data' dataframe + """ + # we use dict to store different sub dataframe + temp = {} + for i, h in zip(np.arange(start, end, step), range(int((end-start)/step))): + print('processing splitting dataframe:', round(i, 2), 'to', round(i + step, 2)) + if i == 0: # to get the left point + temp['firm' + str(h)] = df_firm[df_firm['count'] <= df_firm['count'].quantile(i + step)] + 
temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', + on='permno').dropna(subset=['count']) + else: + temp['firm' + str(h)] = df_firm[(df_firm['count'].quantile(i) < df_firm['count']) & ( + df_firm['count'] <= df_firm['count'].quantile(i + step))] + temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', + on='permno').dropna(subset=['count']) + return temp + + +def main(start, end, step): + """ + + :param start: the quantile to start cutting, usually it should be 0 + :param end: the quantile to end cutting, usually it should be 1 + :param step: quantile step + :return: a dataframe with calculated variance of residual + """ + df = sub_df(start, end, step) + pool = mp.Pool() + p_dict = {} + for i in range(int((end-start)/step)): + p_dict['p' + str(i)] = pool.apply_async(get_res_var, (df['crsp%s' % i], df['firm%s' % i],)) + pool.close() + pool.join() + result = pd.DataFrame() + print('processing pd.concat') + for h in range(int((end-start)/step)): + result = pd.concat([result, p_dict['p%s' % h].get()]) + return result + + +# calculate variance of residual through rolling window +# Note: please split dataframe according to your CPU situation. For example, we split dataframe to (1-0)/0.05 = 20 sub +# dataframes here, so the function will use 20 cores to calculate variance of residual. 
+if __name__ == '__main__': + crsp = main(0, 1, 0.05) + +# process dataframe +crsp = crsp.dropna(subset=['rvar']) # drop NA due to rolling +crsp = crsp.rename(columns={'rvar': 'rvar_capm'}) +crsp = crsp.reset_index(drop=True) +crsp = crsp[['permno', 'date', 'rvar_capm']] + +with open('rvar_capm.pkl', 'wb') as f: + pkl.dump(crsp, f) \ No newline at end of file diff --git a/char60/rvar_ff3.py b/char60/rvar_ff3.py new file mode 100755 index 0000000..36561a0 --- /dev/null +++ b/char60/rvar_ff3.py @@ -0,0 +1,201 @@ +# Fama & French 3 factors residual variance +# Note: Please use the latest version of pandas, this version should support returning to pd.Series after rolling +# To get a faster speed, we split the big dataframe into small ones +# Then using different process to calculate the variance +# We use 20 process to calculate variance, you can change the number of process according to your CPU situation +# You can use the following code to check your CPU situation +# import multiprocessing +# multiprocessing.cpu_count() + +import pandas as pd +import numpy as np +import datetime as dt +import wrds +from dateutil.relativedelta import * +from pandas.tseries.offsets import * +import datetime +import pickle as pkl +import multiprocessing as mp + +################### +# Connect to WRDS # +################### +conn = wrds.Connection() + +# CRSP Block +crsp = conn.raw_sql(""" + select a.permno, a.date, a.ret, (a.ret - b.rf) as exret, b.mktrf, b.smb, b.hml + from crsp.dsf as a + left join ff.factors_daily as b + on a.date=b.date + where a.date > '01/01/1959' + """) + +# sort variables by permno and date +crsp = crsp.sort_values(by=['permno', 'date']) + +# change variable format to int +crsp['permno'] = crsp['permno'].astype(int) + +# Line up date to be end of month +crsp['date'] = pd.to_datetime(crsp['date']) + +# find the closest trading day to the end of the month +crsp['monthend'] = crsp['date'] + MonthEnd(0) +crsp['date_diff'] = crsp['monthend'] - crsp['date'] 
+date_temp = crsp.groupby(['permno', 'monthend'])['date_diff'].min() +date_temp = pd.DataFrame(date_temp) # convert Series to DataFrame +date_temp.reset_index(inplace=True) +date_temp.rename(columns={'date_diff': 'min_diff'}, inplace=True) +crsp = pd.merge(crsp, date_temp, how='left', on=['permno', 'monthend']) +crsp['sig'] = np.where(crsp['date_diff'] == crsp['min_diff'], 1, np.nan) + +# label every date of month end +crsp['month_count'] = crsp[crsp['sig'] == 1].groupby(['permno']).cumcount() + +# label numbers of months for a firm +month_num = crsp[crsp['sig'] == 1].groupby(['permno'])['month_count'].tail(1) +month_num = month_num.astype(int) +month_num = month_num.reset_index(drop=True) + +# mark the number of each month to each day of this month +crsp['month_count'] = crsp.groupby(['permno'])['month_count'].fillna(method='bfill') + +# crate a firm list +df_firm = crsp.drop_duplicates(['permno']) +df_firm = df_firm[['permno']] +df_firm['permno'] = df_firm['permno'].astype(int) +df_firm = df_firm.reset_index(drop=True) +df_firm = df_firm.reset_index() +df_firm = df_firm.rename(columns={'index': 'count'}) +df_firm['month_num'] = month_num + +###################### +# Calculate the beta # +###################### +# function that get multiple beta +'''' +rolling_window = 60 # 60 trading days +crsp['beta_mktrf'] = np.nan +crsp['beta_smb'] = np.nan +crsp['beta_hml'] = np.nan + + +def get_beta(df): + """ + The original idea of calculate beta is using formula (X'MX)^(-1)X'MY, + where M = I - 1(1'1)^{-1}1, I is a identity matrix. 
+ + """ + temp = crsp.loc[df.index] # extract the rolling sub dataframe from original dataframe + X = np.mat(temp[['mktrf', 'smb', 'hml']]) + Y = np.mat(temp[['exret']]) + ones = np.mat(np.ones(rolling_window)).T + M = np.identity(rolling_window) - ones.dot((ones.T.dot(ones)).I).dot(ones.T) + beta = (X.T.dot(M).dot(X)).I.dot((X.T.dot(M).dot(Y))) + crsp['beta_mktrf'].loc[df.index[-1:]] = beta[0] + crsp['beta_smb'].loc[df.index[-1:]] = beta[1] + crsp['beta_hml'].loc[df.index[-1:]] = beta[2] + return 0 # we do not need the rolling outcome since rolling cannot return different values in different columns + + +# calculate beta through rolling window +crsp_temp = crsp.groupby('permno').rolling(rolling_window).apply(get_beta, raw=False) +''' + +###################### +# Calculate residual # +###################### + + +def get_res_var(df, firm_list): + """ + + :param df: stock dataframe + :param firm_list: list of firms matching stock dataframe + :return: dataframe with variance of residual + """ + for firm, count, prog in zip(firm_list['permno'], firm_list['month_num'], range(firm_list['permno'].count()+1)): + prog = prog + 1 + print('processing permno %s' % firm, '/', 'finished', '%.2f%%' % ((prog/firm_list['permno'].count())*100)) + for i in range(count + 1): + # if you want to change the rolling window, please change here: i - 2 means 3 months is a window. 
+ temp = df[(df['permno'] == firm) & (i - 2 <= df['month_count']) & (df['month_count'] <= i)] + # if observations in last 3 months are less 21, we drop the rvar of this month + if temp['permno'].count() < 21: + pass + else: + rolling_window = temp['permno'].count() + index = temp.tail(1).index + X = pd.DataFrame() + X[['mktrf', 'smb', 'hml']] = temp[['mktrf', 'smb', 'hml']] + X['intercept'] = 1 + X = X[['intercept', 'mktrf', 'smb', 'hml']] + X = np.mat(X) + Y = np.mat(temp[['exret']]) + res = (np.identity(rolling_window) - X.dot(X.T.dot(X).I).dot(X.T)).dot(Y) + res_var = res.var(ddof=1) + df.loc[index, 'rvar'] = res_var + return df + + +def sub_df(start, end, step): + """ + + :param start: the quantile to start cutting, usually it should be 0 + :param end: the quantile to end cutting, usually it should be 1 + :param step: quantile step + :return: a dictionary including all the 'firm_list' dataframe and 'stock data' dataframe + """ + # we use dict to store different sub dataframe + temp = {} + for i, h in zip(np.arange(start, end, step), range(int((end-start)/step))): + print('processing splitting dataframe:', round(i, 2), 'to', round(i + step, 2)) + if i == 0: # to get the left point + temp['firm' + str(h)] = df_firm[df_firm['count'] <= df_firm['count'].quantile(i + step)] + temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', + on='permno').dropna(subset=['count']) + else: + temp['firm' + str(h)] = df_firm[(df_firm['count'].quantile(i) < df_firm['count']) & ( + df_firm['count'] <= df_firm['count'].quantile(i + step))] + temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', + on='permno').dropna(subset=['count']) + return temp + + +def main(start, end, step): + """ + + :param start: the quantile to start cutting, usually it should be 0 + :param end: the quantile to end cutting, usually it should be 1 + :param step: quantile step + :return: a dataframe with calculated variance of residual + """ + df = sub_df(start, end, step) 
+ pool = mp.Pool() + p_dict = {} + for i in range(int((end-start)/step)): + p_dict['p' + str(i)] = pool.apply_async(get_res_var, (df['crsp%s' % i], df['firm%s' % i],)) + pool.close() + pool.join() + result = pd.DataFrame() + print('processing pd.concat') + for h in range(int((end-start)/step)): + result = pd.concat([result, p_dict['p%s' % h].get()]) + return result + + +# calculate variance of residual through rolling window +# Note: please split dataframe according to your CPU situation. For example, we split dataframe to (1-0)/0.05 = 20 sub +# dataframes here, so the function will use 20 cores to calculate variance of residual. +if __name__ == '__main__': + crsp = main(0, 1, 0.05) + +# process dataframe +crsp = crsp.dropna(subset=['rvar']) # drop NA due to rolling +crsp = crsp.rename(columns={'rvar': 'rvar_ff3'}) +crsp = crsp.reset_index(drop=True) +crsp = crsp[['permno', 'date', 'rvar_ff3']] + +with open('rvar_ff3.pkl', 'wb') as f: + pkl.dump(crsp, f) \ No newline at end of file diff --git a/char60/rvar_mean.py b/char60/rvar_mean.py new file mode 100755 index 0000000..42297f4 --- /dev/null +++ b/char60/rvar_mean.py @@ -0,0 +1,150 @@ +# RVAR mean + +import pandas as pd +import numpy as np +import datetime as dt +import wrds +from dateutil.relativedelta import * +from pandas.tseries.offsets import * +import datetime +import pickle as pkl +import multiprocessing as mp + +################### +# Connect to WRDS # +################### +conn = wrds.Connection() + +# CRSP Block +crsp = conn.raw_sql(""" + select permno, date, ret + from crsp.dsf + where date >= '01/01/1959' + """) + +# sort variables by permno and date +crsp = crsp.sort_values(by=['permno', 'date']) + +# change variable format to int +crsp['permno'] = crsp['permno'].astype(int) + +# Line up date to be end of month +crsp['date'] = pd.to_datetime(crsp['date']) + +# find the closest trading day to the end of the month +crsp['monthend'] = crsp['date'] + MonthEnd(0) +crsp['date_diff'] = crsp['monthend'] - 
crsp['date'] +date_temp = crsp.groupby(['permno', 'monthend'])['date_diff'].min() +date_temp = pd.DataFrame(date_temp) # convert Series to DataFrame +date_temp.reset_index(inplace=True) +date_temp.rename(columns={'date_diff': 'min_diff'}, inplace=True) +crsp = pd.merge(crsp, date_temp, how='left', on=['permno', 'monthend']) +crsp['sig'] = np.where(crsp['date_diff'] == crsp['min_diff'], 1, np.nan) + +# label every date of month end +crsp['month_count'] = crsp[crsp['sig'] == 1].groupby(['permno']).cumcount() + +# label numbers of months for a firm +month_num = crsp[crsp['sig'] == 1].groupby(['permno'])['month_count'].tail(1) +month_num = month_num.astype(int) +month_num = month_num.reset_index(drop=True) + +# mark the number of each month to each day of this month +crsp['month_count'] = crsp.groupby(['permno'])['month_count'].fillna(method='bfill') + +# crate a firm list +df_firm = crsp.drop_duplicates(['permno']) +df_firm = df_firm[['permno']] +df_firm['permno'] = df_firm['permno'].astype(int) +df_firm = df_firm.reset_index(drop=True) +df_firm = df_firm.reset_index() +df_firm = df_firm.rename(columns={'index': 'count'}) +df_firm['month_num'] = month_num + +###################### +# Calculate variance # +###################### + + +def get_ret_var(df, firm_list): + """ + + :param df: stock dataframe + :param firm_list: list of firms matching stock dataframe + :return: dataframe with variance of residual + """ + for firm, count, prog in zip(firm_list['permno'], firm_list['month_num'], range(firm_list['permno'].count()+1)): + prog = prog + 1 + print('processing permno %s' % firm, '/', 'finished', '%.2f%%' % ((prog/firm_list['permno'].count())*100)) + for i in range(count + 1): + # if you want to change the rolling window, please change here: i - 2 means 3 months is a window. 
+ temp = df[(df['permno'] == firm) & (i - 2 <= df['month_count']) & (df['month_count'] <= i)] + # if observations in last 3 months are less 21, we drop the rvar of this month + if temp['permno'].count() < 21: + pass + else: + index = temp.tail(1).index + ret_var = temp['ret'].var() + df.loc[index, 'rvar'] = ret_var + return df + +def sub_df(start, end, step): + """ + + :param start: the quantile to start cutting, usually it should be 0 + :param end: the quantile to end cutting, usually it should be 1 + :param step: quantile step + :return: a dictionary including all the 'firm_list' dataframe and 'stock data' dataframe + """ + # we use dict to store different sub dataframe + temp = {} + for i, h in zip(np.arange(start, end, step), range(int((end-start)/step))): + print('processing splitting dataframe:', round(i, 2), 'to', round(i + step, 2)) + if i == 0: # to get the left point + temp['firm' + str(h)] = df_firm[df_firm['count'] <= df_firm['count'].quantile(i + step)] + temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', + on='permno').dropna(subset=['count']) + else: + temp['firm' + str(h)] = df_firm[(df_firm['count'].quantile(i) < df_firm['count']) & ( + df_firm['count'] <= df_firm['count'].quantile(i + step))] + temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', + on='permno').dropna(subset=['count']) + return temp + + +def main(start, end, step): + """ + + :param start: the quantile to start cutting, usually it should be 0 + :param end: the quantile to end cutting, usually it should be 1 + :param step: quantile step + :return: a dataframe with calculated variance of residual + """ + df = sub_df(start, end, step) + pool = mp.Pool() + p_dict = {} + for i in range(int((end-start)/step)): + p_dict['p' + str(i)] = pool.apply_async(get_ret_var, (df['crsp%s' % i], df['firm%s' % i],)) + pool.close() + pool.join() + result = pd.DataFrame() + print('processing pd.concat') + for h in range(int((end-start)/step)): + result = 
pd.concat([result, p_dict['p%s' % h].get()]) + return result + + +# calculate variance of residual through rolling window +# Note: please split dataframe according to your CPU situation. For example, we split dataframe to (1-0)/0.05 = 20 sub +# dataframes here, so the function will use 20 cores to calculate variance of residual. +if __name__ == '__main__': + crsp = main(0, 1, 0.05) + +# process dataframe +crsp = crsp.dropna(subset=['rvar']) # drop NA due to rolling +crsp = crsp.rename(columns={'rvar': 'rvar_mean'}) +crsp = crsp.reset_index(drop=True) +crsp = crsp[['permno', 'date', 'rvar_mean']] + +with open('rvar_mean.pkl', 'wb') as f: + pkl.dump(crsp, f) \ No newline at end of file diff --git a/char60/std_dolvol.py b/char60/std_dolvol.py new file mode 100755 index 0000000..fc3c2ef --- /dev/null +++ b/char60/std_dolvol.py @@ -0,0 +1,158 @@ +# Standard deviation of daily dollar trading volume (std_dolvol) +# Note: Please use the latest version of pandas, this version should support returning to pd.Series after rolling +# To get a faster speed, we split the big dataframe into small ones +# Then using different process to calculate the variance +# We use 20 process to calculate variance, you can change the number of process according to your CPU situation +# You can use the following code to check your CPU situation +# import multiprocessing +# multiprocessing.cpu_count() + +import pandas as pd +import numpy as np +import datetime as dt +import wrds +from dateutil.relativedelta import * +from pandas.tseries.offsets import * +import datetime +import pickle as pkl +import multiprocessing as mp + +################### +# Connect to WRDS # +################### +conn = wrds.Connection() + +# CRSP Block +crsp = conn.raw_sql(""" + select a.permno, a.date, a.vol, a.prc + from crsp.dsf as a + where a.date > '01/01/1959' + """) + +# sort variables by permno and date +crsp = crsp.sort_values(by=['permno', 'date']) + +# change variable format to int +crsp['permno'] = crsp['permno'].astype(int) + +# 
Line up date to be end of month +crsp['date'] = pd.to_datetime(crsp['date']) + +# find the closest trading day to the end of the month +crsp['monthend'] = crsp['date'] + MonthEnd(0) +crsp['date_diff'] = crsp['monthend'] - crsp['date'] +date_temp = crsp.groupby(['permno', 'monthend'])['date_diff'].min() +date_temp = pd.DataFrame(date_temp) # convert Series to DataFrame +date_temp.reset_index(inplace=True) +date_temp.rename(columns={'date_diff': 'min_diff'}, inplace=True) +crsp = pd.merge(crsp, date_temp, how='left', on=['permno', 'monthend']) +crsp['sig'] = np.where(crsp['date_diff'] == crsp['min_diff'], 1, np.nan) + +# label every date of month end +crsp['month_count'] = crsp[crsp['sig'] == 1].groupby(['permno']).cumcount() + +# label numbers of months for a firm +month_num = crsp[crsp['sig'] == 1].groupby(['permno'])['month_count'].tail(1) +month_num = month_num.astype(int) +month_num = month_num.reset_index(drop=True) + +# mark the number of each month to each day of this month +crsp['month_count'] = crsp.groupby(['permno'])['month_count'].fillna(method='bfill') + +# crate a firm list +df_firm = crsp.drop_duplicates(['permno']) +df_firm = df_firm[['permno']] +df_firm['permno'] = df_firm['permno'].astype(int) +df_firm = df_firm.reset_index(drop=True) +df_firm = df_firm.reset_index() +df_firm = df_firm.rename(columns={'index': 'count'}) +df_firm['month_num'] = month_num + +###################### +# Calculate residual # +###################### + + +def get_baspread(df, firm_list): + """ + + :param df: stock dataframe + :param firm_list: list of firms matching stock dataframe + :return: dataframe with variance of residual + """ + for firm, count, prog in zip(firm_list['permno'], firm_list['month_num'], range(firm_list['permno'].count()+1)): + prog = prog + 1 + print('processing permno %s' % firm, '/', 'finished', '%.2f%%' % ((prog/firm_list['permno'].count())*100)) + for i in range(count + 1): + # if you want to change the rolling window, please change here: i - 2 
means 3 months is a window. + temp = df[(df['permno'] == firm) & (i - 2 <= df['month_count']) & (df['month_count'] <= i)] + if temp['permno'].count() < 21: + pass + else: + index = temp.tail(1).index + X = pd.DataFrame() + X[['prc', 'vol']] = temp[['prc', 'vol']] + std_dolvol = np.log(abs((X['vol']*X['prc']))).replace([np.inf, -np.inf], np.nan).std() + df.loc[index, 'std_dolvol'] = std_dolvol + return df + + +def sub_df(start, end, step): + """ + + :param start: the quantile to start cutting, usually it should be 0 + :param end: the quantile to end cutting, usually it should be 1 + :param step: quantile step + :return: a dictionary including all the 'firm_list' dataframe and 'stock data' dataframe + """ + # we use dict to store different sub dataframe + temp = {} + for i, h in zip(np.arange(start, end, step), range(int((end-start)/step))): + print('processing splitting dataframe:', round(i, 2), 'to', round(i + step, 2)) + if i == 0: # to get the left point + temp['firm' + str(h)] = df_firm[df_firm['count'] <= df_firm['count'].quantile(i + step)] + temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', + on='permno').dropna(subset=['count']) + else: + temp['firm' + str(h)] = df_firm[(df_firm['count'].quantile(i) < df_firm['count']) & ( + df_firm['count'] <= df_firm['count'].quantile(i + step))] + temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', + on='permno').dropna(subset=['count']) + return temp + + +def main(start, end, step): + """ + + :param start: the quantile to start cutting, usually it should be 0 + :param end: the quantile to end cutting, usually it should be 1 + :param step: quantile step + :return: a dataframe with calculated variance of residual + """ + df = sub_df(start, end, step) + pool = mp.Pool() + p_dict = {} + for i in range(int((end-start)/step)): + p_dict['p' + str(i)] = pool.apply_async(get_baspread, (df['crsp%s' % i], df['firm%s' % i],)) + pool.close() + pool.join() + result = pd.DataFrame() + 
print('processing pd.concat') + for h in range(int((end-start)/step)): + result = pd.concat([result, p_dict['p%s' % h].get()]) + return result + + +# calculate variance of residual through rolling window +# Note: please split dataframe according to your CPU situation. For example, we split dataframe to (1-0)/0.05 = 20 sub +# dataframes here, so the function will use 20 cores to calculate variance of residual. +if __name__ == '__main__': + crsp = main(0, 1, 0.05) + +# process dataframe +crsp = crsp.dropna(subset=['std_dolvol']) # drop NA due to rolling +crsp = crsp.reset_index(drop=True) +crsp = crsp[['permno', 'date', 'std_dolvol']] + +with open('std_dolvol.pkl', 'wb') as f: + pkl.dump(crsp, f) \ No newline at end of file diff --git a/char60/std_turn.py b/char60/std_turn.py new file mode 100755 index 0000000..c5d30ec --- /dev/null +++ b/char60/std_turn.py @@ -0,0 +1,158 @@ +# Standard deviation of daily share turnover (std_turn) +# Note: Please use the latest version of pandas, this version should support returning to pd.Series after rolling +# To get a faster speed, we split the big dataframe into small ones +# Then using different process to calculate the variance +# We use 20 process to calculate variance, you can change the number of process according to your CPU situation +# You can use the following code to check your CPU situation +# import multiprocessing +# multiprocessing.cpu_count() + +import pandas as pd +import numpy as np +import datetime as dt +import wrds +from dateutil.relativedelta import * +from pandas.tseries.offsets import * +import datetime +import pickle as pkl +import multiprocessing as mp + +################### +# Connect to WRDS # +################### +conn = wrds.Connection() + +# CRSP Block +crsp = conn.raw_sql(""" + select a.permno, a.date, a.vol, a.shrout + from crsp.dsf as a + where a.date > '01/01/1959' + """) + +# sort variables by permno and date +crsp = crsp.sort_values(by=['permno', 'date']) + +# change variable format to int +crsp['permno'] 
# --- std_turn.py (continued): std deviation of daily share turnover ---

# change variable format to int
crsp['permno'] = crsp['permno'].astype(int)

# Line up date to be end of month
crsp['date'] = pd.to_datetime(crsp['date'])

# Find the closest trading day to the end of the month: flag with 'sig'
# the daily row whose gap to the standardized month end is smallest.
crsp['monthend'] = crsp['date'] + MonthEnd(0)
crsp['date_diff'] = crsp['monthend'] - crsp['date']
date_temp = crsp.groupby(['permno', 'monthend'])['date_diff'].min().reset_index()
date_temp = date_temp.rename(columns={'date_diff': 'min_diff'})
crsp = pd.merge(crsp, date_temp, how='left', on=['permno', 'monthend'])
crsp['sig'] = np.where(crsp['date_diff'] == crsp['min_diff'], 1, np.nan)

# sequential month index per permno, set on month-end rows only
crsp['month_count'] = crsp[crsp['sig'] == 1].groupby(['permno']).cumcount()

# total number of months observed for each firm
month_num = crsp[crsp['sig'] == 1].groupby(['permno'])['month_count'].tail(1)
month_num = month_num.astype(int).reset_index(drop=True)

# propagate each month's index to every daily row of that month
crsp['month_count'] = crsp.groupby(['permno'])['month_count'].fillna(method='bfill')

# create a firm list; 'count' is a running index used by sub_df() to
# slice firms into per-process batches
df_firm = crsp.drop_duplicates(['permno'])[['permno']]
df_firm['permno'] = df_firm['permno'].astype(int)
df_firm = df_firm.reset_index(drop=True).reset_index().rename(columns={'index': 'count'})
df_firm['month_num'] = month_num

#################################
# Calculate turnover volatility #
#################################


def get_baspread(df, firm_list):
    """Rolling 3-month standard deviation of daily share turnover.

    For every firm and every 3-month window containing at least 21
    trading days, stores std(vol / shrout) in 'std_turn' on the
    window's last daily row.

    :param df: daily stock dataframe (permno, month_count, vol, shrout)
    :param firm_list: firms in df with their month totals in 'month_num'
    :return: df with the new 'std_turn' column
    """
    n_firms = firm_list['permno'].count()
    for prog, (firm, count) in enumerate(zip(firm_list['permno'], firm_list['month_num']), 1):
        print('processing permno %s' % firm, '/', 'finished', '%.2f%%' % ((prog / n_firms) * 100))
        for i in range(count + 1):
            # if you want to change the rolling window, change here:
            # i - 2 means 3 months form one window
            temp = df[(df['permno'] == firm) & (i - 2 <= df['month_count']) & (df['month_count'] <= i)]
            if temp['permno'].count() < 21:  # require >= 21 trading days
                continue
            idx = temp.tail(1).index
            df.loc[idx, 'std_turn'] = (temp['vol'] / temp['shrout']).std()
    return df


def sub_df(start, end, step):
    """Split firms (and their daily rows) into quantile buckets.

    :param start: the quantile to start cutting, usually 0
    :param end: the quantile to end cutting, usually 1
    :param step: quantile step; (end-start)/step sub-dataframes result
    :return: dict holding 'firmN' firm lists and 'crspN' stock dataframes
    """
    temp = {}
    for i, h in zip(np.arange(start, end, step), range(int((end - start) / step))):
        print('processing splitting dataframe:', round(i, 2), 'to', round(i + step, 2))
        hi = df_firm['count'].quantile(i + step)
        if i == 0:  # include the left end point in the first bucket
            mask = df_firm['count'] <= hi
        else:
            mask = (df_firm['count'].quantile(i) < df_firm['count']) & (df_firm['count'] <= hi)
        temp['firm' + str(h)] = df_firm[mask]
        temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left',
                                         on='permno').dropna(subset=['count'])
    return temp


def main(start, end, step):
    """Fan the per-bucket work out over a process pool and reassemble.

    :param start: the quantile to start cutting, usually 0
    :param end: the quantile to end cutting, usually 1
    :param step: quantile step
    :return: dataframe with the calculated 'std_turn' column
    """
    df = sub_df(start, end, step)
    pool = mp.Pool()
    p_dict = {}
    for i in range(int((end - start) / step)):
        p_dict['p' + str(i)] = pool.apply_async(get_baspread, (df['crsp%s' % i], df['firm%s' % i],))
    pool.close()
    pool.join()
    result = pd.DataFrame()
    print('processing pd.concat')
    for h in range(int((end - start) / step)):
        result = pd.concat([result, p_dict['p%s' % h].get()])
    return result


# Note: split the dataframe according to your CPU situation; (1-0)/0.05
# = 20 sub-dataframes here, so 20 worker processes are used.
if __name__ == '__main__':
    crsp = main(0, 1, 0.05)

    # post-processing must stay inside the guard: spawned workers
    # re-import this module and would crash on the missing 'std_turn'
    crsp = crsp.dropna(subset=['std_turn'])  # drop NA due to rolling
    crsp = crsp.reset_index(drop=True)
    crsp = crsp[['permno', 'date', 'std_turn']]

    with open('std_turn.pkl', 'wb') as f:
        pkl.dump(crsp, f)

# =====================================================================
# sue.py: Standardized Unexpected Earnings (earnings surprise), HSZ
# =====================================================================

import pandas as pd
import numpy as np
import datetime as dt
import wrds
from dateutil.relativedelta import *
from pandas.tseries.offsets import *
from pandasql import *
import pickle as pkl

###################
# Connect to WRDS #
###################
conn = wrds.Connection()

###################
# Compustat Block #
###################
comp = conn.raw_sql("""
    select gvkey, datadate, fyearq, fqtr, epspxq, ajexq
    from comp.fundq
    where indfmt = 'INDL'
    and datafmt = 'STD'
    and popsrc = 'D'
    and consol = 'C'
    and datadate >= '01/01/1959'
    """)

comp['datadate'] = pd.to_datetime(comp['datadate'])

###################
# CCM Block       #
###################
ccm = conn.raw_sql("""
    select gvkey, lpermno as permno, linktype, linkprim,
    linkdt, linkenddt
    from crsp.ccmxpf_linktable
    where linktype in ('LU', 'LC')
    """)

ccm['linkdt'] = pd.to_datetime(ccm['linkdt'])
ccm['linkenddt'] = pd.to_datetime(ccm['linkenddt'])
# if linkenddt is missing then set to today date
ccm['linkenddt'] = ccm['linkenddt'].fillna(pd.to_datetime('today'))
# if linkenddt is missing then set to today date
ccm['linkenddt'] = ccm['linkenddt'].fillna(pd.to_datetime('today'))

ccm1 = pd.merge(comp, ccm, how='left', on=['gvkey'])

# keep rows whose datadate falls inside the CCM link validity window
ccm2 = ccm1[(ccm1['datadate'] >= ccm1['linkdt']) & (ccm1['datadate'] <= ccm1['linkenddt'])]
ccm2 = ccm2[['gvkey', 'permno', 'datadate', 'fyearq', 'fqtr', 'epspxq', 'ajexq']]

# the time series of split-adjusted EPS (epspxq/ajexq)
ccm2['eps'] = ccm2['epspxq'] / ccm2['ajexq']
ccm2.drop_duplicates(['permno', 'datadate'], inplace=True)

# Order the panel BEFORE any cumcount/shift so the quarter counter and
# the lag structure are deterministic (the original counted before
# sorting, which made 'count' depend on the incoming row order).
ccm2 = ccm2[ccm2['eps'].notna()]
ccm2.sort_values(by=['permno', 'datadate'], inplace=True)
ccm2['count'] = ccm2.groupby('permno').cumcount() + 1

# quarterly lags 1..8 of eps; the volatility below uses lags 3..8
for lag in range(1, 9):
    ccm2['e%d' % lag] = ccm2.groupby(['permno'])['eps'].shift(lag)

# standard deviation of past eps: require at least 6 prior quarters
condlist = [ccm2['count'] <= 6,
            ccm2['count'] == 7,
            ccm2['count'] == 8,
            ccm2['count'] >= 9]
choicelist = [np.nan,
              ccm2[['e8', 'e7', 'e6', 'e5', 'e4', 'e3']].std(axis=1),
              ccm2[['e8', 'e7', 'e6', 'e5', 'e4', 'e3', 'e2']].std(axis=1),
              ccm2[['e8', 'e7', 'e6', 'e5', 'e4', 'e3', 'e2', 'e1']].std(axis=1)]
ccm2['sue_std'] = np.select(condlist, choicelist, default=np.nan)

# SUE: year-over-year eps change scaled by its volatility
ccm2['sue'] = (ccm2['eps'] - ccm2['e4']) / ccm2['sue_std']

# populate the quarterly sue to monthly
crsp_msf = conn.raw_sql("""
    select distinct date
    from crsp.msf
    where date >= '01/01/1959'
    """)

ccm2['datadate'] = pd.to_datetime(ccm2['datadate'])
# exact 12 calendar months; np.timedelta64(12, 'M') is an average-length
# month and can land one month off around month ends
ccm2['plus12m'] = ccm2['datadate'] + pd.DateOffset(months=12)
ccm2['plus12m'] = ccm2['plus12m'] + MonthEnd(0)

df = sqldf("""select a.*, b.date
              from ccm2 a left join crsp_msf b
              on a.datadate <= b.date
              and a.plus12m >= b.date
              order by a.permno, b.date, a.datadate desc;""", globals())

# one sue per permno-month: the descending datadate order above makes
# drop_duplicates keep the most recent announcement for each month
df = df.drop_duplicates(['permno', 'date'])
df['datadate'] = pd.to_datetime(df['datadate'])
df = df[['gvkey', 'permno', 'datadate', 'date', 'sue']]

with open('sue.pkl', 'wb') as f:
    pkl.dump(df, f)

# =====================================================================
# zerotrade.py: turnover-adjusted number of zero-trading days
# (multiprocessing layout identical to std_turn.py; split the work
# across processes according to your CPU count)
# =====================================================================

import pandas as pd
import numpy as np
import datetime as dt
import wrds
from dateutil.relativedelta import *
from pandas.tseries.offsets import *
import datetime
import pickle as pkl
import multiprocessing as mp

###################
# Connect to WRDS #
###################
conn = wrds.Connection()

# CRSP Block
crsp = conn.raw_sql("""
    select a.permno, a.date, a.vol, a.shrout
    from crsp.dsf as a
    where a.date > '01/01/1959'
    """)

# sort variables by permno and date
crsp = crsp.sort_values(by=['permno', 'date'])

# change variable format to int
crsp['permno'] = crsp['permno'].astype(int)

# Line up date to be end of month
crsp['date'] = pd.to_datetime(crsp['date'])

# find the closest trading day to the end of the month
crsp['monthend'] = crsp['date'] + MonthEnd(0)
crsp['date_diff'] = crsp['monthend'] - crsp['date']
date_temp = crsp.groupby(['permno', 'monthend'])['date_diff'].min().reset_index()
date_temp = date_temp.rename(columns={'date_diff': 'min_diff'})
crsp = pd.merge(crsp, date_temp, how='left', on=['permno', 'monthend'])
# 'sig' flags the daily row closest to each month end
crsp['sig'] = np.where(crsp['date_diff'] == crsp['min_diff'], 1, np.nan)

# sequential month index per permno, set on month-end rows only
crsp['month_count'] = crsp[crsp['sig'] == 1].groupby(['permno']).cumcount()

# total number of months observed for each firm
month_num = crsp[crsp['sig'] == 1].groupby(['permno'])['month_count'].tail(1)
month_num = month_num.astype(int).reset_index(drop=True)

# propagate each month's index to every daily row of that month
crsp['month_count'] = crsp.groupby(['permno'])['month_count'].fillna(method='bfill')

# create a firm list; 'count' is a running index used by sub_df() to
# slice firms into per-process batches
df_firm = crsp.drop_duplicates(['permno'])[['permno']]
df_firm['permno'] = df_firm['permno'].astype(int)
df_firm = df_firm.reset_index(drop=True).reset_index().rename(columns={'index': 'count'})
df_firm['month_num'] = month_num

###############################
# Calculate zero-trading days #
###############################


def get_baspread(df, firm_list):
    """Turnover-adjusted number of zero-trading days, rolling 3 months.

    For each firm and each 3-month window with at least 21 trading days,
    evaluates (countzero + (1/turnover)/480000) * 21 / #days and stores
    the result in 'zerotrade' on the window's last daily row.

    :param df: daily stock dataframe (permno, month_count, vol, shrout)
    :param firm_list: firms in df with their month totals in 'month_num'
    :return: df with the new 'zerotrade' column
    """
    n_firms = firm_list['permno'].count()
    for prog, (firm, count) in enumerate(zip(firm_list['permno'], firm_list['month_num']), 1):
        print('processing permno %s' % firm, '/', 'finished', '%.2f%%' % ((prog / n_firms) * 100))
        for i in range(count + 1):
            # if you want to change the rolling window, change here:
            # i - 2 means 3 months form one window
            temp = df[(df['permno'] == firm) & (i - 2 <= df['month_count']) & (df['month_count'] <= i)]
            if temp['permno'].count() < 21:  # require >= 21 trading days
                continue
            index = temp.tail(1).index
            X = pd.DataFrame()
            X[['vol', 'shrout']] = temp[['vol', 'shrout']]
            X['countzero'] = np.where(X['vol'] == 0, 1, 0)
            X['turn'] = (X['vol'] / X['shrout'])
            # zero-volume days have zero turnover; map them to inf so
            # the 1/turnover term contributes 0 for those days
            X['turn'] = np.where(X['turn'] == 0, np.inf, X['turn'])
            zerotrade = (X['countzero'] + ((1 / X['turn']) / 480000)) * 21 / X['vol'].count()
            # NOTE(review): zerotrade is a per-day Series here; the .loc
            # assignment keeps only the last day's value through index
            # alignment — confirm this matches the intended aggregation.
            df.loc[index, 'zerotrade'] = zerotrade
    return df


def sub_df(start, end, step):
    """Split firms (and their daily rows) into quantile buckets.

    :param start: the quantile to start cutting, usually 0
    :param end: the quantile to end cutting, usually 1
    :param step: quantile step; (end-start)/step sub-dataframes result
    :return: dict holding 'firmN' firm lists and 'crspN' stock dataframes
    """
    temp = {}
    for i, h in zip(np.arange(start, end, step), range(int((end - start) / step))):
        print('processing splitting dataframe:', round(i, 2), 'to', round(i + step, 2))
        hi = df_firm['count'].quantile(i + step)
        if i == 0:  # include the left end point in the first bucket
            mask = df_firm['count'] <= hi
        else:
            mask = (df_firm['count'].quantile(i) < df_firm['count']) & (df_firm['count'] <= hi)
        temp['firm' + str(h)] = df_firm[mask]
        temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left',
                                         on='permno').dropna(subset=['count'])
    return temp


def main(start, end, step):
    """Fan the per-bucket work out over a process pool and reassemble.

    :param start: the quantile to start cutting, usually 0
    :param end: the quantile to end cutting, usually 1
    :param step: quantile step
    :return: dataframe with the calculated 'zerotrade' column
    """
    df = sub_df(start, end, step)
    pool = mp.Pool()
    p_dict = {}
    for i in range(int((end - start) / step)):
        p_dict['p' + str(i)] = pool.apply_async(get_baspread, (df['crsp%s' % i], df['firm%s' % i],))
    pool.close()
    pool.join()
    result = pd.DataFrame()
    print('processing pd.concat')
    for h in range(int((end - start) / step)):
        result = pd.concat([result, p_dict['p%s' % h].get()])
    return result


# Note: split the dataframe according to your CPU situation; (1-0)/0.05
# = 20 sub-dataframes here, so 20 worker processes are used.
if __name__ == '__main__':
    crsp = main(0, 1, 0.05)

    # post-processing must stay inside the guard: spawned workers
    # re-import this module and would crash on the missing 'zerotrade'
    crsp = crsp.dropna(subset=['zerotrade'])  # drop NA due to rolling
    crsp = crsp.reset_index(drop=True)
    crsp = crsp[['permno', 'date', 'zerotrade']]

    with open('zerotrade.pkl', 'wb') as f:
        pkl.dump(crsp, f)

##########################################
# dgtw.py: Characteristics-Based         #
# Benchmarks, May 2018                   #
# Qingyi (Freda) Song Drechsler          #
##########################################

import pandas as pd
import numpy as np
import datetime as dt
import wrds
import psycopg2
import matplotlib.pyplot as plt
from dateutil.relativedelta import *
from pandas.tseries.offsets import *
from scipy import stats

###################
# Connect to WRDS #
###################
conn = wrds.Connection()

###################
# CRSP Block      #
###################
# sql similar to crspmerge macro
crsp_m = conn.raw_sql("""
    select a.permno, a.permco, b.ncusip, a.date,
    b.shrcd, b.exchcd, b.siccd,
    a.ret, a.vol, a.shrout, a.prc, a.cfacpr, a.cfacshr
    from crsp.msf as a
    left join crsp.msenames as b
    on a.permno=b.permno
    and b.namedt<=a.date
    and a.date<=b.nameendt
    where a.date between '01/01/1970' and '12/31/2017'
    and b.shrcd between 10 and 11
    """)

# change variable format to int
+crsp_m[['permco','permno','shrcd','exchcd']]=\ + crsp_m[['permco','permno','shrcd','exchcd']].astype(int) + +# Line up date to be end of month +crsp_m['date']=pd.to_datetime(crsp_m['date']) +crsp_m['jdate']=crsp_m['date']+MonthEnd(0) +crsp_m['p']=crsp_m['prc'].abs()/crsp_m['cfacpr'] # price adjusted +crsp_m['tso']=crsp_m['shrout']*crsp_m['cfacshr']*1e3 # total shares out adjusted +crsp_m['me'] = crsp_m['p']*crsp_m['tso']/1e6 # market cap in $mil + +# sum of me across different permno belonging to same permco a given date +crsp_summe = crsp_m.groupby(['jdate','permco'])['me'].sum().reset_index()\ + .rename(columns={'me':'me_comp'}) +crsp_m=pd.merge(crsp_m, crsp_summe, how='inner', on=['jdate','permco']) + + +################### +# Compustat Block # +################### + +comp = conn.raw_sql(""" + select gvkey, datadate, cusip, + sich, seq, pstkrv, pstkl, pstk, txdb, itcb + from comp.funda + where indfmt='INDL' + and datafmt='STD' + and popsrc='D' + and consol='C' + and datadate >= '01/01/1970' + """) + +comp['datadate']=pd.to_datetime(comp['datadate']) #convert datadate to date fmt +comp['year']=comp['datadate'].dt.year + +comp = comp[comp['seq']>0] + +# create preferrerd stock: +# 1st choice: Preferred stock - Redemption Value +# 2nd choice: Preferred stock - Liquidating Value +# 3rd choice: Preferred stock - Carrying Value, Stock (Capital) - Total +comp['pref']=np.where(comp['pstkrv'].isnull(), comp['pstkl'], comp['pstkrv']) +comp['pref']=np.where(comp['pref'].isnull(),comp['pstk'], comp['pref']) +comp['pref']=np.where(comp['pref'].isnull(),0,comp['pref']) + +# fill in missing values for deferred taxes and investment tax credit +comp['txdb']=comp['txdb'].fillna(0) +comp['itcb']=comp['itcb'].fillna(0) + +# create book equity +# Daniel and Titman (JF 1997): +# BE = stockholders' equity + deferred taxes + investment tax credit - Preferred Stock +comp['be']=comp['seq']+comp['txdb']+comp['itcb']-comp['pref'] + +# keep only records with non-negative book equity +comp 
= comp[comp['be']>=0] +comp=comp[['gvkey','datadate','year','be','sich']] + + +######################### +# Add Historical PERMCO # +######################### +ccm=conn.raw_sql(""" + select gvkey, lpermco as permco, linktype, linkprim, + linkdt, linkenddt + from crsp.ccmxpf_linktable + where (linktype ='LU' or linktype='LC') + """) + +ccm['linkdt']=pd.to_datetime(ccm['linkdt']) +ccm['linkenddt']=pd.to_datetime(ccm['linkenddt']) +# if linkenddt is missing then set to today date +ccm['linkenddt']=ccm['linkenddt'].fillna(pd.to_datetime('today')) + +ccm1=pd.merge(comp,ccm,how='left',on=['gvkey']) +ccm1['jdate']=ccm1['datadate']+MonthEnd(0) +ccm1['year']=ccm1.datadate.dt.year + +# set link date bounds +comp2=ccm1[(ccm1['datadate']>=ccm1['linkdt'])&(ccm1['datadate']<=ccm1['linkenddt'])] +comp2=comp2[['gvkey','permco','datadate', 'year','jdate', 'be', 'sich', 'linkprim']] + + +# link comp and crsp to calculate book-to-market ratio each fiscal year end +comp3=pd.merge(comp2, crsp_m[['permno','permco','date','jdate','siccd','me','me_comp']],\ + how='inner', on=['permco', 'jdate']) +comp3['bm']=comp3['be'].div(comp3['me_comp']) + +comp3 = comp3.sort_values(['permno', 'year', 'datadate', 'linkprim', 'bm'])\ + .drop_duplicates() + +# pick max datadate for a given permno year combo (firm changes fiscal period) +maxdatadate=comp3.groupby(['permno','year'])['datadate'].max()\ + .reset_index() + +comp3 = pd.merge(comp3, maxdatadate, how='inner', on=['permno','year','datadate']) + +######################### +# Assign Fama-French 48 # +######################### + +# function to assign ffi48 classification +def ffi48(row): + if (100<=row['sic'] <=299) or (700<=row['sic']<=799) or (910<=row['sic']<=919) or (row['sic']==2048): + ffi48=1 + ffi48_desc='Agric' + elif (2000<=row['sic']<=2046) or (2050<=row['sic']<=2063) or (2070<=row['sic']<=2079)\ + or (2090<=row['sic']<=2092) or (row['sic']==2095) or (2098<=row['sic']<=2099): + ffi48=2 + ffi48_desc='Food' + elif (2064<=row['sic']<=2068) 
or (2086<=row['sic']<=2087) or (2096<=row['sic']<=2097): + ffi48=3 + ffi48_desc='Soda' + elif (row['sic']==2080) or (2082<=row['sic']<=2085): + ffi48=4 + ffi48_desc='Beer' + elif (2100<=row['sic']<=2199): + ffi48=5 + ffi48_desc='Smoke' + elif (920<=row['sic']<=999) or (3650<=row['sic']<=3652) or (row['sic']==3732) or (3930<=row['sic']<=3931) or (3940<=row['sic']<=3949): + ffi48=6 + ffi48_desc='Toys' + elif (7800<=row['sic']<=7833) or (7840<=row['sic']<=7841) or(row['sic']==7900)or (7910<=row['sic']<=7911) or (7920<=row['sic']<=7933)\ + or (7940<=row['sic']<=7949) or (row['sic']==7980) or (7990<=row['sic']<=7999): + ffi48=7 + ffi48_desc='Fun' + elif (2700<=row['sic']<=2749) or (2770<=row['sic']<=2771) or (2780<=row['sic']<=2799): + ffi48=8 + ffi48_desc='Books' + elif (row['sic']==2047) or (2391<=row['sic']<=2392) or (2510<=row['sic']<=2519) or (2590<=row['sic']<=2599) or (2840<=row['sic']<=2844)\ + or (3160<=row['sic']<=3161) or (3170<=row['sic']<=3172) or (3190<=row['sic']<=3199) or (row['sic']==3229) or (row['sic']==3260)\ + or (3262<=row['sic']<=3263) or (row['sic']==3269) or (3230<=row['sic']<=3231) or(3630<=row['sic']<=3639) or (3750<=row['sic']<=3751)\ + or (row['sic']==3800) or (3860<=row['sic']<=3861) or (3870<=row['sic']<=3873) or (3910<=row['sic']<=3911) or (3914<=row['sic']<=3915)\ + or (3960<=row['sic']<=3962) or (row['sic']==3991) or (row['sic']==3995): + ffi48=9 + ffi48_desc='Hshld' + elif (2300<=row['sic']<=2390) or (3020<=row['sic']<=3021) or (3100<=row['sic']<=3111)\ + or (3130<=row['sic']<=3131) or (3140<=row['sic']<=3151) or (3963<=row['sic']<=3965): + ffi48=10 + ffi48_desc='Clths' + elif (8000<=row['sic']<=8099): + ffi48=11 + ffi48_desc='Hlth' + elif (row['sic']==3693) or (3840<=row['sic']<=3851): + ffi48=12 + ffi48_desc='MedEq' + elif (2830<=row['sic']<=2831) or (2833<=row['sic']<=2836): + ffi48=13 + ffi48_desc='Drugs' + elif (2800<=row['sic']<=2829) or (2850<=row['sic']<=2879) or (2890<=row['sic']<=2899): + ffi48=14 + ffi48_desc='Chems' + elif 
(row['sic']==3031) or (row['sic']==3041) or (3050<=row['sic']<=3053) or (3060<=row['sic']<=3069) or (3070<=row['sic']<=3099): + ffi48=15 + ffi48_desc='Rubbr' + elif (2200<=row['sic']<=2284) or (2290<=row['sic']<=2295) or (2297<=row['sic']<=2299) or (2393<=row['sic']<=2395) or (2397<=row['sic']<=2399): + ffi48=16 + ffi48_desc='Txtls' + elif (800<=row['sic']<=899) or (2400<=row['sic']<=2439) or (2450<=row['sic']<=2459) or (2490<=row['sic']<=2499) or (2660<=row['sic']<=2661)\ + or (2950<=row['sic']<=2952) or (row['sic']==3200) or (3210<=row['sic']<=3211) or (3240<=row['sic']<=3241) or (3250<=row['sic']<=3259)\ + or (row['sic']==3261) or (row['sic']==3264) or (3270<=row['sic']<=3275) or (3280<=row['sic']<=3281) or (3290<=row['sic']<=3293)\ + or (3295<=row['sic']<=3299) or (3420<=row['sic']<=3433) or (3440<=row['sic']<=3442) or (row['sic']==3446) or (3448<=row['sic']<=3452)\ + or (3490<=row['sic']<=3499) or (row['sic']==3996): + ffi48=17 + ffi48_desc='BldMt' + elif (1500<=row['sic']<=1511) or (1520<=row['sic']<=1549) or (1600<=row['sic']<=1799): + ffi48=18 + ffi48_desc='Cnstr' + elif (row['sic']==3300) or (3310<=row['sic']<=3317) or (3320<=row['sic']<=3325) or (3330<=row['sic']<=3341) or(3350<=row['sic']<=3357)\ + or (3360<=row['sic']<=3379) or (3390<=row['sic']<=3399): + ffi48=19 + ffi48_desc='Steel' + elif (row['sic']==3400) or (3443<=row['sic']<=3444) or (3460<=row['sic']<=3479): + ffi48=20 + ffi48_desc='FabPr' + elif (3510<=row['sic']<=3536) or (row['sic']==3538) or (3540<=row['sic']<=3569)\ + or (3580<=row['sic']<=3582) or (3585<=row['sic']<=3586) or (3589<=row['sic']<=3599): + ffi48=21 + ffi48_desc='Mach' + elif (row['sic']==3600) or (3610<=row['sic']<=3613) or (3620<=row['sic']<=3621) or (3623<=row['sic']<=3629) or (3640<=row['sic']<=3646)\ + or (3648<=row['sic']<=3649) or (row['sic']==3660) or (3690<=row['sic']<=3692) or (row['sic']==3699): + ffi48=22 + ffi48_desc='ElcEq' + elif (row['sic']==2296) or (row['sic']==2396) or (3010<=row['sic']<=3011) or 
(row['sic']==3537) or (row['sic']==3647) or (row['sic']==3694)\ + or (row['sic']==3700) or (3710<=row['sic']<=3711) or (3713<=row['sic']<=3716) or (3790<=row['sic']<=3792) or (row['sic']==3799): + ffi48=23 + ffi48_desc='Autos' + elif (3720<=row['sic']<=3721) or (3723<=row['sic']<=3725) or (3728<=row['sic']<=3729): + ffi48=24 + ffi48_desc='Aero' + elif (3730<=row['sic']<=3731) or (3740<=row['sic']<=3743): + ffi48=25 + ffi48_desc='Ships' + elif (3760<=row['sic']<=3769) or (row['sic']==3795) or (3480<=row['sic']<=3489): + ffi48=26 + ffi48_desc='Guns' + elif (1040<=row['sic']<=1049): + ffi48=27 + ffi48_desc='Gold' + elif (1000<=row['sic']<=1039) or (1050<=row['sic']<=1119) or (1400<=row['sic']<=1499): + ffi48=28 + ffi48_desc='Mines' + elif (1200<=row['sic']<=1299): + ffi48=29 + ffi48_desc='Coal' + elif (row['sic']==1300) or (1310<=row['sic']<=1339) or (1370<=row['sic']<=1382) or (row['sic']==1389) or (2900<=row['sic']<=2912) or (2990<=row['sic']<=2999): + ffi48=30 + ffi48_desc='Oil' + elif (row['sic']==4900) or (4910<=row['sic']<=4911) or (4920<=row['sic']<=4925) or (4930<=row['sic']<=4932) or (4939<=row['sic']<=4942): + ffi48=31 + ffi48_desc='Util' + elif (row['sic']==4800) or (4810<=row['sic']<=4813) or (4820<=row['sic']<=4822) or (4830<=row['sic']<=4841) or (4880<=row['sic']<=4892) or (row['sic']==4899): + ffi48=32 + ffi48_desc='Telcm' + elif (7020<=row['sic']<=7021) or (7030<=row['sic']<=7033) or (row['sic']==7200) or (7210<=row['sic']<=7212) or (7214<=row['sic']<=7217)\ + or (7219<=row['sic']<=7221) or (7230<=row['sic']<=7231) or (7240<=row['sic']<=7241) or (7250<=row['sic']<=7251) or (7260<=row['sic']<=7299)\ + or (row['sic']==7395) or (row['sic']==7500) or (7520<=row['sic']<=7549) or (row['sic']==7600) or (row['sic']==7620)\ + or (7622<=row['sic']<=7623) or (7629<=row['sic']<=7631) or (7640<=row['sic']<=7641) or (7690<=row['sic']<=7699) or (8100<=row['sic']<=8499)\ + or (8600<=row['sic']<=8699) or (8800<=row['sic']<=8899) or (7510<=row['sic']<=7515): + ffi48=33 
+ ffi48_desc='PerSv' + elif (2750<=row['sic']<=2759) or (row['sic']==3993) or (row['sic']==7218) or (row['sic']==7300) or (7310<=row['sic']<=7342)\ + or (7349<=row['sic']<=7353) or (7359<=row['sic']<=7372) or (7374<=row['sic']<=7385) or (7389<=row['sic']<=7394) or (7396<=row['sic']<=7397)\ + or (row['sic']==7399) or (row['sic']==7519) or (row['sic']==8700) or (8710<=row['sic']<=8713) or (8720<=row['sic']<=8721) \ + or (8730<=row['sic']<=8734) or (8740<=row['sic']<=8748) or (8900<=row['sic']<=8911) or (8920<=row['sic']<=8999) or (4220<=row['sic']<=4229): + ffi48=34 + ffi48_desc='BusSv' + elif (3570<=row['sic']<=3579) or (3680<=row['sic']<=3689) or (row['sic']==3695) or (row['sic']==7373): + ffi48=35 + ffi48_desc='Comps' + elif (row['sic']==3622) or (3661<=row['sic']<=3666) or (3669<=row['sic']<=3679) or (row['sic']==3810) or (row['sic']==3812): + ffi48=36 + ffi48_desc='Chips' + elif (row['sic']==3811) or (3820<=row['sic']<=3827) or (3829<=row['sic']<=3839): + ffi48=37 + ffi48_desc='LabEq' + elif (2520<=row['sic']<=2549) or (2600<=row['sic']<=2639) or (2670<=row['sic']<=2699) or (2760<=row['sic']<=2761) or (3950<=row['sic']<=3955): + ffi48=38 + ffi48_desc='Paper' + elif (2440<=row['sic']<=2449) or (2640<=row['sic']<=2659) or (3220<=row['sic']<=3221) or (3410<=row['sic']<=3412): + ffi48=39 + ffi48_desc='Boxes' + elif (4000<=row['sic']<=4013) or (4040<=row['sic']<=4049) or (row['sic']==4100) or (4110<=row['sic']<=4121) or (4130<=row['sic']<=4131)\ + or (4140<=row['sic']<=4142) or (4150<=row['sic']<=4151) or (4170<=row['sic']<=4173) or (4190<=row['sic']<=4200)\ + or (4210<=row['sic']<=4219) or (4230<=row['sic']<=4231) or (4240<=row['sic']<=4249) or (4400<=row['sic']<=4700) or (4710<=row['sic']<=4712)\ + or (4720<=row['sic']<=4749) or (row['sic']==4780) or (4782<=row['sic']<=4785) or (row['sic']==4789): + ffi48=40 + ffi48_desc='Trans' + elif (row['sic']==5000) or (5010<=row['sic']<=5015) or (5020<=row['sic']<=5023) or (5030<=row['sic']<=5060) or (5063<=row['sic']<=5065)\ 
+ or (5070<=row['sic']<=5078) or (5080<=row['sic']<=5088) or (5090<=row['sic']<=5094) or (5099<=row['sic']<=5100)\ + or (5110<=row['sic']<=5113) or (5120<=row['sic']<=5122) or (5130<=row['sic']<=5172) or (5180<=row['sic']<=5182) or (5190<=row['sic']<=5199): + ffi48=41 + ffi48_desc='Whlsl' + elif (row['sic']==5200) or (5210<=row['sic']<=5231) or (5250<=row['sic']<=5251) or (5260<=row['sic']<=5261) or (5270<=row['sic']<=5271)\ + or (row['sic']==5300) or (5310<=row['sic']<=5311) or (row['sic']==5320) or (5330<=row['sic']<=5331) or (row['sic']==5334)\ + or (5340<=row['sic']<=5349) or (5390<=row['sic']<=5400) or (5410<=row['sic']<=5412) or (5420<=row['sic']<=5469) or (5490<=row['sic']<=5500)\ + or (5510<=row['sic']<=5579) or (5590<=row['sic']<=5700) or (5710<=row['sic']<=5722) or (5730<=row['sic']<=5736) or (5750<=row['sic']<=5799)\ + or (row['sic']==5900) or (5910<=row['sic']<=5912) or (5920<=row['sic']<=5932) or (5940<=row['sic']<=5990) or (5992<=row['sic']<=5995) or (row['sic']==5999): + ffi48=42 + ffi48_desc='Rtail' + elif (5800<=row['sic']<=5829) or (5890<=row['sic']<=5899) or (row['sic']==7000) or (7010<=row['sic']<=7019) or (7040<=row['sic']<=7049) or (row['sic']==7213): + ffi48=43 + ffi48_desc='Meals' + elif (row['sic']==6000) or (6010<=row['sic']<=6036) or (6040<=row['sic']<=6062) or (6080<=row['sic']<=6082) or (6090<=row['sic']<=6100)\ + or (6110<=row['sic']<=6113) or (6120<=row['sic']<=6179) or (6190<=row['sic']<=6199): + ffi48=44 + ffi48_desc='Banks' + elif (row['sic']==6300) or (6310<=row['sic']<=6331) or (6350<=row['sic']<=6351) or (6360<=row['sic']<=6361) or (6370<=row['sic']<=6379) or (6390<=row['sic']<=6411): + ffi48=45 + ffi48_desc='Insur' + elif (row['sic']==6500) or (row['sic']==6510) or (6512<=row['sic']<=6515) or (6517<=row['sic']<=6532) or (6540<=row['sic']<=6541)\ + or (6550<=row['sic']<=6553) or (6590<=row['sic']<=6599) or (6610<=row['sic']<=6611): + ffi48=46 + ffi48_desc='RlEst' + elif (6200<=row['sic']<=6299) or (row['sic']==6700) or 
(6710<=row['sic']<=6726) or (6730<=row['sic']<=6733) or (6740<=row['sic']<=6779)\ + or (6790<=row['sic']<=6795) or (6798<=row['sic']<=6799): + ffi48=47 + ffi48_desc='Fin' + elif (4950<=row['sic']<=4961) or (4970<=row['sic']<=4971) or (4990<=row['sic']<=4991) or (row['sic']==9999): + ffi48=48 + ffi48_desc='Other' + else: + ffi48=np.nan + ffi48_desc='' + return pd.Series({'sic': row['sic'], 'ffi48': ffi48, 'ffi48_desc': ffi48_desc}) + +# assign SIC code +comp4 = comp3 +# First use historical Compustat SIC Code +# Then if missing use historical CRSP SIC Code +comp4['sic']=np.where(comp4['sich']>0, comp4['sich'], comp4['siccd']) + +# and adjust some SIC code to fit F&F 48 ind delineation +comp4['sic']=np.where((comp4['sic'].isin([3990, 9995, 9997])) & (comp4['siccd']>0) & (comp4['sic'] != comp4['siccd']), \ + comp4['siccd'], comp4['sic']) +comp4['sic']=np.where(comp4['sic'].isin([3990,3999]), 3991, comp4['sic']) +comp4['sic']=comp4.sic.astype(int) + +# assign the ffi48 function to comp4 +_sic = comp4['sic'].unique() +_sicff = pd.DataFrame(_sic).rename(columns={0:'sic'}) +_sicff = _sicff.apply(ffi48, axis=1) +comp4 = pd.merge(comp4, _sicff, how='left', on=['sic']) + +# keep only records with non-missing bm and ffi48 classification +comp4 = comp4[(comp4['bm'] != np.NaN) & (comp4['ffi48_desc'] !='')] +comp4 = comp4.drop(['sich','siccd','datadate'], axis=1) +comp4=comp4.sort_values(['ffi48','year']) + + +######################### +# Industry BM Average # +######################### + +# Calculate BM Industry Average Each Period +comp4_tmp = comp4[(comp4['ffi48']>0)&(comp4['bm']>=0)] +bm_ind = comp4_tmp.groupby(['ffi48','year'])['bm'].mean().reset_index().rename(columns={'bm':'bmind'}) + +# Calculate Long-Term Industry BtM Average +bm_ind['n'] = bm_ind.groupby(['ffi48'])['year'].cumcount() +bm_ind['sumbm']=bm_ind.groupby(['ffi48'])['bmind'].cumsum() +bm_ind['bmavg'] = bm_ind['sumbm']/(bm_ind['n']+1) +bm_ind = bm_ind.drop(['n','sumbm'], axis=1) + +# Adjust Firm-Specific BtM 
with Industry Averages +comp5 = pd.merge(comp4, bm_ind, how='left',on=['ffi48','year']) +comp5['bm_adj'] = comp5['bm']-comp5['bmavg'] + + +######################### +# Momentum Factor # +######################### + +# Create (12,1) Momentum Factor with at least 6 months of returns +_tmp_crsp = crsp_m[['permno','date','ret', 'me', 'exchcd']].sort_values(['permno','date']).set_index('date') +#replace missing return with 0 +_tmp_crsp['ret']=_tmp_crsp['ret'].fillna(0) +_tmp_crsp['logret']=np.log(1+_tmp_crsp['ret']) +_tmp_cumret = _tmp_crsp.groupby(['permno'])['logret'].rolling(12, min_periods=7).sum() +_tmp_cumret = _tmp_cumret.reset_index() +_tmp_cumret['cumret']=np.exp(_tmp_cumret['logret'])-1 + +sizemom = pd.merge(_tmp_crsp.reset_index(), _tmp_cumret[['permno','date','cumret']], how='left', on=['permno','date']) +sizemom['mom']=sizemom.groupby('permno')['cumret'].shift(1) +sizemom=sizemom[sizemom['date'].dt.month==6].drop(['logret','cumret'], axis=1).rename(columns={'me':'size'}) + + +######################### +# NYSE Size Breakpoint # +######################### + +# Get Size Breakpoints for NYSE firms +sizemom=sizemom.sort_values(['date','permno']).drop_duplicates() +nyse = sizemom[sizemom['exchcd']==1] +nyse_break = nyse.groupby(['date'])['size'].describe(percentiles=[.2,.4,.6,.8]).reset_index() +nyse_break = nyse_break[['date','20%','40%','60%','80%']]\ +.rename(columns={'20%':'dec20', '40%':'dec40', '60%':'dec60','80%':'dec80'}) + +sizemom = pd.merge(sizemom, nyse_break, how='left', on='date') + +# Add NYSE Size Breakpoints to the Data +def size_group(row): + if 0<=row['size'] < row['dec20']: + value = 1 + elif row['size'] < row['dec40']: + value=2 + elif row['size'] < row['dec60']: + value=3 + elif row['size'] < row['dec80']: + value=4 + elif row['size'] >= row['dec80']: + value=5 + else: + value=np.nan + return value + +sizemom['group']=sizemom.apply(size_group, axis=1) +sizemom['year']=sizemom['date'].dt.year-1 
+sizemom=sizemom[['permno','date','year','mom','group','size','ret']] + +# Adjusted BtM from the calendar year preceding the formation date +comp6=comp5[['gvkey','permno','year','bm_adj']] +comp6=pd.merge(comp6, sizemom, how='inner', on=['permno','year']) +comp6=comp6.dropna(subset=['size','mom','bm_adj','ret'], how='any') + +######################### +# Size BM MOM Portfolio # +######################### + +# Start the Triple Sort on Size, Book-to-Market, and Momentum +port1=comp6.sort_values(['date','group','permno']).drop_duplicates() +port1['bmr']=port1.groupby(['date','group'])['bm_adj'].transform(lambda x: pd.qcut(x, 5, labels=False, duplicates='drop')) +port2 = port1.sort_values(['date','group','bmr']) +port2['momr']=port2.groupby(['date','group','bmr'])['mom'].transform(lambda x: pd.qcut(x, 5, labels=False, duplicates='drop')) + +# DGTW_PORT 1 for Bottom Quintile, 5 for Top Quintile +port3=port2 +port3['bmr']=port3['bmr']+1 +port3['momr']=port3['momr']+1 +port3[['group','bmr','momr']]=port3[['group','bmr','momr']].astype(int).astype(str) +port3['dgtw_port']=port3['group']+port3['bmr']+port3['momr'] +port4 = port3[['permno','gvkey','date','size','mom','bm_adj','dgtw_port']] +port4['date']=port4['date']+MonthEnd(0) +port4['jyear']=port4['date'].dt.year +port4=port4.sort_values(['permno','date']) +port4=port4.rename(columns={'date':'formdate', 'size':'sizew'}) +port4=port4[['permno','formdate','jyear','sizew','dgtw_port']] + +crsp_m1= crsp_m[['permno','date','ret']] +crsp_m1['date']=crsp_m1['date']+MonthEnd(0) +crsp_m1['jdate']=crsp_m1['date']+MonthEnd(-6) +crsp_m1['jyear']=crsp_m1['jdate'].dt.year + +crsp_m1 = pd.merge(crsp_m1.drop(['jdate'],axis=1), port4, how='left', on=['permno','jyear']) +crsp_m1 = crsp_m1.dropna(subset=['formdate','sizew','dgtw_port'], how='any') + +crsp_m1 = crsp_m1.sort_values(['date','dgtw_port','permno']) + +# function to calculate value weighted return +def wavg(group, avg_name, weight_name): + d = group[avg_name] + w = 
group[weight_name] + try: + return (d * w).sum() / w.sum() + except ZeroDivisionError: + return np.nan +# Calculate Weighted Average Returns +dgtw_vwret = crsp_m1.groupby(['date','dgtw_port']).apply(wavg, 'ret','sizew') +dgtw_vwret = dgtw_vwret.reset_index().rename(columns={0:'dgtw_vwret'}) + +# Calculate DGTW Excess Return +dgtw_returns = pd.merge(crsp_m1.drop(['sizew'], axis=1), dgtw_vwret, how='left', on =['dgtw_port','date']) +dgtw_returns['dgtw_xret']=dgtw_returns['ret']-dgtw_returns['dgtw_vwret'] +dgtw_returns = dgtw_returns.sort_values(['permno','date']).drop_duplicates() + +### output + +#dgtw_vwret.to_csv('dgtw_-py-vwret.csv') +dgtw_returns.to_csv('dgtw-py-xret.csv') +#crsp_m1.to_csv('dgtw-py-label.csv') diff --git a/py-ff3/ff3.py b/py-ff3/ff3.py new file mode 100755 index 0000000..528ce54 --- /dev/null +++ b/py-ff3/ff3.py @@ -0,0 +1,280 @@ +########################################## +# Fama French Factors +# April 2018 +# Qingyi (Freda) Song Drechsler +########################################## + +import pandas as pd +import numpy as np +import datetime as dt +import wrds +import psycopg2 +import matplotlib.pyplot as plt +from dateutil.relativedelta import * +from pandas.tseries.offsets import * +from scipy import stats + +################### +# Connect to WRDS # +################### +conn=wrds.Connection() + +################### +# Compustat Block # +################### +comp = conn.raw_sql(""" + select gvkey, datadate, at, pstkl, txditc, + pstkrv, seq, pstk + from comp.funda + where indfmt='INDL' + and datafmt='STD' + and popsrc='D' + and consol='C' + and datadate >= '01/01/1959' + """) + +comp['datadate']=pd.to_datetime(comp['datadate']) #convert datadate to date fmt +comp['year']=comp['datadate'].dt.year + +# create preferrerd stock +comp['ps']=np.where(comp['pstkrv'].isnull(), comp['pstkl'], comp['pstkrv']) +comp['ps']=np.where(comp['ps'].isnull(),comp['pstk'], comp['ps']) +comp['ps']=np.where(comp['ps'].isnull(),0,comp['ps']) + 
+comp['txditc']=comp['txditc'].fillna(0) + +# create book equity +comp['be']=comp['seq']+comp['txditc']-comp['ps'] +comp['be']=np.where(comp['be']>0, comp['be'], np.nan) + +# number of years in Compustat +comp=comp.sort_values(by=['gvkey','datadate']) +comp['count']=comp.groupby(['gvkey']).cumcount() + +comp=comp[['gvkey','datadate','year','be','count']] + +################### +# CRSP Block # +################### +# sql similar to crspmerge macro +crsp_m = conn.raw_sql(""" + select a.permno, a.permco, a.date, b.shrcd, b.exchcd, + a.ret, a.retx, a.shrout, a.prc + from crsp.msf as a + left join crsp.msenames as b + on a.permno=b.permno + and b.namedt<=a.date + and a.date<=b.nameendt + where a.date between '01/01/1959' and '12/31/2017' + and b.exchcd between 1 and 3 + """) + +# change variable format to int +crsp_m[['permco','permno','shrcd','exchcd']]=crsp_m[['permco','permno','shrcd','exchcd']].astype(int) + +# Line up date to be end of month +crsp_m['date']=pd.to_datetime(crsp_m['date']) +crsp_m['jdate']=crsp_m['date']+MonthEnd(0) + +# add delisting return +dlret = conn.raw_sql(""" + select permno, dlret, dlstdt + from crsp.msedelist + """) +dlret.permno=dlret.permno.astype(int) +dlret['dlstdt']=pd.to_datetime(dlret['dlstdt']) +dlret['jdate']=dlret['dlstdt']+MonthEnd(0) + +crsp = pd.merge(crsp_m, dlret, how='left',on=['permno','jdate']) +crsp['dlret']=crsp['dlret'].fillna(0) +crsp['ret']=crsp['ret'].fillna(0) +crsp['retadj']=(1+crsp['ret'])*(1+crsp['dlret'])-1 +crsp['me']=crsp['prc'].abs()*crsp['shrout'] # calculate market equity +crsp=crsp.drop(['dlret','dlstdt','prc','shrout'], axis=1) +crsp=crsp.sort_values(by=['jdate','permco','me']) + +### Aggregate Market Cap ### +# sum of me across different permno belonging to same permco a given date +crsp_summe = crsp.groupby(['jdate','permco'])['me'].sum().reset_index() +# largest mktcap within a permco/date +crsp_maxme = crsp.groupby(['jdate','permco'])['me'].max().reset_index() +# join by jdate/maxme to find the permno 
+crsp1=pd.merge(crsp, crsp_maxme, how='inner', on=['jdate','permco','me']) +# drop me column and replace with the sum me +crsp1=crsp1.drop(['me'], axis=1) +# join with sum of me to get the correct market cap info +crsp2=pd.merge(crsp1, crsp_summe, how='inner', on=['jdate','permco']) +# sort by permno and date and also drop duplicates +crsp2=crsp2.sort_values(by=['permno','jdate']).drop_duplicates() + +# keep December market cap +crsp2['year']=crsp2['jdate'].dt.year +crsp2['month']=crsp2['jdate'].dt.month +decme=crsp2[crsp2['month']==12] +decme=decme[['permno','date','jdate','me','year']].rename(columns={'me':'dec_me'}) + +### July to June dates +crsp2['ffdate']=crsp2['jdate']+MonthEnd(-6) +crsp2['ffyear']=crsp2['ffdate'].dt.year +crsp2['ffmonth']=crsp2['ffdate'].dt.month +crsp2['1+retx']=1+crsp2['retx'] +crsp2=crsp2.sort_values(by=['permno','date']) + +# cumret by stock +crsp2['cumretx']=crsp2.groupby(['permno','ffyear'])['1+retx'].cumprod() +# lag cumret +crsp2['lcumretx']=crsp2.groupby(['permno'])['cumretx'].shift(1) + +# lag market cap +crsp2['lme']=crsp2.groupby(['permno'])['me'].shift(1) + +# if first permno then use me/(1+retx) to replace the missing value +crsp2['count']=crsp2.groupby(['permno']).cumcount() +crsp2['lme']=np.where(crsp2['count']==0, crsp2['me']/crsp2['1+retx'], crsp2['lme']) + +# baseline me +mebase=crsp2[crsp2['ffmonth']==1][['permno','ffyear', 'lme']].rename(columns={'lme':'mebase'}) + +# merge result back together +crsp3=pd.merge(crsp2, mebase, how='left', on=['permno','ffyear']) +crsp3['wt']=np.where(crsp3['ffmonth']==1, crsp3['lme'], crsp3['mebase']*crsp3['lcumretx']) + +decme['year']=decme['year']+1 +decme=decme[['permno','year','dec_me']] + +# Info as of June +crsp3_jun = crsp3[crsp3['month']==6] + +crsp_jun = pd.merge(crsp3_jun, decme, how='inner', on=['permno','year']) +crsp_jun=crsp_jun[['permno','date', 'jdate', 'shrcd','exchcd','retadj','me','wt','cumretx','mebase','lme','dec_me']] 
+crsp_jun=crsp_jun.sort_values(by=['permno','jdate']).drop_duplicates()
+
+#######################
+# CCM Block #
+#######################
+ccm=conn.raw_sql("""
+ select gvkey, lpermno as permno, linktype, linkprim,
+ linkdt, linkenddt
+ from crsp.ccmxpf_linktable
+ where substr(linktype,1,1)='L'
+ and (linkprim ='C' or linkprim='P')
+ """)
+
+ccm['linkdt']=pd.to_datetime(ccm['linkdt'])
+ccm['linkenddt']=pd.to_datetime(ccm['linkenddt'])
+# if linkenddt is missing then set to today date
+ccm['linkenddt']=ccm['linkenddt'].fillna(pd.to_datetime('today'))
+
+ccm1=pd.merge(comp[['gvkey','datadate','be', 'count']],ccm,how='left',on=['gvkey'])
+ccm1['yearend']=ccm1['datadate']+YearEnd(0)
+ccm1['jdate']=ccm1['yearend']+MonthEnd(6)
+
+# set link date bounds
+ccm2=ccm1[(ccm1['jdate']>=ccm1['linkdt'])&(ccm1['jdate']<=ccm1['linkenddt'])]
+ccm2=ccm2[['gvkey','permno','datadate','yearend', 'jdate','be', 'count']]
+
+# link comp and crsp
+ccm_jun=pd.merge(crsp_jun, ccm2, how='inner', on=['permno', 'jdate'])
+ccm_jun['beme']=ccm_jun['be']*1000/ccm_jun['dec_me']
+
+# select NYSE stocks for bucket breakdown
+# exchcd = 1 and positive beme and positive me and shrcd in (10,11) and at least 2 years in comp
+nyse=ccm_jun[(ccm_jun['exchcd']==1) & (ccm_jun['beme']>0) & (ccm_jun['me']>0) & (ccm_jun['count']>1) & ((ccm_jun['shrcd']==10) | (ccm_jun['shrcd']==11))]
+# size breakdown
+nyse_sz=nyse.groupby(['jdate'])['me'].median().to_frame().reset_index().rename(columns={'me':'sizemedn'})
+# beme breakdown
+nyse_bm=nyse.groupby(['jdate'])['beme'].describe(percentiles=[0.3, 0.7]).reset_index()
+nyse_bm=nyse_bm[['jdate','30%','70%']].rename(columns={'30%':'bm30', '70%':'bm70'})
+
+nyse_breaks = pd.merge(nyse_sz, nyse_bm, how='inner', on=['jdate'])
+# join back size and beme breakdown
+ccm1_jun = pd.merge(ccm_jun, nyse_breaks, how='left', on=['jdate'])
+
+
+# function to assign sz and bm bucket
+def sz_bucket(row):
+ # pd.isnull detects missing me; `row['me']==np.nan` would always be False
+ if pd.isnull(row['me']):
+ value=''
+ elif row['me']<=row['sizemedn']:
+ value='S'
+ else: + value='B' + return value + +def bm_bucket(row): + if 0<=row['beme']<=row['bm30']: + value = 'L' + elif row['beme']<=row['bm70']: + value='M' + elif row['beme']>row['bm70']: + value='H' + else: + value='' + return value + +# assign size portfolio +ccm1_jun['szport']=np.where((ccm1_jun['beme']>0)&(ccm1_jun['me']>0)&(ccm1_jun['count']>=1), ccm1_jun.apply(sz_bucket, axis=1), '') +# assign book-to-market portfolio +ccm1_jun['bmport']=np.where((ccm1_jun['beme']>0)&(ccm1_jun['me']>0)&(ccm1_jun['count']>=1), ccm1_jun.apply(bm_bucket, axis=1), '') +# create positivebmeme and nonmissport variable +ccm1_jun['posbm']=np.where((ccm1_jun['beme']>0)&(ccm1_jun['me']>0)&(ccm1_jun['count']>=1), 1, 0) +ccm1_jun['nonmissport']=np.where((ccm1_jun['bmport']!=''), 1, 0) + +# store portfolio assignment as of June +june=ccm1_jun[['permno','date', 'jdate', 'bmport','szport','posbm','nonmissport']] +june['ffyear']=june['jdate'].dt.year + +# merge back with monthly records +crsp3 = crsp3[['date','permno','shrcd','exchcd','retadj','me','wt','cumretx','ffyear','jdate']] +ccm3=pd.merge(crsp3, + june[['permno','ffyear','szport','bmport','posbm','nonmissport']], how='left', on=['permno','ffyear']) + +# keeping only records that meet the criteria +ccm4=ccm3[(ccm3['wt']>0)& (ccm3['posbm']==1) & (ccm3['nonmissport']==1) & + ((ccm3['shrcd']==10) | (ccm3['shrcd']==11))] + +############################ +# Form Fama French Factors # +############################ + +# function to calculate value weighted return +def wavg(group, avg_name, weight_name): + d = group[avg_name] + w = group[weight_name] + try: + return (d * w).sum() / w.sum() + except ZeroDivisionError: + return np.nan + +# value-weigthed return +vwret=ccm4.groupby(['jdate','szport','bmport']).apply(wavg, 'retadj','wt').to_frame().reset_index().rename(columns={0: 'vwret'}) +vwret['sbport']=vwret['szport']+vwret['bmport'] + +# firm count 
+vwret_n=ccm4.groupby(['jdate','szport','bmport'])['retadj'].count().reset_index().rename(columns={'retadj':'n_firms'}) +vwret_n['sbport']=vwret_n['szport']+vwret_n['bmport'] + +# tranpose +ff_factors=vwret.pivot(index='jdate', columns='sbport', values='vwret').reset_index() +ff_nfirms=vwret_n.pivot(index='jdate', columns='sbport', values='n_firms').reset_index() + +# create SMB and HML factors +ff_factors['WH']=(ff_factors['BH']+ff_factors['SH'])/2 +ff_factors['WL']=(ff_factors['BL']+ff_factors['SL'])/2 +ff_factors['WHML'] = ff_factors['WH']-ff_factors['WL'] + +ff_factors['WB']=(ff_factors['BL']+ff_factors['BM']+ff_factors['BH'])/3 +ff_factors['WS']=(ff_factors['SL']+ff_factors['SM']+ff_factors['SH'])/3 +ff_factors['WSMB'] = ff_factors['WS']-ff_factors['WB'] +ff_factors=ff_factors.rename(columns={'jdate':'date'}) + +# n firm count +ff_nfirms['H']=ff_nfirms['SH']+ff_nfirms['BH'] +ff_nfirms['L']=ff_nfirms['SL']+ff_nfirms['BL'] +ff_nfirms['HML']=ff_nfirms['H']+ff_nfirms['L'] + +ff_nfirms['B']=ff_nfirms['BL']+ff_nfirms['BM']+ff_nfirms['BH'] +ff_nfirms['S']=ff_nfirms['SL']+ff_nfirms['SM']+ff_nfirms['SH'] +ff_nfirms['SMB']=ff_nfirms['B']+ff_nfirms['S'] +ff_nfirms['TOTAL']=ff_nfirms['SMB'] +ff_nfirms=ff_nfirms.rename(columns={'jdate':'date'}) diff --git a/py-pead/pead.py b/py-pead/pead.py new file mode 100755 index 0000000..53ce1c7 --- /dev/null +++ b/py-pead/pead.py @@ -0,0 +1,538 @@ + +##################################### +# Post Earnings Announcement Drift # +# June 2019 # +# Qingyi (Freda) Song Drechsler # +##################################### + +import pandas as pd +import numpy as np +import wrds +import matplotlib.pyplot as plt +import pickle as pkl +from dateutil.relativedelta import * + +################### +# Connect to WRDS # +################### +conn=wrds.Connection() + +# set sample date range +begdate = '01/01/2010' +enddate = '12/31/2018' + +# set CRSP date range a bit wider to guarantee collecting all information +crsp_begdate = '01/01/2009' 
+crsp_enddate = '12/31/2019' + +################################# +# Step 0: Read in ICLINK output # +################################# + +# iclink.pkl is the output from the python program iclink +# it contains the linking between crsp and ibes +with open('iclink.pkl', 'rb') as f: + iclink = pkl.load(f) + +################################## +# Step 1. S&P 500 Index Universe # +################################## + +# All companies that were ever included in S&P 500 index as an example +# Linking Compustat GVKEY and IBES Tickers using ICLINK +# For unmatched GVKEYs, use header IBTIC link in Compustat Security file + +_sp500 = conn.raw_sql(""" select gvkey from comp.idxcst_his where gvkeyx='000003' """) + +_ccm = conn.raw_sql(""" select gvkey, lpermco as permco, lpermno as permno, linkdt, linkenddt + from crsp.ccmxpf_linktable + where usedflag=1 and linkprim in ('P', 'C')""") + +_ccm[['permco', 'permno']] = _ccm[['permco', 'permno']].astype(int) +_ccm['linkdt'] = pd.to_datetime(_ccm['linkdt']) +_ccm['linkenddt'] = pd.to_datetime(_ccm['linkenddt']) + +_sec = conn.raw_sql(""" select ibtic, gvkey from comp.security """) + + +import datetime +today = datetime.date.today() + +# Fill linkenddt missing value (.E in SAS dataset) with today's date +_ccm['linkenddt'] = _ccm.linkenddt.fillna(today) + +# Start the sequence of left join +gvkey = pd.merge(_sp500, _ccm, how='left', on=['gvkey']) +gvkey = pd.merge(gvkey, _sec.loc[_sec.ibtic.notna()], how='left', on=['gvkey']) + +# high quality links from iclink +# score = 0 or 1 +iclink_hq = iclink.loc[(iclink.score <=1)] + +gvkey = pd.merge(gvkey, iclink_hq, how='left', on=['permno']) + +# fill missing ticker with ibtic +gvkey.ticker = np.where(gvkey.ticker.notnull(), gvkey.ticker, gvkey.ibtic) + +# Keep relevant columns and drop duplicates if there is any +gvkey = gvkey[['gvkey', 'permco', 'permno', 'linkdt', 'linkenddt','ticker']] + +gvkey = gvkey.drop_duplicates() + +# date ranges from gvkey + +# min linkdt for ticker and permno 
combination +gvkey_mindt = gvkey.groupby(['ticker','permno']).linkdt.min().reset_index() + +# max linkenddt for ticker and permno combination +gvkey_maxdt = gvkey.groupby(['ticker','permno']).linkenddt.max().reset_index() + +# link date range +gvkey_dt = pd.merge(gvkey_mindt, gvkey_maxdt, how='inner', on=['ticker','permno']) + +####################################### +# Step 2. Extract Estimates from IBES # +####################################### + +# Extract estimates from IBES Unadjusted file and select +# the latest estimate for a firm within broker-analyst group +# "fpi in (6,7)" selects quarterly forecast for the current +# and the next fiscal quarter + +ibes_temp = conn.raw_sql(f""" + select ticker, estimator, analys, pdf, fpi, value, fpedats, revdats, revtims, anndats, anntims + from ibes.detu_epsus + where fpedats between '{begdate}' and '{enddate}' + and (fpi='6' or fpi='7') + """, date_cols = ['revdats', 'anndats', 'fpedats']) + +# merge to get date range linkdt and linkenddt to fulfill date requirement +ibes_temp = pd.merge(ibes_temp, gvkey_dt, how='left', on=['ticker']) +ibes_temp=ibes_temp.loc[(ibes_temp.linkdt<=ibes_temp.anndats) & (ibes_temp.anndats <= ibes_temp.linkenddt)] + +# Count number of estimates reported on primary/diluted basis + +p_sub = ibes_temp[['ticker','fpedats','pdf']].loc[ibes_temp.pdf=='P'] +d_sub = ibes_temp[['ticker','fpedats','pdf']].loc[ibes_temp.pdf=='D'] + +p_count = p_sub.groupby(['ticker','fpedats']).pdf.count().reset_index().rename(columns={'pdf':'p_count'}) +d_count = d_sub.groupby(['ticker','fpedats']).pdf.count().reset_index().rename(columns={'pdf':'d_count'}) + +ibes = pd.merge(ibes_temp, d_count, how = 'left', on=['ticker', 'fpedats']) +ibes = pd.merge(ibes, p_count, how='left', on =['ticker','fpedats']) +ibes['d_count'] = ibes.d_count.fillna(0) +ibes['p_count'] = ibes.p_count.fillna(0) + +# Determine whether most analysts report estimates on primary/diluted basis +# following Livnat and Mendenhall (2006) + 
+ibes['basis']=np.where(ibes.p_count>ibes.d_count, 'P', 'D') + +ibes = ibes.sort_values(by=['ticker','fpedats','estimator','analys','anndats', 'anntims', 'revdats', 'revtims'])\ +.drop(['linkdt', 'linkenddt','p_count','d_count', 'pdf', 'fpi'], axis=1) + +# Keep the latest observation for a given analyst +# Group by company fpedats estimator analys then pick the last record in the group + +ibes_1 = ibes.groupby(['ticker','fpedats','estimator','analys']).apply(lambda x: x.index[-1]).to_frame().reset_index() + +# reset index to the old dataframe index for join in the next step +ibes_1=ibes_1.set_index(0) + +# Inner join with the last analyst record per group +ibes = pd.merge(ibes, ibes_1[['analys']], left_index=True, right_index=True) + +# drop duplicate column +ibes=ibes.drop(['analys_y'], axis=1).rename(columns={'analys_x': 'analys'}) + +####################################### +# Step 3. Link Estimates with Actuals # +####################################### + +# Link Unadjusted estimates with Unadjusted actuals and CRSP permnos +# Keep only the estimates issued within 90 days before the report date + +# Getting actual piece of data +ibes_act = conn.raw_sql(f""" + select ticker, anndats as repdats, value as act, pends as fpedats, pdicity + from ibes.actu_epsus + where pends between '{begdate}' and '{enddate}' + and pdicity='QTR' + """, date_cols = ['repdats', 'fpedats']) + +# Join with the estimate piece of the data + +ibes1 = pd.merge(ibes, ibes_act, how='left', on = ['ticker','fpedats']) +ibes1['dgap'] = ibes1.repdats - ibes1.anndats + +ibes1['flag'] = np.where( (ibes1.dgap>=datetime.timedelta(days=0)) & (ibes1.dgap<=datetime.timedelta(days=90)) & (ibes1.repdats.notna()) & (ibes1.anndats.notna()), 1, 0) + +ibes1 = ibes1.loc[ibes1.flag==1].drop(['flag', 'dgap', 'pdicity'], axis=1) + + +# Select all relevant combinations of Permnos and Date + +ibes1_dt1 = ibes1[['permno', 'anndats']].drop_duplicates() + +ibes1_dt2 = ibes1[['permno', 
'repdats']].drop_duplicates().rename(columns={'repdats':'anndats'}) + +ibes_anndats = pd.concat([ibes1_dt1, ibes1_dt2]).drop_duplicates() + +# Adjust all estimate and earnings announcement dates to the closest +# preceding trading date in CRSP to ensure that adjustment factors won't +# be missing after the merge + +# unique anndats from ibes +uniq_anndats = ibes_anndats[['anndats']].drop_duplicates() + +# unique trade dates from crsp.dsi +crsp_dats = conn.raw_sql(""" + select date + from crsp.dsi + """, date_cols=['date']) + +# Create up to 5 days prior dates relative to anndats + +for i in range(0, 5): + uniq_anndats[i] = uniq_anndats.anndats - datetime.timedelta(days=i) + +# reshape (transpose) the df for later join with crsp trading dates + +expand_anndats = uniq_anndats.set_index('anndats').stack().reset_index().\ +rename(columns={'level_1':'prior', 0:'prior_date'}) + +# merge with crsp trading dates +tradedates = pd.merge(expand_anndats, crsp_dats, how='left', left_on=['prior_date'], right_on=['date']) + +# create the dgap (days gap) variable for min selection +tradedates['dgap'] = tradedates.anndats-tradedates.date + +# choosing the row with the smallest dgap for a given anndats +tradedates = tradedates.loc[tradedates.groupby('anndats')['dgap'].idxmin()] + +tradedates = tradedates[['anndats', 'date']] + + +# merge the CRSP adjustment factors for all estimate and report dates + +# extract CRSP adjustment factors +cfacshr = conn.raw_sql(f""" + select permno, date, cfacshr + from crsp.dsf + where date between '{crsp_begdate}' and '{crsp_enddate}' + """, date_cols = ['date']) + +ibes_anndats = pd.merge(ibes_anndats, tradedates, how='left', on = ['anndats']) + +ibes_anndats = pd.merge(ibes_anndats, cfacshr, how='left', on=['permno', 'date']) + + +######################################### +# Step 4. 
Adjust Estimates with CFACSHR # +######################################### + +# Put the estimate on the same per share basis as +# company reported EPS using CRSP Adjustment factors. +# New_value is the estimate adjusted to be on the +# same basis with reported earnings. + +ibes1 = pd.merge(ibes1, ibes_anndats, how='inner', on=['permno', 'anndats']) +ibes1 = ibes1.drop(['anndats','date'], axis=1).rename(columns={'cfacshr':'cfacshr_ann'}) + +ibes1 = pd.merge(ibes1, ibes_anndats, how='inner', left_on=['permno', 'repdats'], right_on=['permno','anndats']) +ibes1 = ibes1.drop(['anndats','date'], axis=1).rename(columns={'cfacshr':'cfacshr_rep'}) + +ibes1['new_value'] = (ibes1.cfacshr_rep/ibes1.cfacshr_ann)*ibes1.value + +# Sanity check: there should be one most recent estimate for +# a given firm-fiscal period end combination +ibes1 = ibes1.sort_values(by=['ticker','fpedats','estimator','analys']).drop_duplicates() + +# Compute the median forecast based on estimates in the 90 days prior to the EAD + +grp_permno = ibes1.groupby(['ticker','fpedats', 'basis','repdats', 'act']).permno.max().reset_index() + +medest = ibes1.groupby(['ticker','fpedats', 'basis','repdats', 'act']).new_value.agg(['median','count']).reset_index() +medest = pd.merge(medest, grp_permno, how='inner', on=['ticker','fpedats','basis', 'repdats', 'act']) +medest = medest.rename(columns={'median': 'medest', 'count':'numest'}) + + +###################################### +# Step 5. 
Merge with Compustat Data # +###################################### + +# get items from fundq +fundq = conn.raw_sql(f""" + select gvkey, fyearq, fqtr, conm, datadate, rdq, epsfxq, epspxq, cshoq, prccq, + ajexq, spiq, cshoq, cshprq, cshfdq, saleq, atq, fyr, datafqtr, cshoq*prccq as mcap + from comp.fundq + where consol='C' and popsrc='D' and indfmt='INDL' and datafmt='STD' + and datadate between '{crsp_begdate}' and '{crsp_enddate}' + """, date_cols = ['datadate', 'datafqtr', 'rdq']) + +fundq = fundq.loc[((fundq.atq>0) | (fundq.saleq.notna())) & (fundq.datafqtr.notna())] + +# Calculate link date ranges for givken gvkey and ticker combination + +gvkey_mindt1 = gvkey.groupby(['gvkey', 'ticker']).linkdt.min().reset_index().rename(columns={'linkdt':'mindate'}) +gvkey_maxdt1 = gvkey.groupby(['gvkey', 'ticker']).linkenddt.max().reset_index().rename(columns={'linkenddt':'maxdate'}) +gvkey_dt1 = pd.merge(gvkey_mindt1, gvkey_maxdt1, how='inner', on=['gvkey','ticker']) + + +# Use the date range to merge +comp = pd.merge(fundq, gvkey_dt1, how='left', on =['gvkey']) +comp = comp.loc[(comp.ticker.notna()) & (comp.datadate<=comp.maxdate) & (comp.datadate>=comp.mindate)] + +# Merge with the median esitmates +comp = pd.merge(comp, medest, how = 'left', left_on=['ticker','datadate'], right_on=['ticker', 'fpedats']) + +# Sort data and drop duplicates +comp = comp.sort_values(by=['gvkey','fqtr','fyearq']).drop_duplicates() + + +########################### +# Step 6. 
Calculate SUEs # +########################### + +# block handling lag eps + +sue = comp.sort_values(by=['gvkey','fqtr','fyearq']) + +sue['dif_fyearq'] = sue.groupby(['gvkey', 'fqtr']).fyearq.diff() +sue['laggvkey'] = sue.gvkey.shift(1) + +# handling same qtr previous year + +cond_year = sue.dif_fyearq==1 # year increment is 1 + +sue['lagadj'] = np.where(cond_year, sue.ajexq.shift(1), None) +sue['lageps_p'] = np.where(cond_year, sue.epspxq.shift(1), None) +sue['lageps_d'] = np.where(cond_year, sue.epsfxq.shift(1), None) +sue['lagshr_p'] = np.where(cond_year, sue.cshprq.shift(1), None) +sue['lagshr_d'] = np.where(cond_year, sue.cshfdq.shift(1), None) +sue['lagspiq'] = np.where(cond_year, sue.spiq.shift(1), None) + +# handling first gvkey + +cond_gvkey = sue.gvkey != sue.laggvkey # first.gvkey + +sue['lagadj'] = np.where(cond_gvkey, None, sue.lagadj) +sue['lageps_p'] = np.where(cond_gvkey, None, sue.lageps_p) +sue['lageps_d'] = np.where(cond_gvkey, None, sue.lageps_d) +sue['lagshr_p'] = np.where(cond_gvkey, None, sue.lagshr_p) +sue['lagshr_d'] = np.where(cond_gvkey, None, sue.lagshr_d) +sue['lagspiq'] = np.where(cond_gvkey, None, sue.lagspiq) + + +# handling reporting basis + +# Basis = P and missing are treated the same + +sue['actual1'] = np.where(sue.basis=='D', sue.epsfxq/sue.ajexq, sue.epspxq/sue.ajexq) + +sue['actual2'] = np.where(sue.basis=='D', \ + (sue.epsfxq.fillna(0)-(0.65*sue.spiq/sue.cshfdq).fillna(0))/sue.ajexq, \ + (sue.epspxq.fillna(0)-(0.65*sue.spiq/sue.cshprq).fillna(0))/sue.ajexq + ) + +sue['expected1'] = np.where(sue.basis=='D', sue.lageps_d/sue.lagadj, sue.lageps_p/sue.lagadj) +sue['expected2'] = np.where(sue.basis=='D', \ + (sue.lageps_d.fillna(0)-(0.65*sue.lagspiq/sue.lagshr_d).fillna(0))/sue.lagadj, \ + (sue.lageps_p.fillna(0)-(0.65*sue.lagspiq/sue.lagshr_p).fillna(0))/sue.lagadj + ) + +# SUE calculations +sue['sue1'] = (sue.actual1 - sue.expected1) / (sue.prccq/sue.ajexq) +sue['sue2'] = (sue.actual2 - sue.expected2) / (sue.prccq/sue.ajexq) 
+sue['sue3'] = (sue.act - sue.medest) / sue.prccq + +sue = sue[['ticker','permno','gvkey','conm','fyearq','fqtr','fyr','datadate','repdats','rdq', \ + 'sue1','sue2','sue3','basis','act','medest','numest','prccq','mcap']] + + +# Shifting the announcement date to be the next trading day +# Defining the day after the following quarterly EA as leadrdq1 + +# unique rdq +uniq_rdq = comp[['rdq']].drop_duplicates() +uniq_rdq.shape + +# Create up to 5 days post rdq relative to rdq +for i in range(0, 5): + uniq_rdq[i] = uniq_rdq.rdq + datetime.timedelta(days=i) + +# reshape (transpose) for later join with crsp trading dates +expand_rdq = uniq_rdq.set_index('rdq').stack().reset_index().\ +rename(columns={'level_1':'post', 0:'post_date'}) + +# merge with crsp trading dates +eads1 = pd.merge(expand_rdq, crsp_dats, how='left', left_on=['post_date'], right_on=['date']) + +# create the dgap (days gap) variable for min selection +eads1['dgap'] = eads1.date-eads1.rdq +eads1 = eads1.loc[eads1.groupby('rdq')['dgap'].idxmin()].rename(columns={'date':'rdq1'}) + +# create sue_final +sue_final = pd.merge(sue, eads1[['rdq','rdq1']], how='left', on=['rdq']) +sue_final = sue_final.sort_values(by=['gvkey', 'fyearq','fqtr'], ascending=[True, False, False]).drop_duplicates() + +# Filter from Livnat & Mendenhall (2006): +#- earnings announcement date is reported in Compustat +#- the price per share is available from Compustat at fiscal quarter end +#- price is greater than $1 +#- the market (book) equity at fiscal quarter end is available and is +# EADs in Compustat and in IBES (if available)should not differ by more +# than one calendar day larger than $5 mil. 
+
+sue_final['leadrdq1'] = sue_final.rdq1.shift(1) # next consecutive EAD
+sue_final['leadgvkey'] = sue_final.gvkey.shift(1)
+
+# If first gvkey then leadrdq1 = rdq1+3 months
+# Else leadrdq1 = previous rdq1
+
+# pd.DateOffset(months=3): pd.offsets.MonthOffset is a private base class
+# (removed in pandas 1.x) and cannot be instantiated directly
+sue_final['leadrdq1'] = np.where(sue_final.gvkey == sue_final.leadgvkey,
+ sue_final.rdq1.shift(1),
+ sue_final.rdq1 + pd.DateOffset(months=3))
+
+sue_final['dgap'] = (sue_final.repdats - sue_final.rdq).fillna(0)
+sue_final = sue_final.loc[(sue_final.rdq1 != sue_final.leadrdq1)]
+
+# Various conditioning for filtering
+cond1 = (sue_final.sue1.notna()) & (sue_final.sue2.notna()) & (sue_final.repdats.isna())
+cond2 = (sue_final.repdats.notna()) & (sue_final.dgap<=datetime.timedelta(days=1)) & (sue_final.dgap>=datetime.timedelta(days=-1))
+sue_final = sue_final.loc[cond1 | cond2]
+
+# Impose restriction on price and marketcap
+sue_final = sue_final.loc[(sue_final.rdq.notna()) & (sue_final.prccq>1) & (sue_final.mcap>5)]
+
+# Keep relevant columns
+sue_final = sue_final[['gvkey', 'ticker','permno','conm',\
+ 'fyearq','fqtr','datadate','fyr','rdq','rdq1','leadrdq1','repdats',\
+ 'mcap','medest','act','numest','basis','sue1','sue2','sue3']]
+
+
+#########################################
+# Step 7. 
Form Portfolios Based on SUE # +######################################### + +# Extract file of raw daily returns around and between EADs and link them +# to Standardized Earnings Surprises for forming SUE-based portfolios + +# Records from dsf and dsi to calculate exret +dsf = conn.raw_sql(f""" + select permno, date, prc, abs(prc*shrout) as mcap, ret from crsp.dsf + where date between '{crsp_begdate}' and '{crsp_enddate}' + """, date_cols = ['date']) + +dsi = conn.raw_sql(f""" + select date, vwretd from crsp.dsi where date between '{crsp_begdate}' and '{crsp_enddate}' + """, date_cols=['date']) + +ds = pd.merge(dsf, dsi, how='left', on=['date']) +ds['exret'] = ds.ret - ds.vwretd +ds = ds.rename(columns={'vwretd':'mkt'}) + +# Records from sue_final that meet the condition +sue_final_join = sue_final.loc[(sue_final.rdq.notna()) & (sue_final.leadrdq1.notna()) & (sue_final.permno.notna()) \ + & (sue_final.leadrdq1-sue_final.rdq1>datetime.timedelta(days=30))] + +sue_final_join['lb_date'] = sue_final_join.rdq1-datetime.timedelta(days=5) +sue_final_join['ub_date'] = sue_final_join.leadrdq1+datetime.timedelta(days=5) + + +# left join ds with sue_final on permno first +# filter in the second step based on date range requirement +crsprets = pd.merge(ds, sue_final_join[['permno','rdq1', 'leadrdq1','sue1','sue2','sue3', 'lb_date','ub_date']], how='left', on=['permno']) + +# keep only records that meet the date range requirement +crsprets = crsprets.loc[(crsprets.date<=crsprets.ub_date) & (crsprets.date>=crsprets.lb_date)] +crsprets = crsprets.drop(['lb_date','ub_date'], axis=1) + + +# Alternative sql version to handle the join step of crsp return and sue_final +# Warning: sql runs very slow on python + +#import sqlite3 + +#sqlconn = sqlite3.connect(':memory') + +#sue_final_join.to_sql('sue_final_join_sql', sqlconn, index=False) +#ds.to_sql('ds_sql', sqlconn, index=False) + +#qry_stmt = """ +# select a.*, b.rdq1, b.leadrdq1, b.sue1, b.sue2, b.sue3 +# from ds_sql as a +# left 
join sue_final_join_sql as b +# on a.permno=b.permno and b.lb_date<=a.date<=b.ub_date +# """ + +#crsprets = pd.read_sql_query(qry_stmt, sqlconn) + +# To estimate the drift, sum daily returns over the period from +# 1 day after the earnings announcement through the day of +# the following quarterly earnings announcement + +temp = crsprets.sort_values(by=['permno', 'rdq1', 'date']) +temp['lpermno'] = temp.permno.shift(1) + +# If first permno then lagmcap = missing +# Else lagmcap = lag(mcap) +temp['lagmcap'] = np.where(temp.permno == temp.lpermno, + temp.mcap.shift(1), + None) + +temp = temp.loc[(temp.rdq1<=temp.date) & (temp.date<=temp.leadrdq1)] + +# create count variable within the group +temp['ncount'] = temp.groupby(['permno','rdq1']).cumcount() + +# Form quintiles based on SUE +peadrets = temp.sort_values(by=['ncount','permno','rdq1']).drop_duplicates() + +peadrets['sue1r']=peadrets.groupby('ncount')['sue1'].transform(lambda x: pd.qcut(x, 5, labels=False, duplicates='drop')) +peadrets['sue2r']=peadrets.groupby('ncount')['sue2'].transform(lambda x: pd.qcut(x, 5, labels=False, duplicates='drop')) +peadrets['sue3r']=peadrets.groupby('ncount')['sue3'].transform(lambda x: pd.qcut(x, 5, labels=False, duplicates='drop')) + +# Form portfolios on Compustat-based SUEs (=sue1 or =sue2) or IBES-based SUE (=sue3) +# Code uses sue3 + +peadrets3 = peadrets.loc[peadrets.sue3r.notna()].sort_values(by=['ncount', 'sue3']) +peadrets3['sue3r'] = peadrets3['sue3r'].astype(int) + +# Form value-weighted exret +# Calculate group weight sum; +grp_lagmcap = peadrets3.groupby(['ncount','sue3r']).lagmcap.sum().reset_index().rename(columns={'lagmcap':'total_lagmcap'}) + +# join group weight sum back to the df +peadrets3 = pd.merge(peadrets3, grp_lagmcap, how='left', on=['ncount','sue3r']) + +# vw exret +peadrets3['wt_exret'] = peadrets3.exret * peadrets3.lagmcap/peadrets3.total_lagmcap +peadsue3port = peadrets3.groupby(['ncount', 'sue3r']).wt_exret.sum().reset_index() + + +# set ncount=0 
all five portfolio weighted returns to be 0 +peadsue3port['wt_exret'] = np.where(peadsue3port.ncount==0, 0, peadsue3port.wt_exret) + +# transpose table for cumulative return calculation +peadsue3port = peadsue3port.pivot_table(index=['ncount'], columns='sue3r') + +# reset column index level +peadsue3port.columns = [col[1] for col in peadsue3port.columns] +peadsue3port = peadsue3port.reset_index() + +# keep only first 50 days after EADs +peadsue3port = peadsue3port.loc[peadsue3port.ncount<=50] + +# Cumulating Excess Returns + +peadsue3port['sueport1'] = peadsue3port[0].cumsum() +peadsue3port['sueport2'] = peadsue3port[1].cumsum() +peadsue3port['sueport3'] = peadsue3port[2].cumsum() +peadsue3port['sueport4'] = peadsue3port[3].cumsum() +peadsue3port['sueport5'] = peadsue3port[4].cumsum() + + +################### +# End of Program # +################### diff --git a/pychars/accounting.py b/pychars/accounting.py new file mode 100755 index 0000000..c3f6e38 --- /dev/null +++ b/pychars/accounting.py @@ -0,0 +1,851 @@ +import pandas as pd +import numpy as np +import datetime as dt +import wrds +from dateutil.relativedelta import * +from pandas.tseries.offsets import * +import pickle as pkl + +################### +# Connect to WRDS # +################### +conn = wrds.Connection() + +####################################################################################################################### +# TTM functions # +####################################################################################################################### + + +def ttm4(series, df): + """ + + :param series: variables' name + :param df: dataframe + :return: ttm4 + """ + lag = pd.DataFrame() + for i in range(1, 4): + lag['%(series)s%(lag)s' % {'series': series, 'lag': i}] = df.groupby('gvkey')['%s' % series].shift(i) + result = df['%s' % series] + lag['%s1' % series] + lag['%s2' % series] + lag['%s3' % series] + return result + + +def ttm12(series, df): + """ + + :param series: variables' name + 
:param df: dataframe + :return: ttm12 + """ + lag = pd.DataFrame() + for i in range(1, 12): + lag['%(series)s%(lag)s' % {'series': series, 'lag': i}] = df.groupby('gvkey')['%s' % series].shift(i) + result = df['%s' % series] + lag['%s1' % series] + lag['%s2' % series] + lag['%s3' % series] +\ + lag['%s4' % series] + lag['%s5' % series] + lag['%s6' % series] + lag['%s7' % series] +\ + lag['%s8' % series] + lag['%s9' % series] + lag['%s10' % series] + lag['%s11' % series] + return result + + +####################################################################################################################### +# Compustat Block # +####################################################################################################################### +comp = conn.raw_sql(""" + /*header info*/ + select c.gvkey, f.cusip, f.datadate, f.fyear, c.cik, substr(c.sic,1,2) as sic2, c.sic, c.naics, + + /*firm variables*/ + /*income statement*/ + f.sale, f.revt, f.cogs, f.xsga, f.dp, f.xrd, f.xad, f.ib, f.ebitda, + f.ebit, f.nopi, f.spi, f.pi, f.txp, f.ni, f.txfed, f.txfo, f.txt, f.xint, + + /*CF statement and others*/ + f.capx, f.oancf, f.dvt, f.ob, f.gdwlia, f.gdwlip, f.gwo, f.mib, f.oiadp, f.ivao, + + /*assets*/ + f.rect, f.act, f.che, f.ppegt, f.invt, f.at, f.aco, f.intan, f.ao, f.ppent, f.gdwl, f.fatb, f.fatl, + + /*liabilities*/ + f.lct, f.dlc, f.dltt, f.lt, f.dm, f.dcvt, f.cshrc, + f.dcpstk, f.pstk, f.ap, f.lco, f.lo, f.drc, f.drlt, f.txdi, + + /*equity and other*/ + f.ceq, f.scstkc, f.emp, f.csho, f.seq, f.txditc, f.pstkrv, f.pstkl, f.np, f.txdc, f.dpc, f.ajex, + + /*market*/ + abs(f.prcc_f) as prcc_f + + from comp.funda as f + left join comp.company as c + on f.gvkey = c.gvkey + + /*get consolidated, standardized, industrial format statements*/ + where f.indfmt = 'INDL' + and f.datafmt = 'STD' + and f.popsrc = 'D' + and f.consol = 'C' + and f.datadate >= '01/01/1959' + """) + +# convert datadate to date fmt +comp['datadate'] = pd.to_datetime(comp['datadate']) + +# sort 
and clean up +comp = comp.sort_values(by=['gvkey', 'datadate']).drop_duplicates() + +# clean up csho +comp['csho'] = np.where(comp['csho'] == 0, np.nan, comp['csho']) + +# calculate Compustat market equity +comp['mve_f'] = comp['csho'] * comp['prcc_f'] + +# do some clean up. several variables have lots of missing values +condlist = [comp['drc'].notna() & comp['drlt'].notna(), + comp['drc'].notna() & comp['drlt'].isnull(), + comp['drlt'].notna() & comp['drc'].isnull()] +choicelist = [comp['drc']+comp['drlt'], + comp['drc'], + comp['drlt']] +comp['dr'] = np.select(condlist, choicelist, default=np.nan) + +condlist = [comp['dcvt'].isnull() & comp['dcpstk'].notna() & comp['pstk'].notna() & comp['dcpstk'] > comp['pstk'], + comp['dcvt'].isnull() & comp['dcpstk'].notna() & comp['pstk'].isnull()] +choicelist = [comp['dcpstk']-comp['pstk'], + comp['dcpstk']] +comp['dc'] = np.select(condlist, choicelist, default=np.nan) +comp['dc'] = np.where(comp['dc'].isnull(), comp['dcvt'], comp['dc']) + +comp['xint0'] = np.where(comp['xint'].isnull(), 0, comp['xint']) +comp['xsga0'] = np.where(comp['xsga'].isnull, 0, 0) + +comp['ceq'] = np.where(comp['ceq'] == 0, np.nan, comp['ceq']) +comp['at'] = np.where(comp['at'] == 0, np.nan, comp['at']) +comp = comp.dropna(subset=['at']) + +####################################################################################################################### +# CRSP Block # +####################################################################################################################### +# Create a CRSP Subsample with Monthly Stock and Event Variables +# Restrictions will be applied later +# Select variables from the CRSP monthly stock and event datasets +crsp = conn.raw_sql(""" + select a.prc, a.ret, a.retx, a.shrout, a.vol, a.cfacpr, a.cfacshr, a.date, a.permno, a.permco, + b.ticker, b.ncusip, b.shrcd, b.exchcd + from crsp.msf as a + left join crsp.msenames as b + on a.permno=b.permno + and b.namedt<=a.date + and a.date<=b.nameendt + where 
a.date >= '01/01/1959' + and b.exchcd between 1 and 3 + """) + +# change variable format to int +crsp[['permco', 'permno', 'shrcd', 'exchcd']] = crsp[['permco', 'permno', 'shrcd', 'exchcd']].astype(int) + +# Line up date to be end of month +crsp['date'] = pd.to_datetime(crsp['date']) +crsp['monthend'] = crsp['date'] + MonthEnd(0) # set all the date to the standard end date of month + +crsp = crsp.dropna(subset=['prc']) +crsp['me'] = crsp['prc'].abs() * crsp['shrout'] # calculate market equity + +# if Market Equity is Nan then let return equals to 0 +crsp['ret'] = np.where(crsp['me'].isnull(), 0, crsp['ret']) +crsp['retx'] = np.where(crsp['me'].isnull(), 0, crsp['retx']) + +# impute me +crsp = crsp.sort_values(by=['permno', 'date']).drop_duplicates() +crsp['me'] = np.where(crsp['permno'] == crsp['permno'].shift(1), crsp['me'].fillna(method='ffill'), crsp['me']) + +# Aggregate Market Cap +''' +There are cases when the same firm (permco) has two or more securities (permno) at same date. +For the purpose of ME for the firm, we aggregated all ME for a given permco, date. +This aggregated ME will be assigned to the permno with the largest ME. 
+''' +# sum of me across different permno belonging to same permco a given date +crsp_summe = crsp.groupby(['monthend', 'permco'])['me'].sum().reset_index() +# largest mktcap within a permco/date +crsp_maxme = crsp.groupby(['monthend', 'permco'])['me'].max().reset_index() +# join by monthend/maxme to find the permno +crsp1 = pd.merge(crsp, crsp_maxme, how='inner', on=['monthend', 'permco', 'me']) +# drop me column and replace with the sum me +crsp1 = crsp1.drop(['me'], axis=1) +# join with sum of me to get the correct market cap info +crsp2 = pd.merge(crsp1, crsp_summe, how='inner', on=['monthend', 'permco']) +# sort by permno and date and also drop duplicates +crsp2 = crsp2.sort_values(by=['permno', 'monthend']).drop_duplicates() + +####################################################################################################################### +# CCM Block # +####################################################################################################################### +# merge CRSP and Compustat +# reference: https://wrds-www.wharton.upenn.edu/pages/support/applications/linking-databases/linking-crsp-and-compustat/ +ccm = conn.raw_sql(""" + select gvkey, lpermno as permno, linktype, linkprim, + linkdt, linkenddt + from crsp.ccmxpf_linktable + where substr(linktype,1,1)='L' + and (linkprim ='C' or linkprim='P') + """) + +ccm['linkdt'] = pd.to_datetime(ccm['linkdt']) +ccm['linkenddt'] = pd.to_datetime(ccm['linkenddt']) + +# if linkenddt is missing then set to today date +ccm['linkenddt'] = ccm['linkenddt'].fillna(pd.to_datetime('today')) + +# merge ccm and comp +ccm1 = pd.merge(comp, ccm, how='left', on=['gvkey']) + +# we can only get the accounting data after the firm public their report +# for annual data, we ues 6 months lagged data +ccm1['yearend'] = ccm1['datadate'] + YearEnd(0) +ccm1['jdate'] = ccm1['yearend'] + MonthEnd(6) + +# set link date bounds +ccm2 = ccm1[(ccm1['jdate'] >= ccm1['linkdt']) & (ccm1['jdate'] <= ccm1['linkenddt'])] + +# link 
comp and crsp +crsp2 = crsp2.rename(columns={'monthend': 'jdate'}) +data_rawa = pd.merge(crsp2, ccm2, how='inner', on=['permno', 'jdate']) + +# filter exchcd & shrcd +data_rawa = data_rawa[((data_rawa['exchcd'] == 1) | (data_rawa['exchcd'] == 2) | (data_rawa['exchcd'] == 3)) & + ((data_rawa['shrcd'] == 10) | (data_rawa['shrcd'] == 11))] + +# process Market Equity +''' +Note: me is CRSP market equity, mve_f is Compustat market equity. Please choose the me below. +''' +data_rawa['me'] = data_rawa['me']/1000 # CRSP ME +# data_rawa['me'] = data_rawa['mve_f'] # Compustat ME + +# there are some ME equal to zero since this company do not have price or shares data, we drop these observations +data_rawa['me'] = np.where(data_rawa['me'] == 0, np.nan, data_rawa['me']) +data_rawa = data_rawa.dropna(subset=['me']) + +# count single stock years +# data_rawa['count'] = data_rawa.groupby(['gvkey']).cumcount() + +# deal with the duplicates +data_rawa.loc[data_rawa.groupby(['datadate', 'permno', 'linkprim'], as_index=False).nth([0]).index, 'temp'] = 1 +data_rawa = data_rawa[data_rawa['temp'].notna()] +data_rawa.loc[data_rawa.groupby(['permno', 'yearend', 'datadate'], as_index=False).nth([-1]).index, 'temp'] = 1 +data_rawa = data_rawa[data_rawa['temp'].notna()] + +data_rawa = data_rawa.sort_values(by=['permno', 'jdate']) + +####################################################################################################################### +# Annual Variables # +####################################################################################################################### +# preferrerd stock +data_rawa['ps'] = np.where(data_rawa['pstkrv'].isnull(), data_rawa['pstkl'], data_rawa['pstkrv']) +data_rawa['ps'] = np.where(data_rawa['ps'].isnull(), data_rawa['pstk'], data_rawa['ps']) +data_rawa['ps'] = np.where(data_rawa['ps'].isnull(), 0, data_rawa['ps']) + +data_rawa['txditc'] = data_rawa['txditc'].fillna(0) + +# book equity +data_rawa['be'] = data_rawa['seq'] + 
data_rawa['txditc'] - data_rawa['ps'] +data_rawa['be'] = np.where(data_rawa['be'] > 0, data_rawa['be'], np.nan) + +# acc +data_rawa['act_l1'] = data_rawa.groupby(['permno'])['act'].shift(1) +data_rawa['lct_l1'] = data_rawa.groupby(['permno'])['lct'].shift(1) + +condlist = [data_rawa['np'].isnull(), + data_rawa['act'].isnull() | data_rawa['lct'].isnull()] +choicelist = [((data_rawa['act']-data_rawa['lct'])-(data_rawa['act_l1']-data_rawa['lct_l1'])/(10*data_rawa['be'])), + (data_rawa['ib']-data_rawa['oancf'])/(10*data_rawa['be'])] +data_rawa['acc'] = np.select(condlist, + choicelist, + default=((data_rawa['act']-data_rawa['lct']+data_rawa['np'])- + (data_rawa['act_l1']-data_rawa['lct_l1']+data_rawa['np'].shift(1)))/(10*data_rawa['be'])) + +# agr +data_rawa['at_l1'] = data_rawa.groupby(['permno'])['at'].shift(1) +data_rawa['agr'] = (data_rawa['at']-data_rawa['at_l1'])/data_rawa['at_l1'] + +# bm +data_rawa['bm'] = data_rawa['be'] / data_rawa['me'] +data_rawa['bm_n'] = data_rawa['be'] + +# cfp +condlist = [data_rawa['dp'].isnull(), + data_rawa['ib'].isnull()] +choicelist = [data_rawa['ib']/data_rawa['me'], + np.nan] +data_rawa['cfp'] = np.select(condlist, choicelist, default=(data_rawa['ib']+data_rawa['dp'])/data_rawa['me']) + +condlist = [data_rawa['dp'].isnull(), + data_rawa['ib'].isnull()] +choicelist = [data_rawa['ib'], + np.nan] +data_rawa['cfp_n'] = np.select(condlist, choicelist, default=data_rawa['ib']+data_rawa['dp']) + +# ep +data_rawa['ep'] = data_rawa['ib']/data_rawa['me'] +data_rawa['ep_n'] = data_rawa['ib'] + +# ni +data_rawa['csho_l1'] = data_rawa.groupby(['permno'])['csho'].shift(1) +data_rawa['ajex_l1'] = data_rawa.groupby(['permno'])['ajex'].shift(1) +data_rawa['ni'] = np.where(data_rawa['gvkey'] != data_rawa['gvkey'].shift(1), + np.nan, + np.log(data_rawa['csho']*data_rawa['ajex']).replace(-np.inf, 0)- + np.log(data_rawa['csho_l1']*data_rawa['ajex_l1']).replace(-np.inf, 0)) + +# op +data_rawa['cogs0'] = np.where(data_rawa['cogs'].isnull(), 0, 
data_rawa['cogs']) +data_rawa['xint0'] = np.where(data_rawa['xint'].isnull(), 0, data_rawa['xint']) +data_rawa['xsga0'] = np.where(data_rawa['xsga'].isnull(), 0, data_rawa['xsga']) + +condlist = [data_rawa['revt'].isnull(), data_rawa['be'].isnull()] +choicelist = [np.nan, np.nan] +data_rawa['op'] = np.select(condlist, choicelist, + default=(data_rawa['revt'] - data_rawa['cogs0'] - data_rawa['xsga0'] - data_rawa['xint0'])/data_rawa['be']) + +# rsup +data_rawa['sale_l1'] = data_rawa.groupby(['permno'])['sale'].shift(1) +data_rawa['rsup'] = (data_rawa['sale']-data_rawa['sale_l1'])/data_rawa['me'] + +# sue +# data_rawa['ib_l1'] = data_rawa.groupby(['permno'])['ib'].shift(1) +# data_rawa['sue'] = (data_rawa['ib']-data_rawa['ib_l1'])/data_rawa['me'] + +# cash +data_rawa['cash'] = data_rawa['che']/data_rawa['at'] + +# lev +data_rawa['lev'] = data_rawa['lt']/data_rawa['me'] + +# sp +data_rawa['sp'] = data_rawa['sale']/data_rawa['me'] +data_rawa['sp_n'] = data_rawa['sale'] + +# rd_sale +data_rawa['rd_sale'] = data_rawa['xrd']/data_rawa['sale'] + +# rdm +data_rawa['rdm'] = data_rawa['xrd']/data_rawa['me'] + +# adm hxz adm +data_rawa['adm'] = data_rawa['xad']/data_rawa['me'] + +# gma +data_rawa['gma'] = (data_rawa['revt']-data_rawa['cogs'])/data_rawa['at_l1'] + +# chcsho +data_rawa['chcsho'] = (data_rawa['csho']/data_rawa['csho_l1'])-1 + +# lgr +data_rawa['lt_l1'] = data_rawa.groupby(['permno'])['lt'].shift(1) +data_rawa['lgr'] = (data_rawa['lt']/data_rawa['lt_l1'])-1 + +# pctacc +data_rawa['che_l1'] = data_rawa.groupby(['permno'])['che'].shift(1) +data_rawa['dlc_l1'] = data_rawa.groupby(['permno'])['dlc'].shift(1) +data_rawa['txp_l1'] = data_rawa.groupby(['permno'])['txp'].shift(1) + +condlist = [data_rawa['ib']==0, + data_rawa['oancf'].isnull(), + data_rawa['oancf'].isnull() & data_rawa['ib']==0] +choicelist = [(data_rawa['ib']-data_rawa['oancf'])/0.01, + ((data_rawa['act'] - data_rawa['act_l1']) - (data_rawa['che'] - data_rawa['che_l1']))- + ((data_rawa['lct'] - 
data_rawa['lct_l1']) - (data_rawa['dlc']) - data_rawa['dlc_l1']- + ((data_rawa['txp'] - data_rawa['txp_l1']) - data_rawa['dp']))/data_rawa['ib'].abs(), + ((data_rawa['act'] - data_rawa['act_l1']) - (data_rawa['che'] - data_rawa['che_l1'])) - + ((data_rawa['lct'] - data_rawa['lct_l1']) - (data_rawa['dlc']) - data_rawa['dlc_l1'] - + ((data_rawa['txp'] - data_rawa['txp_l1']) - data_rawa['dp']))] +data_rawa['pctacc'] = np.select(condlist, choicelist, default=(data_rawa['ib']-data_rawa['oancf'])/data_rawa['ib'].abs()) + +# age +# data_rawa['age'] = data_rawa['count'] + +# sgr +data_rawa['sgr'] = (data_rawa['sale']/data_rawa['sale_l1'])-1 + +# chpm +# data_rawa['chpm'] = (data_rawa['ib']/data_rawa['sale'])-(data_rawa['ib_l1']/data_rawa['sale_l1']) + +# chato +data_rawa['at_l2'] = data_rawa.groupby(['permno'])['at'].shift(2) +data_rawa['chato'] = (data_rawa['sale']/((data_rawa['at']+data_rawa['at_l1'])/2))-\ + (data_rawa['sale_l1']/((data_rawa['at']+data_rawa['at_l2'])/2)) + +# chtx +data_rawa['txt_l1'] = data_rawa.groupby(['permno'])['txt'].shift(1) +data_rawa['chtx'] = (data_rawa['txt']-data_rawa['txt_l1'])/data_rawa['at_l1'] + +# ala +# data_rawa['ala'] = data_rawa['che']+0.75*(data_rawa['act']-data_rawa['che'])-\ +# 0.5*(data_rawa['at']-data_rawa['act']-data_rawa['gdwl']-data_rawa['intan']) + +# alm +# data_rawa['alm'] = data_rawa['ala']/(data_rawa['at']+data_rawa['prcc_f']*data_rawa['csho']-data_rawa['ceq']) + +# noa +data_rawa['noa'] = ((data_rawa['at']-data_rawa['che']-data_rawa['ivao'].fillna(0))- + (data_rawa['at']-data_rawa['dlc'].fillna(0)-data_rawa['dltt'].fillna(0)-data_rawa['mib'].fillna(0) + -data_rawa['pstk'].fillna(0)-data_rawa['ceq'])/data_rawa['at_l1']) + +# rna +data_rawa['noa_l1'] = data_rawa.groupby(['permno'])['noa'].shift(1) +data_rawa['rna'] = data_rawa['oiadp']/data_rawa['noa_l1'] + +# pm +data_rawa['pm'] = data_rawa['oiadp']/data_rawa['sale'] + +# ato +data_rawa['ato'] = data_rawa['sale']/data_rawa['noa_l1'] + +# depr +data_rawa['depr'] = 
data_rawa['dp']/data_rawa['ppent'] + +# invest +data_rawa['ppent_l1'] = data_rawa.groupby(['permno'])['ppent'].shift(1) +data_rawa['invt_l1'] = data_rawa.groupby(['permno'])['invt'].shift(1) + +data_rawa['invest'] = np.where(data_rawa['ppegt'].isnull(), ((data_rawa['ppent']-data_rawa['ppent_l1'])+ + (data_rawa['invt']-data_rawa['invt_l1']))/data_rawa['at_l1'], + ((data_rawa['ppegt']-data_rawa['ppent_l1'])+(data_rawa['invt']-data_rawa['invt_l1']))/data_rawa['at_l1']) + +# egr +data_rawa['ceq_l1'] = data_rawa.groupby(['permno'])['ceq'].shift(1) +data_rawa['egr'] = ((data_rawa['ceq']-data_rawa['ceq_l1'])/data_rawa['ceq_l1']) + +# cashdebt +data_rawa['cashdebt'] = (data_rawa['ib']+data_rawa['dp'])/((data_rawa['lt']+data_rawa['lt_l1'])/2) + +# # grltnoa +# lag_a['aco'] = np.where(data_rawa['gvkey'] == data_rawa['gvkey'].shift(1), data_rawa['aco'].shift(1), np.nan) +# lag_a['intan'] = np.where(data_rawa['gvkey'] == data_rawa['gvkey'].shift(1), data_rawa['intan'].shift(1), np.nan) +# lag_a['ao'] = np.where(data_rawa['gvkey'] == data_rawa['gvkey'].shift(1), data_rawa['ao'].shift(1), np.nan) +# lag_a['ap'] = np.where(data_rawa['gvkey'] == data_rawa['gvkey'].shift(1), data_rawa['ap'].shift(1), np.nan) +# lag_a['lco'] = np.where(data_rawa['gvkey'] == data_rawa['gvkey'].shift(1), data_rawa['lco'].shift(1), np.nan) +# lag_a['lo'] = np.where(data_rawa['gvkey'] == data_rawa['gvkey'].shift(1), data_rawa['lo'].shift(1), np.nan) +# lag_a['rect'] = np.where(data_rawa['gvkey'] == data_rawa['gvkey'].shift(1), data_rawa['rect'].shift(1), np.nan) +# +# data_rawa['grltnoa'] = ((data_rawa['rect']+data_rawa['invt']+data_rawa['ppent']+data_rawa['aco']+data_rawa['intan']+ +# data_rawa['ao']-data_rawa['ap']-data_rawa['lco']-data_rawa['lo'])- +# (lag_a['rect']+lag_a['invt']+lag_a['ppent']+lag_a['aco']+lag_a['intan']+lag_a['ao']-lag_a['ap']- +# lag_a['lco']-lag_a['lo'])-\ +# (data_rawa['rect']-lag_a['rect']+data_rawa['invt']-lag_a['invt']+data_rawa['aco']-lag_a['aco']- +# 
(data_rawa['ap']-lag_a['ap']+data_rawa['lco']-lag_a['lco'])-data_rawa['dp']))/((data_rawa['at']+lag_a['at'])/2) + +# rd +# if ((xrd/at)-(lag(xrd/lag(at))))/(lag(xrd/lag(at))) >.05 then rd=1; else rd=0; +data_rawa['xrd/at_l1'] = data_rawa['xrd']/data_rawa['at_l1'] +data_rawa['xrd/at_l1_l1'] = data_rawa.groupby(['permno'])['xrd/at_l1'].shift(1) +data_rawa['rd'] = np.where(((data_rawa['xrd']/data_rawa['at'])- + (data_rawa['xrd/at_l1_l1']))/data_rawa['xrd/at_l1_l1']>0.05, 1, 0) + +# roa +data_rawa['roa'] = data_rawa['ni']/((data_rawa['at']+data_rawa['at_l1'])/2) + +# roe +data_rawa['roe'] = data_rawa['ib']/data_rawa['ceq_l1'] + +# dy +data_rawa['dy'] = data_rawa['dvt']/data_rawa['me'] + +# Annual Accounting Variables +chars_a = data_rawa[['cusip', 'ncusip', 'gvkey', 'permno', 'exchcd', 'shrcd', 'datadate', 'jdate', + 'sic', 'acc', 'agr', 'bm', 'cfp', 'ep', 'ni', 'op', 'rsup', 'cash', 'chcsho', + 'rd', 'cashdebt', 'pctacc', 'gma', 'lev', 'rdm', 'adm', 'sgr', 'sp', 'invest', 'roe', + 'rd_sale', 'lgr', 'roa', 'depr', 'egr', 'chato', 'chtx', 'noa', 'rna', 'pm', 'ato', 'dy']] +chars_a.reset_index(drop=True, inplace=True) +####################################################################################################################### +# Compustat Quarterly Raw Info # +####################################################################################################################### +comp = conn.raw_sql(""" + /*header info*/ + select c.gvkey, f.cusip, f.datadate, f.fyearq, substr(c.sic,1,2) as sic2, c.sic, f.fqtr, f.rdq, + + /*income statement*/ + f.ibq, f.saleq, f.txtq, f.revtq, f.cogsq, f.xsgaq, f.revty, f.cogsy, f.saley, + + /*balance sheet items*/ + f.atq, f.actq, f.cheq, f.lctq, f.dlcq, f.ppentq, f.ppegtq, + + /*others*/ + abs(f.prccq) as prccq, abs(f.prccq)*f.cshoq as mveq_f, f.ceqq, f.seqq, f.pstkq, f.ltq, + f.pstkrq, f.gdwlq, f.intanq, f.mibq, f.oiadpq, f.ivaoq, + + /* v3 my formula add*/ + f.ajexq, f.cshoq, f.txditcq, f.npq, f.xrdy, f.xrdq, f.dpq, f.xintq, 
f.invtq, f.scstkcy, f.niq, + f.oancfy, f.dlttq + + from comp.fundq as f + left join comp.company as c + on f.gvkey = c.gvkey + + /*get consolidated, standardized, industrial format statements*/ + where f.indfmt = 'INDL' + and f.datafmt = 'STD' + and f.popsrc = 'D' + and f.consol = 'C' + and f.datadate >= '01/01/1959' + """) + +# comp['cusip6'] = comp['cusip'].str.strip().str[0:6] +comp = comp.dropna(subset=['ibq']) + +# sort and clean up +comp = comp.sort_values(by=['gvkey', 'datadate']).drop_duplicates() +comp['cshoq'] = np.where(comp['cshoq'] == 0, np.nan, comp['cshoq']) +comp['ceqq'] = np.where(comp['ceqq'] == 0, np.nan, comp['ceqq']) +comp['atq'] = np.where(comp['atq'] == 0, np.nan, comp['atq']) +comp = comp.dropna(subset=['atq']) + +# convert datadate to date fmt +comp['datadate'] = pd.to_datetime(comp['datadate']) + +# merge ccm and comp +ccm1 = pd.merge(comp, ccm, how='left', on=['gvkey']) +ccm1['yearend'] = ccm1['datadate'] + YearEnd(0) +ccm1['jdate'] = ccm1['datadate'] + MonthEnd(3) # we change quarterly lag here +# ccm1['jdate'] = ccm1['datadate']+MonthEnd(4) + +# set link date bounds +ccm2 = ccm1[(ccm1['jdate'] >= ccm1['linkdt']) & (ccm1['jdate'] <= ccm1['linkenddt'])] + +# merge ccm2 and crsp2 +# crsp2['jdate'] = crsp2['monthend'] +data_rawq = pd.merge(crsp2, ccm2, how='inner', on=['permno', 'jdate']) + +# filter exchcd & shrcd +data_rawq = data_rawq[((data_rawq['exchcd'] == 1) | (data_rawq['exchcd'] == 2) | (data_rawq['exchcd'] == 3)) & + ((data_rawq['shrcd'] == 10) | (data_rawq['shrcd'] == 11))] + +# process Market Equity +''' +Note: me is CRSP market equity, mveq_f is Compustat market equity. Please choose the me below. 
+''' +data_rawq['me'] = data_rawq['me']/1000 # CRSP ME +# data_rawq['me'] = data_rawq['mveq_f'] # Compustat ME + +# there are some ME equal to zero since this company do not have price or shares data, we drop these observations +data_rawq['me'] = np.where(data_rawq['me'] == 0, np.nan, data_rawq['me']) +data_rawq = data_rawq.dropna(subset=['me']) + +# count single stock years +# data_rawq['count'] = data_rawq.groupby(['gvkey']).cumcount() + +# deal with the duplicates +data_rawq.loc[data_rawq.groupby(['datadate', 'permno', 'linkprim'], as_index=False).nth([0]).index, 'temp'] = 1 +data_rawq = data_rawq[data_rawq['temp'].notna()] +data_rawq.loc[data_rawq.groupby(['permno', 'yearend', 'datadate'], as_index=False).nth([-1]).index, 'temp'] = 1 +data_rawq = data_rawq[data_rawq['temp'].notna()] + +data_rawq = data_rawq.sort_values(by=['permno', 'jdate']) + +####################################################################################################################### +# Quarterly Variables # +####################################################################################################################### +# prepare be +data_rawq['beq'] = np.where(data_rawq['seqq']>0, data_rawq['seqq']+data_rawq['txditcq']-data_rawq['pstkq'], np.nan) +data_rawq['beq'] = np.where(data_rawq['beq']<=0, np.nan, data_rawq['beq']) + +# dy +data_rawq['me_l1'] = data_rawq.groupby(['permno'])['me'].shift(1) +data_rawq['retdy'] = data_rawq['ret'] - data_rawq['retx'] +data_rawq['mdivpay'] = data_rawq['retdy']*data_rawq['me_l1'] + +data_rawq['dy'] = ttm12(series='mdivpay', df=data_rawq)/data_rawq['me'] + +# # pstk +# chars_q['pstk'] = np.where(data_rawq['pstkrq'].notna(), data_rawq['pstkrq'], data_rawq['pstkq']) +# +# # scal +# condlist = [data_rawq['seqq'].isnull(), +# data_rawq['seqq'].isnull() & (data_rawq['ceqq'].isnull() | chars_q['pstk'].isnull())] +# choicelist = [data_rawq['ceqq']+chars_q['pstk'], +# data_rawq['atq']-data_rawq['ltq']] +# chars_q['scal'] = np.select(condlist, 
choicelist, default=data_rawq['seqq']) + +# chtx +data_rawq['txtq_l4'] = data_rawq.groupby(['permno'])['txtq'].shift(4) +data_rawq['atq_l4'] = data_rawq.groupby(['permno'])['atq'].shift(4) +data_rawq['chtx'] = (data_rawq['txtq']-data_rawq['txtq_l4'])/data_rawq['atq_l4'] + +# roa +data_rawq['atq_l1'] = data_rawq.groupby(['permno'])['atq'].shift(1) +data_rawq['roa'] = data_rawq['ibq']/data_rawq['atq_l1'] + +# cash +data_rawq['cash'] = data_rawq['cheq']/data_rawq['atq'] + +# acc +data_rawq['actq_l4'] = data_rawq.groupby(['permno'])['actq'].shift(4) +data_rawq['lctq_l4'] = data_rawq.groupby(['permno'])['lctq'].shift(4) +data_rawq['npq_l4'] = data_rawq.groupby(['permno'])['npq'].shift(4) +condlist = [data_rawq['npq'].isnull(), + data_rawq['actq'].isnull() | data_rawq['lctq'].isnull()] +choicelist = [((data_rawq['actq']-data_rawq['lctq'])-(data_rawq['actq_l4']-data_rawq['lctq_l4']))/(10*data_rawq['beq']), + np.nan] +data_rawq['acc'] = np.select(condlist, choicelist, + default=((data_rawq['actq']-data_rawq['lctq']+data_rawq['npq'])- + (data_rawq['actq_l4']-data_rawq['lctq_l4']+data_rawq['npq_l4']))/(10*data_rawq['beq'])) + +# bm +data_rawq['bm'] = data_rawq['beq']/data_rawq['me'] +data_rawq['bm_n'] = data_rawq['beq'] + +# cfp +data_rawq['cfp'] = np.where(data_rawq['dpq'].isnull(), + ttm4('ibq', data_rawq)/data_rawq['me'], + (ttm4('ibq', data_rawq)+ttm4('dpq', data_rawq))/data_rawq['me']) +data_rawq['cfp_n'] = data_rawq['cfp']*data_rawq['me'] + +# ep +data_rawq['ep'] = ttm4('ibq', data_rawq)/data_rawq['me'] +data_rawq['ep_n'] = data_rawq['ep']*data_rawq['me'] + +# agr +data_rawq['agr'] = (data_rawq['atq']-data_rawq['atq_l4'])/data_rawq['atq_l4'] + +# ni +data_rawq['cshoq_l4'] = data_rawq.groupby(['permno'])['cshoq'].shift(4) +data_rawq['ajexq_l4'] = data_rawq.groupby(['permno'])['ajexq'].shift(4) +data_rawq['ni'] = np.where(data_rawq['cshoq'].isnull(), np.nan, + np.log(data_rawq['cshoq']*data_rawq['ajexq']).replace(-np.inf, 
0)-np.log(data_rawq['cshoq_l4']*data_rawq['ajexq_l4'])) + +# op +data_rawq['xintq0'] = np.where(data_rawq['xintq'].isnull(), 0, data_rawq['xintq']) +data_rawq['xsgaq0'] = np.where(data_rawq['xsgaq'].isnull(), 0, data_rawq['xsgaq']) +data_rawq['beq_l4'] = data_rawq.groupby(['permno'])['beq'].shift(4) + +data_rawq['op'] = (ttm4('revtq', data_rawq)-ttm4('cogsq', data_rawq)-ttm4('xsgaq0', data_rawq)-ttm4('xintq0', data_rawq))/data_rawq['beq_l4'] + +# sue +# data_rawq['ibq_l4'] = data_rawq.groupby(['permno'])['ibq'].shift(4) +# data_rawq['sue'] = (data_rawq['ibq']-data_rawq['ibq_l4'])/data_rawq['me'].abs() + +# csho +data_rawq['chcsho'] = (data_rawq['cshoq']/data_rawq['cshoq_l4'])-1 + +# cashdebt +data_rawq['ltq_l4'] = data_rawq.groupby(['permno'])['ltq'].shift(4) +data_rawq['cashdebt'] = (ttm4('ibq', data_rawq) + ttm4('dpq', data_rawq))/((data_rawq['ltq']+data_rawq['ltq_l4'])/2) + +# rd +data_rawq['xrdq4'] = ttm4('xrdq', data_rawq) +data_rawq['xrdq4'] = np.where(data_rawq['xrdq4'].isnull(), data_rawq['xrdy'], data_rawq['xrdq4']) + +data_rawq['xrdq4/atq_l4'] = data_rawq['xrdq4']/data_rawq['atq_l4'] +data_rawq['xrdq4/atq_l4_l4'] = data_rawq.groupby(['permno'])['xrdq4/atq_l4'].shift(4) +data_rawq['rd'] = np.where(((data_rawq['xrdq4']/data_rawq['atq'])-data_rawq['xrdq4/atq_l4_l4'])/data_rawq['xrdq4/atq_l4_l4']>0.05, 1, 0) + +# pctacc +condlist = [data_rawq['npq'].isnull(), + data_rawq['actq'].isnull() | data_rawq['lctq'].isnull()] +choicelist = [((data_rawq['actq']-data_rawq['lctq'])-(data_rawq['actq_l4']-data_rawq['lctq_l4']))/abs(ttm4('ibq', data_rawq)), np.nan] +data_rawq['pctacc'] = np.select(condlist, choicelist, + default=((data_rawq['actq']-data_rawq['lctq']+data_rawq['npq'])-(data_rawq['actq_l4']-data_rawq['lctq_l4']+data_rawq['npq_l4']))/ + abs(ttm4('ibq', data_rawq))) + +# gma +data_rawq['revtq4'] = ttm4('revtq', data_rawq) +data_rawq['cogsq4'] = ttm4('cogsq', data_rawq) +data_rawq['gma'] = (data_rawq['revtq4']-data_rawq['cogsq4'])/data_rawq['atq_l4'] + +# lev 
+data_rawq['lev'] = data_rawq['ltq']/data_rawq['me'] + +# rdm +data_rawq['rdm'] = data_rawq['xrdq4']/data_rawq['me'] + +# sgr +data_rawq['saleq4'] = ttm4('saleq', data_rawq) +data_rawq['saleq4'] = np.where(data_rawq['saleq4'].isnull(), data_rawq['saley'], data_rawq['saleq4']) + +data_rawq['saleq4_l4'] = data_rawq.groupby(['permno'])['saleq4'].shift(4) +data_rawq['sgr'] = (data_rawq['saleq4']/data_rawq['saleq4_l4'])-1 + +# sp +data_rawq['sp'] = data_rawq['saleq4']/data_rawq['me'] +data_rawq['sp_n'] = data_rawq['saleq4'] + +# invest +data_rawq['ppentq_l4'] = data_rawq.groupby(['permno'])['ppentq'].shift(4) +data_rawq['invtq_l4'] = data_rawq.groupby(['permno'])['invtq'].shift(4) +data_rawq['ppegtq_l4'] = data_rawq.groupby(['permno'])['ppegtq'].shift(4) + +data_rawq['invest'] = np.where(data_rawq['ppegtq'].isnull(), ((data_rawq['ppentq']-data_rawq['ppentq_l4'])+ + (data_rawq['invtq']-data_rawq['invtq_l4']))/data_rawq['atq_l4'], + ((data_rawq['ppegtq']-data_rawq['ppegtq_l4'])+(data_rawq['invtq']-data_rawq['invtq_l4']))/data_rawq['atq_l4']) + +# rd_sale +data_rawq['rd_sale'] = data_rawq['xrdq4']/data_rawq['saleq4'] + +# lgr +data_rawq['lgr'] = (data_rawq['ltq']/data_rawq['ltq_l4'])-1 + +# depr +data_rawq['depr'] = ttm4('dpq', data_rawq)/data_rawq['ppentq'] + +# egr +data_rawq['ceqq_l4'] = data_rawq.groupby(['permno'])['ceqq'].shift(4) +data_rawq['egr'] = (data_rawq['ceqq']-data_rawq['ceqq_l4'])/data_rawq['ceqq_l4'] + +# grltnoa +# lag_q['rectq4'] = np.where(data_rawq['gvkey'] == data_rawq['gvkey'].shift(4), data_rawq['rectq'].shift(4), np.nan) +# lag_q['acoq4'] = np.where(data_rawq['gvkey'] == data_rawq['gvkey'].shift(4), data_rawq['acoq'].shift(4), np.nan) +# lag_q['apq4'] = np.where(data_rawq['gvkey'] == data_rawq['gvkey'].shift(4), data_rawq['apq'].shift(4), np.nan) +# lag_q['lcoq4'] = np.where(data_rawq['gvkey'] == data_rawq['gvkey'].shift(4), data_rawq['lcoq'].shift(4), np.nan) +# lag_q['loq4'] = np.where(data_rawq['gvkey'] == data_rawq['gvkey'].shift(4), 
data_rawq['loq'].shift(4), np.nan) +# +# chars_q['grltnoa'] = ((data_rawq['rectq']+data_rawq['invtq']+data_rawq['ppentq']+data_rawq['acoq']+data_rawq['intanq']+ +# data_rawq['aoq']-data_rawq['apq']-data_rawq['lcoq']-data_rawq['loq'])- +# (lag_q['rectq4']+lag_q['invtq4']+lag_q['ppentq4']+lag_q['acoq4']-lag_q['apq4']-lag_q['lcoq4']-lag_q['loq4'])-\ +# (data_rawq['rectq']-lag_q['rectq4']+data_rawq['invtq']-lag_q['invtq4']+data_rawq['acoq']- +# (data_rawq['apq']-lag_q['apq4']+data_rawq['lcoq']-lag_q['lcoq4'])- +# ttm4('dpq', data_rawq)))/((data_rawq['atq']+lag_q['atq4'])/2) + +# chpm +data_rawq['ibq4'] = ttm4('ibq', data_rawq) +data_rawq['ibq4_l1'] = data_rawq.groupby(['permno'])['ibq4'].shift(1) +data_rawq['saleq4_l1'] = data_rawq.groupby(['permno'])['saleq4'].shift(1) + +data_rawq['chpm'] = (data_rawq['ibq4']/data_rawq['saleq4'])-(data_rawq['ibq4_l1']/data_rawq['saleq4_l1']) + +# chato +data_rawq['atq_l8'] = data_rawq.groupby(['permno'])['atq'].shift(8) +data_rawq['chato'] = (data_rawq['saleq4']/((data_rawq['atq']+data_rawq['atq_l4'])/2))-(data_rawq['saleq4_l4']/((data_rawq['atq_l4']+data_rawq['atq_l8'])/2)) + +# ala +# data_rawq['ala'] = data_rawq['cheq'] + 0.75*(data_rawq['actq']-data_rawq['cheq'])+\ +# 0.5*(data_rawq['atq']-data_rawq['actq']-data_rawq['gdwlq']-data_rawq['intanq']) + +# alm +# data_rawq['alm'] = data_rawq['ala']/(data_rawq['atq']+data_rawq['me']-data_rawq['ceqq']) + +# noa +data_rawq['ivaoq'] = np.where(data_rawq['ivaoq'].isnull(), 0, 1) +data_rawq['dlcq'] = np.where(data_rawq['dlcq'].isnull(), 0, 1) +data_rawq['dlttq'] = np.where(data_rawq['dlttq'].isnull(), 0, 1) +data_rawq['mibq'] = np.where(data_rawq['mibq'].isnull(), 0, 1) +data_rawq['pstkq'] = np.where(data_rawq['pstkq'].isnull(), 0, 1) +data_rawq['noa'] = (data_rawq['atq']-data_rawq['cheq']-data_rawq['ivaoq'])-\ + (data_rawq['atq']-data_rawq['dlcq']-data_rawq['dlttq']-data_rawq['mibq']-data_rawq['pstkq']-data_rawq['ceqq'])/data_rawq['atq_l4'] + +# rna +data_rawq['noa_l4'] = 
data_rawq.groupby(['permno'])['noa'].shift(4) +data_rawq['rna'] = data_rawq['oiadpq']/data_rawq['noa_l4'] + +# pm +data_rawq['pm'] = data_rawq['oiadpq']/data_rawq['saleq'] + +# ato +data_rawq['ato'] = data_rawq['saleq']/data_rawq['noa_l4'] + +# roe +data_rawq['ceqq_l1'] = data_rawq.groupby(['permno'])['ceqq'].shift(1) +data_rawq['roe'] = data_rawq['ibq']/data_rawq['ceqq_l1'] + +# Quarterly Accounting Variables +chars_q = data_rawq[['gvkey', 'permno', 'datadate', 'jdate', 'sic', 'exchcd', 'shrcd', 'acc', 'bm', 'cfp', + 'ep', 'agr', 'ni', 'op', 'cash', 'chcsho', 'rd', 'cashdebt', 'pctacc', 'gma', 'lev', + 'rdm', 'sgr', 'sp', 'invest', 'rd_sale', 'lgr', 'roa', 'depr', 'egr', 'roe', + 'chato', 'chpm', 'chtx', 'noa', 'rna', 'pm', 'ato']] +chars_q.reset_index(drop=True, inplace=True) + +####################################################################################################################### +# Momentum # +####################################################################################################################### +crsp_mom = conn.raw_sql(""" + select permno, date, ret, retx, prc, shrout + from crsp.msf + where date >= '01/01/1959' + """) + +crsp_mom['permno'] = crsp_mom['permno'].astype(int) +crsp_mom['jdate'] = pd.to_datetime(crsp_mom['date']) + MonthEnd(0) +crsp_mom = crsp_mom.dropna(subset=['ret', 'retx', 'prc']) + +# add delisting return +dlret = conn.raw_sql(""" + select permno, dlret, dlstdt + from crsp.msedelist + """) + +dlret.permno = dlret.permno.astype(int) +dlret['dlstdt'] = pd.to_datetime(dlret['dlstdt']) +dlret['jdate'] = dlret['dlstdt'] + MonthEnd(0) + +# merge delisting return to crsp return +crsp_mom = pd.merge(crsp_mom, dlret, how='left', on=['permno', 'jdate']) +crsp_mom['dlret'] = crsp_mom['dlret'].fillna(0) +crsp_mom['ret'] = crsp_mom['ret'].fillna(0) +crsp_mom['retadj'] = (1 + crsp_mom['ret']) * (1 + crsp_mom['dlret']) - 1 +crsp_mom['me'] = crsp_mom['prc'].abs() * crsp_mom['shrout'] # calculate market equity +crsp_mom = 
crsp_mom.drop(['dlret', 'dlstdt', 'prc', 'shrout'], axis=1) + + +def mom(start, end, df): + """ + + :param start: Order of starting lag + :param end: Order of ending lag + :param df: Dataframe + :return: Momentum factor + """ + lag = pd.DataFrame() + result = 1 + for i in range(start, end): + lag['mom%s' % i] = df.groupby(['permno'])['ret'].shift(i) + result = result * (1+lag['mom%s' % i]) + result = result - 1 + return result + + +crsp_mom['mom60m'] = mom(12, 60, crsp_mom) +crsp_mom['mom12m'] = mom(1, 12, crsp_mom) +crsp_mom['mom1m'] = crsp_mom['ret'] +crsp_mom['mom6m'] = mom(1, 6, crsp_mom) +crsp_mom['mom36m'] = mom(1, 36, crsp_mom) +crsp_mom['seas1a'] = crsp_mom.groupby(['permno'])['ret'].shift(11) + + +# def moms(start, end, df): +# """ +# +# :param start: Order of starting lag +# :param end: Order of ending lag +# :param df: Dataframe +# :return: Momentum factor +# """ +# lag = pd.DataFrame() +# result = 1 +# for i in range(start, end): +# lag['moms%s' % i] = df.groupby['permno']['ret'].shift(i) +# result = result + lag['moms%s' % i] +# result = result/11 +# return result +# +# +# crsp_mom['moms12m'] = moms(1, 12, crsp_mom) + +# populate the chars to monthly + +# chars_a +chars_a = pd.merge(crsp_mom, chars_a, how='left', on=['permno', 'jdate']) +chars_a['datadate'] = chars_a.groupby(['permno'])['datadate'].fillna(method='ffill') +chars_a = chars_a.groupby(['permno', 'datadate'], as_index=False).fillna(method='ffill') +chars_a = chars_a[((chars_a['exchcd'] == 1) | (chars_a['exchcd'] == 2) | (chars_a['exchcd'] == 3)) & + ((chars_a['shrcd'] == 10) | (chars_a['shrcd'] == 11))] + +# chars_q +chars_q = pd.merge(crsp_mom, chars_q, how='left', on=['permno', 'jdate']) +chars_q['datadate'] = chars_q.groupby(['permno'])['datadate'].fillna(method='ffill') +chars_q = chars_q.groupby(['permno', 'datadate'], as_index=False).fillna(method='ffill') +chars_q = chars_q[((chars_q['exchcd'] == 1) | (chars_q['exchcd'] == 2) | (chars_q['exchcd'] == 3)) & + ((chars_q['shrcd'] == 10) 
| (chars_q['shrcd'] == 11))] + +with open('chars_a.pkl', 'wb') as f: + pkl.dump(chars_a, f) + +with open('chars_q.pkl', 'wb') as f: + pkl.dump(chars_q, f) \ No newline at end of file diff --git a/pychars/beta.py b/pychars/beta.py new file mode 100755 index 0000000..583806a --- /dev/null +++ b/pychars/beta.py @@ -0,0 +1,70 @@ +# BETA monthly version +# Note: Please use the latest version of pandas, this version should support returning to pd.Series after rolling + +import pandas as pd +import numpy as np +import datetime as dt +import wrds +from dateutil.relativedelta import * +from pandas.tseries.offsets import * +import datetime +import pickle as pkl + +################### +# Connect to WRDS # +################### +conn = wrds.Connection() + +# CRSP Block +crsp = conn.raw_sql(""" + select a.permno, a.date, a.ret, (a.ret - b.rf) as exret, b.mktrf + from crsp.msf as a + left join ff.factors_daily as b + on a.date=b.date + where a.date > '01/01/1959' + """) + +# sort variables by permno and date +crsp = crsp.sort_values(by=['permno', 'date']) + +# change variable format to int +crsp['permno'] = crsp['permno'].astype(int) + +# line up date to be end of month +crsp['date'] = pd.to_datetime(crsp['date']) + +###################### +# Calculate the beta # +###################### +rolling_window = 60 # 60 months + + +# TODO: find a faster way to get rolling sub dataframe +def get_beta(df): + """ + The original idea of calculate beta is using formula (X'MX)^(-1)X'MY, + where M = I - 1(1'1)^{-1}1, I is a identity matrix. 
+ + """ + temp = crsp.loc[df.index] # extract the rolling sub dataframe from original dataframe + X = np.mat(temp[['mktrf']]) + Y = np.mat(temp[['exret']]) + ones = np.mat(np.ones(rolling_window)).T + M = np.identity(rolling_window) - ones.dot((ones.T.dot(ones)).I).dot(ones.T) + beta = (X.T.dot(M).dot(X)).I.dot((X.T.dot(M).dot(Y))) + return beta + + +# calculate beta through rolling window +crsp_temp = crsp.groupby('permno').rolling(rolling_window).apply(get_beta, raw=False) + +# arrange final outcome +crsp_temp = crsp_temp[['mktrf']] # all columns values are beta, we drop extra columns here +crsp_temp = crsp_temp.rename(columns={'mktrf': 'beta'}) +crsp_temp = crsp_temp.reset_index() +crsp['beta'] = crsp_temp['beta'] +crsp = crsp.dropna(subset=['beta']) # drop NA due to rolling +crsp = crsp[['permno', 'date', 'beta']] + +with open('beta.pkl', 'wb') as f: + pkl.dump(crsp, f) \ No newline at end of file diff --git a/pychars/functions.py b/pychars/functions.py new file mode 100755 index 0000000..4ad6b40 --- /dev/null +++ b/pychars/functions.py @@ -0,0 +1,445 @@ +import pandas as pd +import pickle as pkl +import numpy as np +import re + +def ffi49(df): + condlist = [((100 <= df['sic']) & (df['sic'] <= 199)) | ((200 <= df['sic']) & (df['sic'] <= 299)) | + ((700 <= df['sic']) & (df['sic'] <= 799)) | ((910 <= df['sic']) & (df['sic'] <= 919)) | + ((2048 <= df['sic']) & (df['sic'] <= 2048)), + ((2000 <= df['sic']) & (df['sic'] <= 2009)) | ((2010 <= df['sic']) & (df['sic'] <= 2019)) | + ((2020 <= df['sic']) & (df['sic'] <= 2029)) | ((2030 <= df['sic']) & (df['sic'] <= 2039)) | + ((2040 <= df['sic']) & (df['sic'] <= 2046)) | ((2050 <= df['sic']) & (df['sic'] <= 2059)) | + ((2060 <= df['sic']) & (df['sic'] <= 2063)) | ((2070 <= df['sic']) & (df['sic'] <= 2079)) | + ((2090 <= df['sic']) & (df['sic'] <= 2092)) | ((2095 <= df['sic']) & (df['sic'] <= 2095)) | + ((2098 <= df['sic']) & (df['sic'] <= 2099)), + ((2064 <= df['sic']) & (df['sic'] <= 2068)) | ((2086 <= df['sic']) & 
(df['sic'] <= 2086)) | + ((2087 <= df['sic']) & (df['sic'] <= 2087)) | ((2096 <= df['sic']) & (df['sic'] <= 2096)) | + ((2097 <= df['sic']) & (df['sic'] <= 2097)), + ((2080 <= df['sic']) & (df['sic'] <= 2080)) | ((2082 <= df['sic']) & (df['sic'] <= 2082)) | + ((2083 <= df['sic']) & (df['sic'] <= 2083)) | ((2084 <= df['sic']) & (df['sic'] <= 2084)) | + ((2085 <= df['sic']) & (df['sic'] <= 2085)), + ((2100 <= df['sic']) & (df['sic'] <= 2199)), + ((920 <= df['sic']) & (df['sic'] <= 999)) | ((3650 <= df['sic']) & (df['sic'] <= 3651)) | + ((3652 <= df['sic']) & (df['sic'] <= 3652)) | ((3732 <= df['sic']) & (df['sic'] <= 3732)) | + ((3930 <= df['sic']) & (df['sic'] <= 3931)) | ((3940 <= df['sic']) & (df['sic'] <= 3949)), + ((7800 <= df['sic']) & (df['sic'] <= 7829)) | ((7830 <= df['sic']) & (df['sic'] <= 7833)) | + ((7840 <= df['sic']) & (df['sic'] <= 7841)) | ((7900 <= df['sic']) & (df['sic'] <= 7900)) | + ((7910 <= df['sic']) & (df['sic'] <= 7911)) | ((7920 <= df['sic']) & (df['sic'] <= 7929)) | + ((7930 <= df['sic']) & (df['sic'] <= 7933)) | ((7940 <= df['sic']) & (df['sic'] <= 7949)) | + ((7980 <= df['sic']) & (df['sic'] <= 7980)) | ((7990 <= df['sic']) & (df['sic'] <= 7999)), + ((2700 <= df['sic']) & (df['sic'] <= 2709)) | ((2710 <= df['sic']) & (df['sic'] <= 2719)) | + ((2720 <= df['sic']) & (df['sic'] <= 2729)) | ((2730 <= df['sic']) & (df['sic'] <= 2739)) | + ((2740 <= df['sic']) & (df['sic'] <= 2749)) | ((2770 <= df['sic']) & (df['sic'] <= 2771)) | + ((2780 <= df['sic']) & (df['sic'] <= 2789)) | ((2790 <= df['sic']) & (df['sic'] <= 2799)), + ((2047 <= df['sic']) & (df['sic'] <= 2047)) | ((2391 <= df['sic']) & (df['sic'] <= 2392)) | + ((2510 <= df['sic']) & (df['sic'] <= 2519)) | ((2590 <= df['sic']) & (df['sic'] <= 2599)) | + ((2840 <= df['sic']) & (df['sic'] <= 2843)) | ((2844 <= df['sic']) & (df['sic'] <= 2844)) | + ((3160 <= df['sic']) & (df['sic'] <= 3161)) | ((3170 <= df['sic']) & (df['sic'] <= 3171)) | + ((3172 <= df['sic']) & (df['sic'] <= 3172)) | ((3190 
<= df['sic']) & (df['sic'] <= 3199)) | + ((3229 <= df['sic']) & (df['sic'] <= 3229)) | ((3260 <= df['sic']) & (df['sic'] <= 3260)) | + ((3262 <= df['sic']) & (df['sic'] <= 3263)) | ((3269 <= df['sic']) & (df['sic'] <= 3269)) | + ((3230 <= df['sic']) & (df['sic'] <= 3231)) | ((3630 <= df['sic']) & (df['sic'] <= 3639)) | + ((3750 <= df['sic']) & (df['sic'] <= 3751)) | ((3800 <= df['sic']) & (df['sic'] <= 3800)) | + ((3860 <= df['sic']) & (df['sic'] <= 3861)) | ((3870 <= df['sic']) & (df['sic'] <= 3873)) | + ((3910 <= df['sic']) & (df['sic'] <= 3911)) | ((3914 <= df['sic']) & (df['sic'] <= 3914)) | + ((3915 <= df['sic']) & (df['sic'] <= 3915)) | ((3960 <= df['sic']) & (df['sic'] <= 3962)) | + ((3991 <= df['sic']) & (df['sic'] <= 3991)) | ((3995 <= df['sic']) & (df['sic'] <= 3995)), + ((2300 <= df['sic']) & (df['sic'] <= 2390)) | ((3020 <= df['sic']) & (df['sic'] <= 3021)) | + ((3100 <= df['sic']) & (df['sic'] <= 3111)) | ((3130 <= df['sic']) & (df['sic'] <= 3131)) | + ((3140 <= df['sic']) & (df['sic'] <= 3149)) | ((3150 <= df['sic']) & (df['sic'] <= 3151)) | + ((3963 <= df['sic']) & (df['sic'] <= 3965)), + ((8000 <= df['sic']) & (df['sic'] <= 8099)), + ((3693 <= df['sic']) & (df['sic'] <= 3693)) | ((3840 <= df['sic']) & (df['sic'] <= 3849)) | + ((3850 <= df['sic']) & (df['sic'] <= 3851)), + ((2830 <= df['sic']) & (df['sic'] <= 2830)) | ((2831 <= df['sic']) & (df['sic'] <= 2831)) | + ((2833 <= df['sic']) & (df['sic'] <= 2833)) | ((2834 <= df['sic']) & (df['sic'] <= 2834)) | + ((2835 <= df['sic']) & (df['sic'] <= 2835)) | ((2836 <= df['sic']) & (df['sic'] <= 2836)), + ((2800 <= df['sic']) & (df['sic'] <= 2809)) | ((2810 <= df['sic']) & (df['sic'] <= 2819)) | + ((2820 <= df['sic']) & (df['sic'] <= 2829)) | ((2850 <= df['sic']) & (df['sic'] <= 2859)) | + ((2860 <= df['sic']) & (df['sic'] <= 2869)) | ((2870 <= df['sic']) & (df['sic'] <= 2879)) | + ((2890 <= df['sic']) & (df['sic'] <= 2899)), + ((3031 <= df['sic']) & (df['sic'] <= 3031)) | ((3041 <= df['sic']) & (df['sic'] 
<= 3041)) | + ((3050 <= df['sic']) & (df['sic'] <= 3053)) | ((3060 <= df['sic']) & (df['sic'] <= 3069)) | + ((3070 <= df['sic']) & (df['sic'] <= 3079)) | ((3080 <= df['sic']) & (df['sic'] <= 3089)) | + ((3090 <= df['sic']) & (df['sic'] <= 3099)), + ((2200 <= df['sic']) & (df['sic'] <= 2269)) | ((2270 <= df['sic']) & (df['sic'] <= 2279)) | + ((2280 <= df['sic']) & (df['sic'] <= 2284)) | ((2290 <= df['sic']) & (df['sic'] <= 2295)) | + ((2297 <= df['sic']) & (df['sic'] <= 2297)) | ((2298 <= df['sic']) & (df['sic'] <= 2298)) | + ((2299 <= df['sic']) & (df['sic'] <= 2299)) | ((2393 <= df['sic']) & (df['sic'] <= 2395)) | + ((2397 <= df['sic']) & (df['sic'] <= 2399)), + ((800 <= df['sic']) & (df['sic'] <= 899)) | ((2400 <= df['sic']) & (df['sic'] <= 2439)) | + ((2450 <= df['sic']) & (df['sic'] <= 2459)) | ((2490 <= df['sic']) & (df['sic'] <= 2499)) | + ((2660 <= df['sic']) & (df['sic'] <= 2661)) | ((2950 <= df['sic']) & (df['sic'] <= 2952)) | + ((3200 <= df['sic']) & (df['sic'] <= 3200)) | ((3210 <= df['sic']) & (df['sic'] <= 3211)) | + ((3240 <= df['sic']) & (df['sic'] <= 3241)) | ((3250 <= df['sic']) & (df['sic'] <= 3259)) | + ((3261 <= df['sic']) & (df['sic'] <= 3261)) | ((3264 <= df['sic']) & (df['sic'] <= 3264)) | + ((3270 <= df['sic']) & (df['sic'] <= 3275)) | ((3280 <= df['sic']) & (df['sic'] <= 3281)) | + ((3290 <= df['sic']) & (df['sic'] <= 3293)) | ((3295 <= df['sic']) & (df['sic'] <= 3299)) | + ((3420 <= df['sic']) & (df['sic'] <= 3429)) | ((3430 <= df['sic']) & (df['sic'] <= 3433)) | + ((3440 <= df['sic']) & (df['sic'] <= 3441)) | ((3442 <= df['sic']) & (df['sic'] <= 3442)) | + ((3446 <= df['sic']) & (df['sic'] <= 3446)) | ((3448 <= df['sic']) & (df['sic'] <= 3448)) | + ((3449 <= df['sic']) & (df['sic'] <= 3449)) | ((3450 <= df['sic']) & (df['sic'] <= 3451)) | + ((3452 <= df['sic']) & (df['sic'] <= 3452)) | ((3490 <= df['sic']) & (df['sic'] <= 3499)) | + ((3996 <= df['sic']) & (df['sic'] <= 3996)), + ((1500 <= df['sic']) & (df['sic'] <= 1511)) | ((1520 <= 
df['sic']) & (df['sic'] <= 1529)) | + ((1530 <= df['sic']) & (df['sic'] <= 1539)) | ((1540 <= df['sic']) & (df['sic'] <= 1549)) | + ((1600 <= df['sic']) & (df['sic'] <= 1699)) | ((1700 <= df['sic']) & (df['sic'] <= 1799)), + ((3300 <= df['sic']) & (df['sic'] <= 3300)) | ((3310 <= df['sic']) & (df['sic'] <= 3317)) | + ((3320 <= df['sic']) & (df['sic'] <= 3325)) | ((3330 <= df['sic']) & (df['sic'] <= 3339)) | + ((3340 <= df['sic']) & (df['sic'] <= 3341)) | ((3350 <= df['sic']) & (df['sic'] <= 3357)) | + ((3360 <= df['sic']) & (df['sic'] <= 3369)) | ((3370 <= df['sic']) & (df['sic'] <= 3379)) | + ((3390 <= df['sic']) & (df['sic'] <= 3399)), + ((3400 <= df['sic']) & (df['sic'] <= 3400)) | ((3443 <= df['sic']) & (df['sic'] <= 3443)) | + ((3444 <= df['sic']) & (df['sic'] <= 3444)) | ((3460 <= df['sic']) & (df['sic'] <= 3469)) | + ((3470 <= df['sic']) & (df['sic'] <= 3479)), + ((3510 <= df['sic']) & (df['sic'] <= 3519)) | ((3520 <= df['sic']) & (df['sic'] <= 3529)) | + ((3530 <= df['sic']) & (df['sic'] <= 3530)) | ((3531 <= df['sic']) & (df['sic'] <= 3531)) | + ((3532 <= df['sic']) & (df['sic'] <= 3532)) | ((3533 <= df['sic']) & (df['sic'] <= 3533)) | + ((3534 <= df['sic']) & (df['sic'] <= 3534)) | ((3535 <= df['sic']) & (df['sic'] <= 3535)) | + ((3536 <= df['sic']) & (df['sic'] <= 3536)) | ((3538 <= df['sic']) & (df['sic'] <= 3538)) | + ((3540 <= df['sic']) & (df['sic'] <= 3549)) | ((3550 <= df['sic']) & (df['sic'] <= 3559)) | + ((3560 <= df['sic']) & (df['sic'] <= 3569)) | ((3580 <= df['sic']) & (df['sic'] <= 3580)) | + ((3581 <= df['sic']) & (df['sic'] <= 3581)) | ((3582 <= df['sic']) & (df['sic'] <= 3582)) | + ((3585 <= df['sic']) & (df['sic'] <= 3585)) | ((3586 <= df['sic']) & (df['sic'] <= 3586)) | + ((3589 <= df['sic']) & (df['sic'] <= 3589)) | ((3590 <= df['sic']) & (df['sic'] <= 3599)), + ((3600 <= df['sic']) & (df['sic'] <= 3600)) | ((3610 <= df['sic']) & (df['sic'] <= 3613)) | + ((3620 <= df['sic']) & (df['sic'] <= 3621)) | ((3623 <= df['sic']) & (df['sic'] <= 
3629)) | + ((3640 <= df['sic']) & (df['sic'] <= 3644)) | ((3645 <= df['sic']) & (df['sic'] <= 3645)) | + ((3646 <= df['sic']) & (df['sic'] <= 3646)) | ((3648 <= df['sic']) & (df['sic'] <= 3649)) | + ((3660 <= df['sic']) & (df['sic'] <= 3660)) | ((3690 <= df['sic']) & (df['sic'] <= 3690)) | + ((3691 <= df['sic']) & (df['sic'] <= 3692)) | ((3699 <= df['sic']) & (df['sic'] <= 3699)), + ((2296 <= df['sic']) & (df['sic'] <= 2296)) | ((2396 <= df['sic']) & (df['sic'] <= 2396)) | + ((3010 <= df['sic']) & (df['sic'] <= 3011)) | ((3537 <= df['sic']) & (df['sic'] <= 3537)) | + ((3647 <= df['sic']) & (df['sic'] <= 3647)) | ((3694 <= df['sic']) & (df['sic'] <= 3694)) | + ((3700 <= df['sic']) & (df['sic'] <= 3700)) | ((3710 <= df['sic']) & (df['sic'] <= 3710)) | + ((3711 <= df['sic']) & (df['sic'] <= 3711)) | ((3713 <= df['sic']) & (df['sic'] <= 3713)) | + ((3714 <= df['sic']) & (df['sic'] <= 3714)) | ((3715 <= df['sic']) & (df['sic'] <= 3715)) | + ((3716 <= df['sic']) & (df['sic'] <= 3716)) | ((3792 <= df['sic']) & (df['sic'] <= 3792)) | + ((3790 <= df['sic']) & (df['sic'] <= 3791)) | ((3799 <= df['sic']) & (df['sic'] <= 3799)), + ((3720 <= df['sic']) & (df['sic'] <= 3720)) | ((3721 <= df['sic']) & (df['sic'] <= 3721)) | + ((3723 <= df['sic']) & (df['sic'] <= 3724)) | ((3725 <= df['sic']) & (df['sic'] <= 3725)) | + ((3728 <= df['sic']) & (df['sic'] <= 3729)), + ((3730 <= df['sic']) & (df['sic'] <= 3731)) | ((3740 <= df['sic']) & (df['sic'] <= 3743)), + ((3760 <= df['sic']) & (df['sic'] <= 3769)) | ((3795 <= df['sic']) & (df['sic'] <= 3795)) | + ((3480 <= df['sic']) & (df['sic'] <= 3489)), + ((1040 <= df['sic']) & (df['sic'] <= 1049)), + ((1000 <= df['sic']) & (df['sic'] <= 1009)) | ((1010 <= df['sic']) & (df['sic'] <= 1019)) | + ((1020 <= df['sic']) & (df['sic'] <= 1029)) | ((1030 <= df['sic']) & (df['sic'] <= 1039)) | + ((1050 <= df['sic']) & (df['sic'] <= 1059)) | ((1060 <= df['sic']) & (df['sic'] <= 1069)) | + ((1070 <= df['sic']) & (df['sic'] <= 1079)) | ((1080 <= 
df['sic']) & (df['sic'] <= 1089)) | + ((1090 <= df['sic']) & (df['sic'] <= 1099)) | ((1100 <= df['sic']) & (df['sic'] <= 1119)) | + ((1400 <= df['sic']) & (df['sic'] <= 1499)), + ((1200 <= df['sic']) & (df['sic'] <= 1299)), + ((1300 <= df['sic']) & (df['sic'] <= 1300)) | ((1310 <= df['sic']) & (df['sic'] <= 1319)) | + ((1320 <= df['sic']) & (df['sic'] <= 1329)) | ((1330 <= df['sic']) & (df['sic'] <= 1339)) | + ((1370 <= df['sic']) & (df['sic'] <= 1379)) | ((1380 <= df['sic']) & (df['sic'] <= 1380)) | + ((1381 <= df['sic']) & (df['sic'] <= 1381)) | ((1382 <= df['sic']) & (df['sic'] <= 1382)) | + ((1389 <= df['sic']) & (df['sic'] <= 1389)) | ((2900 <= df['sic']) & (df['sic'] <= 2912)) | + ((2990 <= df['sic']) & (df['sic'] <= 2999)), + ((4900 <= df['sic']) & (df['sic'] <= 4900)) | ((4910 <= df['sic']) & (df['sic'] <= 4911)) | + ((4920 <= df['sic']) & (df['sic'] <= 4922)) | ((4923 <= df['sic']) & (df['sic'] <= 4923)) | + ((4924 <= df['sic']) & (df['sic'] <= 4925)) | ((4930 <= df['sic']) & (df['sic'] <= 4931)) | + ((4932 <= df['sic']) & (df['sic'] <= 4932)) | ((4939 <= df['sic']) & (df['sic'] <= 4939)) | + ((4940 <= df['sic']) & (df['sic'] <= 4942)), + ((4800 <= df['sic']) & (df['sic'] <= 4800)) | ((4810 <= df['sic']) & (df['sic'] <= 4813)) | + ((4820 <= df['sic']) & (df['sic'] <= 4822)) | ((4830 <= df['sic']) & (df['sic'] <= 4839)) | + ((4840 <= df['sic']) & (df['sic'] <= 4841)) | ((4880 <= df['sic']) & (df['sic'] <= 4889)) | + ((4890 <= df['sic']) & (df['sic'] <= 4890)) | ((4891 <= df['sic']) & (df['sic'] <= 4891)) | + ((4892 <= df['sic']) & (df['sic'] <= 4892)) | ((4899 <= df['sic']) & (df['sic'] <= 4899)), + ((7020 <= df['sic']) & (df['sic'] <= 7021)) | ((7030 <= df['sic']) & (df['sic'] <= 7033)) | + ((7200 <= df['sic']) & (df['sic'] <= 7200)) | ((7210 <= df['sic']) & (df['sic'] <= 7212)) | + ((7214 <= df['sic']) & (df['sic'] <= 7214)) | ((7215 <= df['sic']) & (df['sic'] <= 7216)) | + ((7217 <= df['sic']) & (df['sic'] <= 7217)) | ((7219 <= df['sic']) & (df['sic'] <= 
7219)) | + ((7220 <= df['sic']) & (df['sic'] <= 7221)) | ((7230 <= df['sic']) & (df['sic'] <= 7231)) | + ((7240 <= df['sic']) & (df['sic'] <= 7241)) | ((7250 <= df['sic']) & (df['sic'] <= 7251)) | + ((7260 <= df['sic']) & (df['sic'] <= 7269)) | ((7270 <= df['sic']) & (df['sic'] <= 7290)) | + ((7291 <= df['sic']) & (df['sic'] <= 7291)) | ((7292 <= df['sic']) & (df['sic'] <= 7299)) | + ((7395 <= df['sic']) & (df['sic'] <= 7395)) | ((7500 <= df['sic']) & (df['sic'] <= 7500)) | + ((7520 <= df['sic']) & (df['sic'] <= 7529)) | ((7530 <= df['sic']) & (df['sic'] <= 7539)) | + ((7540 <= df['sic']) & (df['sic'] <= 7549)) | ((7600 <= df['sic']) & (df['sic'] <= 7600)) | + ((7620 <= df['sic']) & (df['sic'] <= 7620)) | ((7622 <= df['sic']) & (df['sic'] <= 7622)) | + ((7623 <= df['sic']) & (df['sic'] <= 7623)) | ((7629 <= df['sic']) & (df['sic'] <= 7629)) | + ((7630 <= df['sic']) & (df['sic'] <= 7631)) | ((7640 <= df['sic']) & (df['sic'] <= 7641)) | + ((7690 <= df['sic']) & (df['sic'] <= 7699)) | ((8100 <= df['sic']) & (df['sic'] <= 8199)) | + ((8200 <= df['sic']) & (df['sic'] <= 8299)) | ((8300 <= df['sic']) & (df['sic'] <= 8399)) | + ((8400 <= df['sic']) & (df['sic'] <= 8499)) | ((8600 <= df['sic']) & (df['sic'] <= 8699)) | + ((8800 <= df['sic']) & (df['sic'] <= 8899)) | ((7510 <= df['sic']) & (df['sic'] <= 7515)), + ((2750 <= df['sic']) & (df['sic'] <= 2759)) | ((3993 <= df['sic']) & (df['sic'] <= 3993)) | + ((7218 <= df['sic']) & (df['sic'] <= 7218)) | ((7300 <= df['sic']) & (df['sic'] <= 7300)) | + ((7310 <= df['sic']) & (df['sic'] <= 7319)) | ((7320 <= df['sic']) & (df['sic'] <= 7329)) | + ((7330 <= df['sic']) & (df['sic'] <= 7339)) | ((7340 <= df['sic']) & (df['sic'] <= 7342)) | + ((7349 <= df['sic']) & (df['sic'] <= 7349)) | ((7350 <= df['sic']) & (df['sic'] <= 7351)) | + ((7352 <= df['sic']) & (df['sic'] <= 7352)) | ((7353 <= df['sic']) & (df['sic'] <= 7353)) | + ((7359 <= df['sic']) & (df['sic'] <= 7359)) | ((7360 <= df['sic']) & (df['sic'] <= 7369)) | + ((7374 <= 
df['sic']) & (df['sic'] <= 7374)) | ((7376 <= df['sic']) & (df['sic'] <= 7376)) | + ((7377 <= df['sic']) & (df['sic'] <= 7377)) | ((7378 <= df['sic']) & (df['sic'] <= 7378)) | + ((7379 <= df['sic']) & (df['sic'] <= 7379)) | ((7380 <= df['sic']) & (df['sic'] <= 7380)) | + ((7381 <= df['sic']) & (df['sic'] <= 7382)) | ((7383 <= df['sic']) & (df['sic'] <= 7383)) | + ((7384 <= df['sic']) & (df['sic'] <= 7384)) | ((7385 <= df['sic']) & (df['sic'] <= 7385)) | + ((7389 <= df['sic']) & (df['sic'] <= 7390)) | ((7391 <= df['sic']) & (df['sic'] <= 7391)) | + ((7392 <= df['sic']) & (df['sic'] <= 7392)) | ((7393 <= df['sic']) & (df['sic'] <= 7393)) | + ((7394 <= df['sic']) & (df['sic'] <= 7394)) | ((7396 <= df['sic']) & (df['sic'] <= 7396)) | + ((7397 <= df['sic']) & (df['sic'] <= 7397)) | ((7399 <= df['sic']) & (df['sic'] <= 7399)) | + ((7519 <= df['sic']) & (df['sic'] <= 7519)) | ((8700 <= df['sic']) & (df['sic'] <= 8700)) | + ((8710 <= df['sic']) & (df['sic'] <= 8713)) | ((8720 <= df['sic']) & (df['sic'] <= 8721)) | + ((8730 <= df['sic']) & (df['sic'] <= 8734)) | ((8740 <= df['sic']) & (df['sic'] <= 8748)) | + ((8900 <= df['sic']) & (df['sic'] <= 8910)) | ((8911 <= df['sic']) & (df['sic'] <= 8911)) | + ((8920 <= df['sic']) & (df['sic'] <= 8999)) | ((4220 <= df['sic']) & (df['sic'] <= 4229)), + ((3570 <= df['sic']) & (df['sic'] <= 3579)) | ((3680 <= df['sic']) & (df['sic'] <= 3680)) | + ((3681 <= df['sic']) & (df['sic'] <= 3681)) | ((3682 <= df['sic']) & (df['sic'] <= 3682)) | + ((3683 <= df['sic']) & (df['sic'] <= 3683)) | ((3684 <= df['sic']) & (df['sic'] <= 3684)) | + ((3685 <= df['sic']) & (df['sic'] <= 3685)) | ((3686 <= df['sic']) & (df['sic'] <= 3686)) | + ((3687 <= df['sic']) & (df['sic'] <= 3687)) | ((3688 <= df['sic']) & (df['sic'] <= 3688)) | + ((3689 <= df['sic']) & (df['sic'] <= 3689)) | ((3695 <= df['sic']) & (df['sic'] <= 3695)), + ((7370 <= df['sic']) & (df['sic'] <= 7372)) | ((7375 <= df['sic']) & (df['sic'] <= 7375)) | + ((7373 <= df['sic']) & (df['sic'] <= 
7373)), + ((3622 <= df['sic']) & (df['sic'] <= 3622)) | ((3661 <= df['sic']) & (df['sic'] <= 3661)) | + ((3662 <= df['sic']) & (df['sic'] <= 3662)) | ((3663 <= df['sic']) & (df['sic'] <= 3663)) | + ((3664 <= df['sic']) & (df['sic'] <= 3664)) | ((3665 <= df['sic']) & (df['sic'] <= 3665)) | + ((3666 <= df['sic']) & (df['sic'] <= 3666)) | ((3669 <= df['sic']) & (df['sic'] <= 3669)) | + ((3670 <= df['sic']) & (df['sic'] <= 3679)) | ((3810 <= df['sic']) & (df['sic'] <= 3810)) | + ((3812 <= df['sic']) & (df['sic'] <= 3812)), + ((3811 <= df['sic']) & (df['sic'] <= 3811)) | ((3820 <= df['sic']) & (df['sic'] <= 3820)) | + ((3821 <= df['sic']) & (df['sic'] <= 3821)) | ((3822 <= df['sic']) & (df['sic'] <= 3822)) | + ((3823 <= df['sic']) & (df['sic'] <= 3823)) | ((3824 <= df['sic']) & (df['sic'] <= 3824)) | + ((3825 <= df['sic']) & (df['sic'] <= 3825)) | ((3826 <= df['sic']) & (df['sic'] <= 3826)) | + ((3827 <= df['sic']) & (df['sic'] <= 3827)) | ((3829 <= df['sic']) & (df['sic'] <= 3829)) | + ((3830 <= df['sic']) & (df['sic'] <= 3839)), + ((2520 <= df['sic']) & (df['sic'] <= 2549)) | ((2600 <= df['sic']) & (df['sic'] <= 2639)) | + ((2670 <= df['sic']) & (df['sic'] <= 2699)) | ((2760 <= df['sic']) & (df['sic'] <= 2761)) | + ((3950 <= df['sic']) & (df['sic'] <= 3955)), + ((2440 <= df['sic']) & (df['sic'] <= 2449)) | ((2640 <= df['sic']) & (df['sic'] <= 2659)) | + ((3220 <= df['sic']) & (df['sic'] <= 3221)) | ((3410 <= df['sic']) & (df['sic'] <= 3412)), + ((4000 <= df['sic']) & (df['sic'] <= 4013)) | ((4040 <= df['sic']) & (df['sic'] <= 4049)) | + ((4100 <= df['sic']) & (df['sic'] <= 4100)) | ((4110 <= df['sic']) & (df['sic'] <= 4119)) | + ((4120 <= df['sic']) & (df['sic'] <= 4121)) | ((4130 <= df['sic']) & (df['sic'] <= 4131)) | + ((4140 <= df['sic']) & (df['sic'] <= 4142)) | ((4150 <= df['sic']) & (df['sic'] <= 4151)) | + ((4170 <= df['sic']) & (df['sic'] <= 4173)) | ((4190 <= df['sic']) & (df['sic'] <= 4199)) | + ((4200 <= df['sic']) & (df['sic'] <= 4200)) | ((4210 <= 
df['sic']) & (df['sic'] <= 4219)) | + ((4230 <= df['sic']) & (df['sic'] <= 4231)) | ((4240 <= df['sic']) & (df['sic'] <= 4249)) | + ((4400 <= df['sic']) & (df['sic'] <= 4499)) | ((4500 <= df['sic']) & (df['sic'] <= 4599)) | + ((4600 <= df['sic']) & (df['sic'] <= 4699)) | ((4700 <= df['sic']) & (df['sic'] <= 4700)) | + ((4710 <= df['sic']) & (df['sic'] <= 4712)) | ((4720 <= df['sic']) & (df['sic'] <= 4729)) | + ((4730 <= df['sic']) & (df['sic'] <= 4739)) | ((4740 <= df['sic']) & (df['sic'] <= 4749)) | + ((4780 <= df['sic']) & (df['sic'] <= 4780)) | ((4782 <= df['sic']) & (df['sic'] <= 4782)) | + ((4783 <= df['sic']) & (df['sic'] <= 4783)) | ((4784 <= df['sic']) & (df['sic'] <= 4784)) | + ((4785 <= df['sic']) & (df['sic'] <= 4785)) | ((4789 <= df['sic']) & (df['sic'] <= 4789)), + ((5000 <= df['sic']) & (df['sic'] <= 5000)) | ((5010 <= df['sic']) & (df['sic'] <= 5015)) | + ((5020 <= df['sic']) & (df['sic'] <= 5023)) | ((5030 <= df['sic']) & (df['sic'] <= 5039)) | + ((5040 <= df['sic']) & (df['sic'] <= 5042)) | ((5043 <= df['sic']) & (df['sic'] <= 5043)) | + ((5044 <= df['sic']) & (df['sic'] <= 5044)) | ((5045 <= df['sic']) & (df['sic'] <= 5045)) | + ((5046 <= df['sic']) & (df['sic'] <= 5046)) | ((5047 <= df['sic']) & (df['sic'] <= 5047)) | + ((5048 <= df['sic']) & (df['sic'] <= 5048)) | ((5049 <= df['sic']) & (df['sic'] <= 5049)) | + ((5050 <= df['sic']) & (df['sic'] <= 5059)) | ((5060 <= df['sic']) & (df['sic'] <= 5060)) | + ((5063 <= df['sic']) & (df['sic'] <= 5063)) | ((5064 <= df['sic']) & (df['sic'] <= 5064)) | + ((5065 <= df['sic']) & (df['sic'] <= 5065)) | ((5070 <= df['sic']) & (df['sic'] <= 5078)) | + ((5080 <= df['sic']) & (df['sic'] <= 5080)) | ((5081 <= df['sic']) & (df['sic'] <= 5081)) | + ((5082 <= df['sic']) & (df['sic'] <= 5082)) | ((5083 <= df['sic']) & (df['sic'] <= 5083)) | + ((5084 <= df['sic']) & (df['sic'] <= 5084)) | ((5085 <= df['sic']) & (df['sic'] <= 5085)) | + ((5086 <= df['sic']) & (df['sic'] <= 5087)) | ((5088 <= df['sic']) & (df['sic'] <= 
5088)) | + ((5090 <= df['sic']) & (df['sic'] <= 5090)) | ((5091 <= df['sic']) & (df['sic'] <= 5092)) | + ((5093 <= df['sic']) & (df['sic'] <= 5093)) | ((5094 <= df['sic']) & (df['sic'] <= 5094)) | + ((5099 <= df['sic']) & (df['sic'] <= 5099)) | ((5100 <= df['sic']) & (df['sic'] <= 5100)) | + ((5110 <= df['sic']) & (df['sic'] <= 5113)) | ((5120 <= df['sic']) & (df['sic'] <= 5122)) | + ((5130 <= df['sic']) & (df['sic'] <= 5139)) | ((5140 <= df['sic']) & (df['sic'] <= 5149)) | + ((5150 <= df['sic']) & (df['sic'] <= 5159)) | ((5160 <= df['sic']) & (df['sic'] <= 5169)) | + ((5170 <= df['sic']) & (df['sic'] <= 5172)) | ((5180 <= df['sic']) & (df['sic'] <= 5182)) | + ((5190 <= df['sic']) & (df['sic'] <= 5199)), + ((5200 <= df['sic']) & (df['sic'] <= 5200)) | ((5210 <= df['sic']) & (df['sic'] <= 5219)) | + ((5220 <= df['sic']) & (df['sic'] <= 5229)) | ((5230 <= df['sic']) & (df['sic'] <= 5231)) | + ((5250 <= df['sic']) & (df['sic'] <= 5251)) | ((5260 <= df['sic']) & (df['sic'] <= 5261)) | + ((5270 <= df['sic']) & (df['sic'] <= 5271)) | ((5300 <= df['sic']) & (df['sic'] <= 5300)) | + ((5310 <= df['sic']) & (df['sic'] <= 5311)) | ((5320 <= df['sic']) & (df['sic'] <= 5320)) | + ((5330 <= df['sic']) & (df['sic'] <= 5331)) | ((5334 <= df['sic']) & (df['sic'] <= 5334)) | + ((5340 <= df['sic']) & (df['sic'] <= 5349)) | ((5390 <= df['sic']) & (df['sic'] <= 5399)) | + ((5400 <= df['sic']) & (df['sic'] <= 5400)) | ((5410 <= df['sic']) & (df['sic'] <= 5411)) | + ((5412 <= df['sic']) & (df['sic'] <= 5412)) | ((5420 <= df['sic']) & (df['sic'] <= 5429)) | + ((5430 <= df['sic']) & (df['sic'] <= 5439)) | ((5440 <= df['sic']) & (df['sic'] <= 5449)) | + ((5450 <= df['sic']) & (df['sic'] <= 5459)) | ((5460 <= df['sic']) & (df['sic'] <= 5469)) | + ((5490 <= df['sic']) & (df['sic'] <= 5499)) | ((5500 <= df['sic']) & (df['sic'] <= 5500)) | + ((5510 <= df['sic']) & (df['sic'] <= 5529)) | ((5530 <= df['sic']) & (df['sic'] <= 5539)) | + ((5540 <= df['sic']) & (df['sic'] <= 5549)) | ((5550 <= 
df['sic']) & (df['sic'] <= 5559)) | + ((5560 <= df['sic']) & (df['sic'] <= 5569)) | ((5570 <= df['sic']) & (df['sic'] <= 5579)) | + ((5590 <= df['sic']) & (df['sic'] <= 5599)) | ((5600 <= df['sic']) & (df['sic'] <= 5699)) | + ((5700 <= df['sic']) & (df['sic'] <= 5700)) | ((5710 <= df['sic']) & (df['sic'] <= 5719)) | + ((5720 <= df['sic']) & (df['sic'] <= 5722)) | ((5730 <= df['sic']) & (df['sic'] <= 5733)) | + ((5734 <= df['sic']) & (df['sic'] <= 5734)) | ((5735 <= df['sic']) & (df['sic'] <= 5735)) | + ((5736 <= df['sic']) & (df['sic'] <= 5736)) | ((5750 <= df['sic']) & (df['sic'] <= 5799)) | + ((5900 <= df['sic']) & (df['sic'] <= 5900)) | ((5910 <= df['sic']) & (df['sic'] <= 5912)) | + ((5920 <= df['sic']) & (df['sic'] <= 5929)) | ((5930 <= df['sic']) & (df['sic'] <= 5932)) | + ((5940 <= df['sic']) & (df['sic'] <= 5940)) | ((5941 <= df['sic']) & (df['sic'] <= 5941)) | + ((5942 <= df['sic']) & (df['sic'] <= 5942)) | ((5943 <= df['sic']) & (df['sic'] <= 5943)) | + ((5944 <= df['sic']) & (df['sic'] <= 5944)) | ((5945 <= df['sic']) & (df['sic'] <= 5945)) | + ((5946 <= df['sic']) & (df['sic'] <= 5946)) | ((5947 <= df['sic']) & (df['sic'] <= 5947)) | + ((5948 <= df['sic']) & (df['sic'] <= 5948)) | ((5949 <= df['sic']) & (df['sic'] <= 5949)) | + ((5950 <= df['sic']) & (df['sic'] <= 5959)) | ((5960 <= df['sic']) & (df['sic'] <= 5969)) | + ((5970 <= df['sic']) & (df['sic'] <= 5979)) | ((5980 <= df['sic']) & (df['sic'] <= 5989)) | + ((5990 <= df['sic']) & (df['sic'] <= 5990)) | ((5992 <= df['sic']) & (df['sic'] <= 5992)) | + ((5993 <= df['sic']) & (df['sic'] <= 5993)) | ((5994 <= df['sic']) & (df['sic'] <= 5994)) | + ((5995 <= df['sic']) & (df['sic'] <= 5995)) | ((5999 <= df['sic']) & (df['sic'] <= 5999)), + ((5800 <= df['sic']) & (df['sic'] <= 5819)) | ((5820 <= df['sic']) & (df['sic'] <= 5829)) | + ((5890 <= df['sic']) & (df['sic'] <= 5899)) | ((7000 <= df['sic']) & (df['sic'] <= 7000)) | + ((7010 <= df['sic']) & (df['sic'] <= 7019)) | ((7040 <= df['sic']) & (df['sic'] <= 
7049)) | + ((7213 <= df['sic']) & (df['sic'] <= 7213)), + ((6000 <= df['sic']) & (df['sic'] <= 6000)) | ((6010 <= df['sic']) & (df['sic'] <= 6019)) | + ((6020 <= df['sic']) & (df['sic'] <= 6020)) | ((6021 <= df['sic']) & (df['sic'] <= 6021)) | + ((6022 <= df['sic']) & (df['sic'] <= 6022)) | ((6023 <= df['sic']) & (df['sic'] <= 6024)) | + ((6025 <= df['sic']) & (df['sic'] <= 6025)) | ((6026 <= df['sic']) & (df['sic'] <= 6026)) | + ((6027 <= df['sic']) & (df['sic'] <= 6027)) | ((6028 <= df['sic']) & (df['sic'] <= 6029)) | + ((6030 <= df['sic']) & (df['sic'] <= 6036)) | ((6040 <= df['sic']) & (df['sic'] <= 6059)) | + ((6060 <= df['sic']) & (df['sic'] <= 6062)) | ((6080 <= df['sic']) & (df['sic'] <= 6082)) | + ((6090 <= df['sic']) & (df['sic'] <= 6099)) | ((6100 <= df['sic']) & (df['sic'] <= 6100)) | + ((6110 <= df['sic']) & (df['sic'] <= 6111)) | ((6112 <= df['sic']) & (df['sic'] <= 6113)) | + ((6120 <= df['sic']) & (df['sic'] <= 6129)) | ((6130 <= df['sic']) & (df['sic'] <= 6139)) | + ((6140 <= df['sic']) & (df['sic'] <= 6149)) | ((6150 <= df['sic']) & (df['sic'] <= 6159)) | + ((6160 <= df['sic']) & (df['sic'] <= 6169)) | ((6170 <= df['sic']) & (df['sic'] <= 6179)) | + ((6190 <= df['sic']) & (df['sic'] <= 6199)), + ((6300 <= df['sic']) & (df['sic'] <= 6300)) | ((6310 <= df['sic']) & (df['sic'] <= 6319)) | + ((6320 <= df['sic']) & (df['sic'] <= 6329)) | ((6330 <= df['sic']) & (df['sic'] <= 6331)) | + ((6350 <= df['sic']) & (df['sic'] <= 6351)) | ((6360 <= df['sic']) & (df['sic'] <= 6361)) | + ((6370 <= df['sic']) & (df['sic'] <= 6379)) | ((6390 <= df['sic']) & (df['sic'] <= 6399)) | + ((6400 <= df['sic']) & (df['sic'] <= 6411)), + ((6500 <= df['sic']) & (df['sic'] <= 6500)) | ((6510 <= df['sic']) & (df['sic'] <= 6510)) | + ((6512 <= df['sic']) & (df['sic'] <= 6512)) | ((6513 <= df['sic']) & (df['sic'] <= 6513)) | + ((6514 <= df['sic']) & (df['sic'] <= 6514)) | ((6515 <= df['sic']) & (df['sic'] <= 6515)) | + ((6517 <= df['sic']) & (df['sic'] <= 6519)) | ((6520 <= 
def fillna_atq(df_q, df_a):
    """Fill NA quarterly characteristics with their annual counterparts.

    For every column that (a) has at least one NA in the quarterly frame and
    (b) also exists in the annual frame, missing quarterly values are replaced
    by the annual value matched on ['permno', 'jdate'].  Momentum columns
    ('mom*') are skipped because they are identical in both frames.

    :param df_q: quarterly characteristics (must contain permno, jdate)
    :param df_a: annual characteristics (must contain permno, jdate)
    :return: df_q with NAs filled from df_a where possible
    """
    q_na_cols = df_q.columns[df_q.isna().any()].tolist()
    a_cols = df_a.columns.values.tolist()
    # columns fillable from the annual data, excluding momentum chars
    na_columns_list = [c for c in set(q_na_cols) & set(a_cols)
                       if re.match(r'mom.', c) is None]
    # pull the annual versions and tag them with an '_a' suffix so the merge
    # does not clash with the raw quarterly columns
    df_temp = df_a[na_columns_list].copy()
    df_temp[['permno', 'jdate']] = df_a[['permno', 'jdate']].copy()
    df_temp = df_temp.rename(columns={c: '%s_a' % c for c in na_columns_list})
    df_temp = df_temp.reset_index(drop=True)
    df_q = pd.merge(df_q, df_temp, how='left', on=['permno', 'jdate'])
    for c in na_columns_list:
        df_q[c] = np.where(df_q[c].isnull(), df_q['%s_a' % c], df_q[c])
        df_q = df_q.drop(['%s_a' % c], axis=1)
    return df_q


def fillna_ind(df, method, ffi):
    """Fill NA characteristics with the industry mean/median per month.

    Groups by ['jdate', 'ffi<ffi>'] (Fama-French industry code) and fills
    each NA column with that group's aggregate.

    :param df: characteristics frame containing 'jdate' and 'ffi<ffi>'
    :param method: 'mean' or 'median'
    :param ffi: industry classification id, e.g. 49 or 10
    :return: df with industry-level fills applied
    :raises ValueError: if method is not 'mean'/'median' (the original fell
        through to a NameError)
    """
    if method not in ('mean', 'median'):
        raise ValueError("method must be 'mean' or 'median', got %r" % method)
    # BUG FIX: the original hardcoded 'ffi49' here regardless of the ffi
    # argument, and rebuilt the group keys by string-parsing a stringified
    # index, which breaks on modern pandas; aggregate + reset_index is robust.
    ind = 'ffi%s' % ffi
    na_columns_list = df.columns[df.isna().any()].tolist()
    grouped = df.groupby(['jdate', ind])[na_columns_list]
    df_fill = grouped.mean() if method == 'mean' else grouped.median()
    # suffix the aggregates so the merge does not clash with the raw columns
    df_fill = df_fill.add_suffix('_%s' % method).reset_index()
    df = pd.merge(df, df_fill, how='left', on=['jdate', ind])
    for c in na_columns_list:
        df[c] = df[c].fillna(df['%s_%s' % (c, method)])
        df = df.drop(['%s_%s' % (c, method)], axis=1)
    return df


def fillna_all(df, method):
    """Fill remaining NA characteristics with the cross-sectional mean/median.

    Same idea as fillna_ind but grouped by 'jdate' only; intended as the
    second pass after the industry-level fill.

    :param df: characteristics frame containing 'jdate'
    :param method: 'mean' or 'median'
    :return: df with month-level fills applied
    :raises ValueError: if method is not 'mean'/'median'
    """
    if method not in ('mean', 'median'):
        raise ValueError("method must be 'mean' or 'median', got %r" % method)
    na_columns_list = df.columns[df.isna().any()].tolist()
    grouped = df.groupby(['jdate'])[na_columns_list]
    df_fill = grouped.mean() if method == 'mean' else grouped.median()
    df_fill = df_fill.add_suffix('_%s' % method).reset_index()
    df = pd.merge(df, df_fill, how='left', on='jdate')
    for c in na_columns_list:
        df[c] = df[c].fillna(df['%s_%s' % (c, method)])
        df = df.drop(['%s_%s' % (c, method)], axis=1)
    return df


def standardize(df):
    """Cross-sectionally rank each characteristic into [-1, 1] per jdate.

    Every non-identifier column is replaced by 'rank_<col>', the within-month
    rank mapped linearly onto [-1, 1].  Remaining NAs are set to 0 before
    ranking, as in the original pipeline.

    :param df: characteristics frame containing 'jdate' and 'gvkey'
    :return: frame with identifier columns plus the rank_* columns
    """
    df_temp = df.groupby(['jdate'], as_index=False)['gvkey'].count()
    df_temp = df_temp.rename(columns={'gvkey': 'count'})
    df = pd.merge(df, df_temp, how='left', on='jdate')
    identifiers = ['permno', 'date', 'jdate', 'datadate', 'gvkey', 'sic',
                   'count', 'exchcd', 'shrcd']
    col_names = list(set(df.columns.values.tolist()).difference(set(identifiers)))
    df = df.fillna(0)  # remaining NAs are treated as zeros before ranking
    for col in col_names:
        ranks = df.groupby(['jdate'])[col].rank()
        # map rank 1..count linearly onto [-1, 1]; a single-stock month
        # yields 0/0 -> NaN, same as the original
        df['rank_%s' % col] = (ranks - 1) / (df['count'] - 1) * 2 - 1
        df = df.drop([col], axis=1)
    return df
# --------------------------------------------------------------------------
# hxz_abr.py (core): cumulative abnormal returns (ABR) over the trading-day
# event window [-2, +1] around the earnings announcement date (rdq).
# --------------------------------------------------------------------------

# merge the CCM link table onto the Compustat quarterly records
ccm1 = pd.merge(comp, ccm, how='left', on=['gvkey'])
ccm1['rdq'] = pd.to_datetime(ccm1['rdq'])

# keep only records inside the link validity window
ccm2 = ccm1[(ccm1['datadate'] >= ccm1['linkdt']) & (ccm1['datadate'] <= ccm1['linkenddt'])]
ccm2 = ccm2[['gvkey', 'datadate', 'rdq', 'fyearq', 'fqtr', 'permno']]

###################
# CRSP Block      #
###################
# rdq may fall on a non-trading day: map it to the first trading day on or
# after rdq, looking ahead at most 5 calendar days.
crsp_dsi = conn.raw_sql("""
                        select distinct date
                        from crsp.dsi
                        where date >= '01/01/1959'
                        """)
crsp_dsi['date'] = pd.to_datetime(crsp_dsi['date'])

comp_temp = ccm2.copy()
for i in range(6):
    # BUG FIX: the original rebuilt comp_temp from ccm2 on every pass, so
    # only the i == 5 candidate ever got matched against the calendar.
    candidate = comp_temp['rdq'] + pd.DateOffset(days=i)
    # keep rdq + i only when it is an actual trading day, else NaT
    comp_temp['trad_%s' % i] = candidate.where(candidate.isin(crsp_dsi['date']))

# back-fill from rdq+5 towards rdq+0 so trad_0 ends up holding the first
# trading day on or after rdq
for i in range(5, 0, -1):
    prev = 'trad_%s' % (i - 1)
    comp_temp[prev] = comp_temp[prev].fillna(comp_temp['trad_%s' % i])
comp_temp['rdq_trad'] = comp_temp['trad_0']

comp_temp = comp_temp[['gvkey', 'permno', 'datadate', 'fyearq', 'fqtr', 'rdq', 'rdq_trad']]

print('=' * 10, 'crsp block is ready', '=' * 10)

#############################
# CRSP abnormal return      #
#############################
crsp_d = conn.raw_sql("""
                      select a.prc, a.ret, a.shrout, a.vol, a.cfacpr, a.cfacshr, a.permno, a.permco, a.date,
                      b.siccd, b.ncusip, b.shrcd, b.exchcd
                      from crsp.dsf as a
                      left join crsp.dsenames as b
                      on a.permno=b.permno
                      and b.namedt<=a.date
                      and a.date<=b.nameendt
                      where a.date >= '01/01/1959'
                      and b.exchcd between 1 and 3
                      and b.shrcd in (10,11)
                      """)

# change variable format to int
crsp_d[['permco', 'permno', 'shrcd', 'exchcd']] = crsp_d[['permco', 'permno', 'shrcd', 'exchcd']].astype(int)

print('=' * 10, 'crsp abnormal return is ready', '=' * 10)

crsp_d['date'] = pd.to_datetime(crsp_d['date'])

# delisting returns
dlret = conn.raw_sql("""
                     select permno, dlret, dlstdt
                     from crsp.dsedelist
                     where dlstdt >= '01/01/1959'
                     """)
dlret.permno = dlret.permno.astype(int)
dlret['dlstdt'] = pd.to_datetime(dlret['dlstdt'])

crsp_d = pd.merge(crsp_d, dlret, how='left', left_on=['permno', 'date'], right_on=['permno', 'dlstdt'])
# compound the delisting return into the daily return where available
crsp_d['retadj'] = np.where(crsp_d['dlret'].notna(),
                            (crsp_d['ret'] + 1) * (crsp_d['dlret'] + 1) - 1,
                            crsp_d['ret'])
crsp_d['meq'] = crsp_d['prc'].abs() * crsp_d['shrout']  # market value of equity
crsp_d = crsp_d.sort_values(by=['date', 'permno', 'meq'])

# S&P 500 daily return as the market benchmark
crspsp500d = conn.raw_sql("""
                          select date, sprtrn
                          from crsp.dsi
                          where date >= '01/01/1959'
                          """)
crspsp500d['date'] = pd.to_datetime(crspsp500d['date'])

# daily abnormal return = delisting-adjusted return minus the market return
crsp_d = pd.merge(crsp_d, crspsp500d, how='left', on='date')
crsp_d['abrd'] = crsp_d['retadj'] - crsp_d['sprtrn']
crsp_d = crsp_d[['date', 'permno', 'ret', 'retadj', 'sprtrn', 'abrd']]

# attach every daily record within [rdq_trad - 10, rdq_trad + 5] calendar days
comp_temp['minus10d'] = comp_temp['rdq_trad'] - pd.Timedelta(days=10)
comp_temp['plus5d'] = comp_temp['rdq_trad'] + pd.Timedelta(days=5)

sql = sqlite3.connect(':memory:')
comp_temp.to_sql('comp_temp', sql, index=False)
crsp_d.to_sql('crsp_d', sql, index=False)

qry = """select a.*, b.date, b.abrd
         from comp_temp a left join crsp_d b
         on a.permno=b.permno
         and a.minus10d<=b.date
         and b.date<=a.plus5d
         order by a.permno, a.rdq_trad, b.date;"""
df = pd.read_sql_query(qry, sql)
df.drop(['plus5d', 'minus10d'], axis=1, inplace=True)

# delete missing returns
df = df[df['abrd'].notna()]

# event-time counter: 0 on the announcement trading day, negative before,
# positive after.
# NOTE(review): this section was corrupted in the patch; reconstructed so the
# window filter below selects trading days -2..+1 -- verify against upstream.
df.sort_values(by=['permno', 'rdq_trad', 'date'], inplace=True)

df_before = df[df['date'] < df['rdq_trad']].copy()
# count backwards: the trading day immediately before the announcement is -1
df_before = df_before.sort_values(by=['permno', 'rdq_trad', 'date'],
                                  ascending=[True, True, False])
df_before['count'] = -(df_before.groupby(['permno', 'rdq_trad'])['date'].cumcount() + 1)

df_after = df[df['date'] >= df['rdq_trad']].copy()
df_after['count'] = df_after.groupby(['permno', 'rdq_trad'])['date'].cumcount()

df = pd.concat([df_before, df_after])

# abr: sum of daily abnormal returns over trading days [-2, +1]
df = df[(df['count'] >= -2) & (df['count'] <= 1)]
df_temp = df.groupby(['permno', 'rdq_trad'])['abrd'].sum().reset_index()
df_temp = df_temp.rename(columns={'abrd': 'abr'})
df = pd.merge(df, df_temp, how='left', on=['permno', 'rdq_trad'], copy=False)
df = df[df['count'] == 1]  # keep the day +1 row as the record of each event
df.rename(columns={'date': 'rdq_plus_1d'}, inplace=True)
df = df[['gvkey', 'permno', 'datadate', 'rdq', 'rdq_plus_1d', 'abr']]

print('=' * 10, 'start populate', '=' * 10)

# populate the quarterly abr to monthly observations
crsp_msf = conn.raw_sql("""
                        select distinct date
                        from crsp.msf
                        where date >= '01/01/1959'
                        """)
# normalize to datetime so both sides serialize identically in sqlite
crsp_msf['date'] = pd.to_datetime(crsp_msf['date'])

df['datadate'] = pd.to_datetime(df['datadate'])
# BUG FIX: np.timedelta64(12, 'M') is not valid datetime64[ns] arithmetic in
# current numpy/pandas; DateOffset gives the intended 12 calendar months.
df['plus12m'] = df['datadate'] + pd.DateOffset(months=12)
df['plus12m'] = df['plus12m'] + MonthEnd(0)

df.to_sql('df', sql, index=False)
crsp_msf.to_sql('crsp_msf', sql, index=False)

qry = """select a.*, b.date
         from df a left join crsp_msf b
         on a.rdq_plus_1d < b.date
         and a.plus12m >= b.date
         order by a.permno, b.date, a.datadate desc;"""
df = pd.read_sql_query(qry, sql)

# per month keep the most recent fiscal quarter (sorted datadate desc above)
df = df.drop_duplicates(['permno', 'date'])
df['datadate'] = pd.to_datetime(df['datadate'])
# ---- hxz_abr.py (tail): final type clean-up and save ----
df['rdq'] = pd.to_datetime(df['rdq'])
df['rdq_plus_1d'] = pd.to_datetime(df['rdq_plus_1d'])
df = df[['gvkey', 'permno', 'datadate', 'rdq', 'rdq_plus_1d', 'abr', 'date']]

with open('abr.pkl', 'wb') as f:
    pkl.dump(df, f)

# ============================================================================
# hxz_re.py
# ============================================================================
# Calculate HSZ Replicating Anomalies
# RE: Revisions in analysts' earnings forecasts

import pandas as pd
import numpy as np
import datetime as dt
import wrds
from dateutil.relativedelta import *
from pandas.tseries.offsets import *
from pandasql import *
import pickle as pkl

###################
# Connect to WRDS #
###################
conn = wrds.Connection()

#########################################################################
# Merge IBES and CRSP via the ICLINK table; attach last month's price   #
#########################################################################
with open('iclink.pkl', 'rb') as f:
    iclink = pkl.load(f)

# NOTE(review): the 'statpers=0' predicate looks suspicious (statpers is a
# statistical-period date in IBES summary files) -- confirm this filter
# against the IBES documentation before relying on the output.
ibes = conn.raw_sql("""
                    select
                    ticker, statpers, meanest, fpedats, anndats_act, curr_act, fpi, medest
                    from ibes.statsum_epsus
                    where
                    /* filtering IBES */
                    statpers=0
                    and CURCODE='USD'
                    and fpi in ('1','2')""")

# keep forecasts with a known consensus and fiscal-period end, in USD
ibes = ibes[(ibes['medest'].notna()) & (ibes['fpedats'].notna())]
ibes = ibes[(ibes['curr_act'] == 'USD') | (ibes['curr_act'].isnull())]
ibes['statpers'] = pd.to_datetime(ibes['statpers'])
ibes['merge_date'] = ibes['statpers'] + MonthEnd(0)

crsp_msf = conn.raw_sql("""
                        select permno, date, prc, cfacpr
                        from crsp.msf
                        """)
crsp_msf['date'] = pd.to_datetime(crsp_msf['date'])
crsp_msf['date'] = crsp_msf['date'] + MonthEnd(0)
crsp_msf['merge_date'] = crsp_msf['date'] + MonthEnd(1)  # use last month's price

ibes_iclink = pd.merge(ibes, iclink, how='left', on='ticker')
ibes_crsp = pd.merge(ibes_iclink, crsp_msf, how='inner', on=['permno', 'merge_date'])
ibes_crsp.sort_values(by=['ticker', 'fpedats', 'statpers'], inplace=True)
ibes_crsp.reset_index(inplace=True, drop=True)

###############################
# Merge last month's forecast #
###############################
# the previous row belongs to the same forecast series only if ticker,
# permno and fiscal-period end all match
same_series = ((ibes_crsp['ticker'] == ibes_crsp['ticker'].shift(1)) &
               (ibes_crsp['permno'] == ibes_crsp['permno'].shift(1)) &
               (ibes_crsp['fpedats'] == ibes_crsp['fpedats'].shift(1)))
# BUG FIX: the original cast the shifted dates to str inside np.where,
# producing a mixed str/NaN object column; .where keeps real NaT/NaN so the
# notna() filter below still works and dtypes stay clean.
ibes_crsp['statpers_last_month'] = ibes_crsp['statpers'].shift(1).where(same_series)
ibes_crsp['meanest_last_month'] = ibes_crsp['meanest'].shift(1).where(same_series)

ibes_crsp.sort_values(by=['ticker', 'permno', 'fpedats', 'statpers'], inplace=True)
ibes_crsp.reset_index(inplace=True, drop=True)

##############################
# Drop empty "last month"    #
# Calculate monthly revision #
##############################
ibes_crsp = ibes_crsp[ibes_crsp['statpers_last_month'].notna()]
ibes_crsp['prc_adj'] = ibes_crsp['prc'] / ibes_crsp['cfacpr']
ibes_crsp = ibes_crsp[ibes_crsp['prc_adj'] > 0]
# revision scaled by the split-adjusted share price
ibes_crsp['monthly_revision'] = (ibes_crsp['meanest'] - ibes_crsp['meanest_last_month']) / ibes_crsp['prc_adj']

ibes_crsp['permno'] = ibes_crsp['permno'].astype(int).astype(str)
ibes_crsp['fpedats'] = ibes_crsp['fpedats'].astype(str)
ibes_crsp['permno_fpedats'] = ibes_crsp['permno'].str.cat(ibes_crsp['fpedats'], sep='-')

ibes_crsp = ibes_crsp.drop_duplicates(['permno_fpedats', 'statpers'])
ibes_crsp['count'] = ibes_crsp.groupby('permno_fpedats').cumcount() + 1

######################
# Calculate RE (CJL) #
######################
# NOTE(review): lags are grouped by permno only while 'count' is per
# permno-fpedats, so lags can cross forecast-period boundaries -- behaviour
# kept as in the original, but worth confirming upstream.
for lag in range(1, 7):
    ibes_crsp['monthly_revision_l%s' % lag] = \
        ibes_crsp.groupby(['permno'])['monthly_revision'].shift(lag)

# RE = average of the past 3-6 monthly revisions, depending on history length;
# skipna=False keeps the original NaN-propagating a+b+c semantics
lag_cols = ['monthly_revision_l%s' % lag for lag in range(1, 7)]
condlist = [ibes_crsp['count'] == 4,
            ibes_crsp['count'] == 5,
            ibes_crsp['count'] == 6,
            ibes_crsp['count'] >= 7]
choicelist = [ibes_crsp[lag_cols[:3]].sum(axis=1, skipna=False) / 3,
              ibes_crsp[lag_cols[:4]].sum(axis=1, skipna=False) / 4,
              ibes_crsp[lag_cols[:5]].sum(axis=1, skipna=False) / 5,
              ibes_crsp[lag_cols[:6]].sum(axis=1, skipna=False) / 6]
ibes_crsp['re'] = np.select(condlist, choicelist, default=np.nan)

ibes_crsp = ibes_crsp[ibes_crsp['count'] >= 4]
ibes_crsp = ibes_crsp.sort_values(by=['ticker', 'statpers', 'fpedats'])
ibes_crsp = ibes_crsp.drop_duplicates(['ticker', 'statpers'])

ibes_crsp = ibes_crsp[['ticker', 'statpers', 'fpedats', 'anndats_act', 'curr_act', 'permno', 're']]
ibes_crsp.rename(columns={'statpers': 'date'}, inplace=True)

with open('re.pkl', 'wb') as f:
    pkl.dump(ibes_crsp, f)

# ============================================================================
# hxz_sue.py (header)
# ============================================================================
# Calculate HSZ Replicating Anomalies
# SUE: Standardized Unexpected Earnings (Earnings surprise)

import pandas as pd
import numpy as np
import datetime as dt
import wrds
from dateutil.relativedelta import *
from pandas.tseries.offsets import *
import sqlite3
import pickle as pkl

###################
# Connect to WRDS #
###################
conn = wrds.Connection()

###################
# Compustat Block #
###################
comp = conn.raw_sql("""
                    select gvkey, datadate, fyearq, fqtr, epspxq, ajexq
                    from comp.fundq
                    where indfmt = 'INDL'
                    and datafmt = 'STD'
                    and popsrc = 'D'
                    and consol = 'C'
                    and datadate >= '01/01/1959'
                    """)
comp['datadate'] = pd.to_datetime(comp['datadate'])

###################
# CCM Block       #
###################
ccm = conn.raw_sql("""
                   select gvkey, lpermno as permno, linktype, linkprim,
                   linkdt, linkenddt
                   from crsp.ccmxpf_linktable
                   where linktype in ('LU', 'LC')
                   """)
ccm['linkdt'] = pd.to_datetime(ccm['linkdt'])
ccm['linkenddt'] = pd.to_datetime(ccm['linkenddt'])
# an open-ended link is treated as valid through today
ccm['linkenddt'] = ccm['linkenddt'].fillna(pd.to_datetime('today'))

ccm1 = pd.merge(comp, ccm, how='left', on=['gvkey'])

# keep only records inside the link validity window
ccm2 = ccm1[(ccm1['datadate'] >= ccm1['linkdt']) & (ccm1['datadate'] <= ccm1['linkenddt'])]
ccm2 = ccm2[['gvkey', 'permno', 'datadate', 'fyearq', 'fqtr', 'epspxq', 'ajexq']]

# split-adjusted EPS before extraordinary items
ccm2['eps'] = ccm2['epspxq'] / ccm2['ajexq']
ccm2.drop_duplicates(['permno', 'datadate'], inplace=True)

ccm2 = ccm2[ccm2['eps'].notna()]
ccm2['count'] = ccm2.groupby('permno').cumcount() + 1
ccm2.sort_values(by=['permno', 'datadate'], inplace=True)

# eps lagged 1..8 quarters
for lag in range(1, 9):
    ccm2['e%s' % lag] = ccm2.groupby(['permno'])['eps'].shift(lag)

# denominator: std of the 6-8 most distant eps lags, requiring at least
# 7 quarters of history
condlist = [ccm2['count'] <= 6,
            ccm2['count'] == 7,
            ccm2['count'] == 8,
            ccm2['count'] >= 9]
choicelist = [np.nan,
              ccm2[['e8', 'e7', 'e6', 'e5', 'e4', 'e3']].std(axis=1),
              ccm2[['e8', 'e7', 'e6', 'e5', 'e4', 'e3', 'e2']].std(axis=1),
              ccm2[['e8', 'e7', 'e6', 'e5', 'e4', 'e3', 'e2', 'e1']].std(axis=1)]
ccm2['sue_std'] = np.select(condlist, choicelist, default=np.nan)

# earnings surprise: eps change vs four quarters ago, scaled by its std
ccm2['sue'] = (ccm2['eps'] - ccm2['e4']) / ccm2['sue_std']

# populate the quarterly sue to monthly observations
crsp_msf = conn.raw_sql("""
                        select distinct date
                        from crsp.msf
                        where date >= '01/01/1959'
                        """)
# normalize to datetime so both sides serialize identically in sqlite
crsp_msf['date'] = pd.to_datetime(crsp_msf['date'])

ccm2['datadate'] = pd.to_datetime(ccm2['datadate'])
# BUG FIX: np.timedelta64(12, 'M') is not valid datetime64[ns] arithmetic in
# current numpy/pandas; DateOffset gives the intended 12 calendar months.
ccm2['plus12m'] = ccm2['datadate'] + pd.DateOffset(months=12)
ccm2['plus12m'] = ccm2['plus12m'] + MonthEnd(0)

# use sqlite3 for the range join, consistent with hxz_abr.py (drops the
# pandasql dependency)
sql = sqlite3.connect(':memory:')
ccm2.to_sql('ccm2', sql, index=False)
crsp_msf.to_sql('crsp_msf', sql, index=False)

qry = """select a.*, b.date
         from ccm2 a left join crsp_msf b
         on a.datadate <= b.date
         and a.plus12m >= b.date
         order by a.permno, b.date, a.datadate desc;"""
df = pd.read_sql_query(qry, sql)

# per month keep the most recent fiscal quarter (sorted datadate desc above)
df = df.drop_duplicates(['permno', 'date'])
df['datadate'] = pd.to_datetime(df['datadate'])
df = df[['gvkey', 'permno', 'datadate', 'date', 'sue']]

with open('sue.pkl', 'wb') as f:
    pkl.dump(df, f)
quality of the link +# Score = 0 (best link) to Score = 6 (worst link) +# +# More explanation on score system: +# - 0: BEST match: using (cusip, cusip dates and company names) +# or (exchange ticker, company names and 6-digit cusip) +# - 1: Cusips and cusip dates match but company names do not match +# - 2: Cusips and company names match but cusip dates do not match +# - 3: Cusips match but cusip dates and company names do not match +# - 4: tickers and 6-digit cusips match but company names do not match +# - 5: tickers and company names match but 6-digit cusips do not match +# - 6: tickers match but company names and 6-digit cusips do not match + +################### +# Connect to WRDS # +################### +conn = wrds.Connection() + +######################### +# Step 1: Link by CUSIP # +######################### + +# 1.1 IBES: Get the list of IBES Tickers for US firms in IBES +_ibes1 = conn.raw_sql(""" + select ticker, cusip, cname, sdates from ibes.id + where usfirm=1 and cusip != '' + """) + +# Create first and last 'start dates' for a given cusip +# Use agg min and max to find the first and last date per group +# then rename to fdate and ldate respectively + +_ibes1_date = _ibes1.groupby(['ticker','cusip']).sdates.agg(['min', 'max'])\ +.reset_index().rename(columns={'min':'fdate', 'max':'ldate'}) + +# merge fdate ldate back to _ibes1 data +_ibes2 = pd.merge(_ibes1, _ibes1_date,how='left', on =['ticker','cusip']) +_ibes2 = _ibes2.sort_values(by=['ticker','cusip','sdates']) + +# keep only the most recent company name +# determined by having sdates = ldate +_ibes2 = _ibes2.loc[_ibes2.sdates == _ibes2.ldate].drop(['sdates'], axis=1) + +# 1.2 CRSP: Get all permno-ncusip combinations +_crsp1 = conn.raw_sql(""" + select permno, ncusip, comnam, namedt, nameenddt + from crsp.stocknames + where ncusip != '' + """) + +# first namedt +_crsp1_fnamedt = _crsp1.groupby(['permno','ncusip']).namedt.min().reset_index() + +# last nameenddt +_crsp1_lnameenddt = 
_crsp1.groupby(['permno','ncusip']).nameenddt.max().reset_index() + +# merge both +_crsp1_dtrange = pd.merge(_crsp1_fnamedt, _crsp1_lnameenddt, \ + on = ['permno','ncusip'], how='inner') + +# replace namedt and nameenddt with the version from the dtrange +_crsp1 = _crsp1.drop(['namedt'],axis=1).rename(columns={'nameenddt':'enddt'}) +_crsp2 = pd.merge(_crsp1, _crsp1_dtrange, on =['permno','ncusip'], how='inner') + +# keep only most recent company name +_crsp2 = _crsp2.loc[_crsp2.enddt ==_crsp2.nameenddt].drop(['enddt'], axis=1) + +# 1.3 Create CUSIP Link Table + +# Link by full cusip, company names and dates +_link1_1 = pd.merge(_ibes2, _crsp2, how='inner', left_on='cusip', right_on='ncusip')\ +.sort_values(['ticker','permno','ldate']) + +# Keep link with most recent company name +_link1_1_tmp = _link1_1.groupby(['ticker','permno']).ldate.max().reset_index() +_link1_2 = pd.merge(_link1_1, _link1_1_tmp, how='inner', on =['ticker', 'permno', 'ldate']) + + +# Calculate name matching ratio using FuzzyWuzzy + +# Note: fuzz ratio = 100 -> match perfectly +# fuzz ratio = 0 -> do not match at all + +# Comment: token_set_ratio is more flexible in matching the strings: +# fuzz.token_set_ratio('AMAZON.COM INC', 'AMAZON COM INC') +# returns value of 100 + +# fuzz.ratio('AMAZON.COM INC', 'AMAZON COM INC') +# returns value of 93 + +_link1_2['name_ratio'] = _link1_2.apply(lambda x: fuzz.token_set_ratio(x.comnam, x.cname), axis=1) + +# Note on parameters: +# The following parameters are chosen to mimic the SAS macro %iclink +# In %iclink, name_dist < 30 is assigned score = 0 +# where name_dist=30 is roughly 90% percentile in total distribution +# and higher name_dist means more different names. 
+# In name_ratio, I mimic this by choosing 10% percentile as cutoff to assign +# score = 0 + +# 10% percentile of the company name distance +name_ratio_p10 = _link1_2.name_ratio.quantile(0.10) + +# Function to assign score for companies matched by: +# full cusip and passing name_ratio +# or meeting date range requirement + +def score1(row): + if (row['fdate']<=row['nameenddt']) & (row['ldate']>=row['namedt']) & (row['name_ratio'] >= name_ratio_p10): + score = 0 + elif (row['fdate']<=row['nameenddt']) & (row['ldate']>=row['namedt']): + score = 1 + elif row['name_ratio'] >= name_ratio_p10: + score = 2 + else: + score = 3 + return score + +# assign size portfolio +_link1_2['score']=_link1_2.apply(score1, axis=1) +_link1_2 = _link1_2[['ticker','permno','cname','comnam','name_ratio','score']] +_link1_2 = _link1_2.drop_duplicates() + +########################## +# Step 2: Link by TICKER # +########################## + +# Find links for the remaining unmatched cases using Exchange Ticker + +# Identify remaining unmatched cases +_nomatch1 = pd.merge(_ibes2[['ticker']], _link1_2[['permno','ticker']], on='ticker', how='left') +_nomatch1 = _nomatch1.loc[_nomatch1.permno.isnull()].drop(['permno'], axis=1).drop_duplicates() + +# Add IBES identifying information + +ibesid = conn.raw_sql(""" select ticker, cname, oftic, sdates, cusip from ibes.id """) +ibesid = ibesid.loc[ibesid.oftic.notna()] + +_nomatch2 = pd.merge(_nomatch1, ibesid, how='inner', on=['ticker']) + +# Create first and last 'start dates' for Exchange Tickers +# Label date range variables and keep only most recent company name + +_nomatch3 = _nomatch2.groupby(['ticker', 'oftic']).sdates.agg(['min', 'max'])\ +.reset_index().rename(columns={'min':'fdate', 'max':'ldate'}) + +_nomatch3 = pd.merge(_nomatch2, _nomatch3, how='left', on=['ticker','oftic']) + +_nomatch3 = _nomatch3.loc[_nomatch3.sdates == _nomatch3.ldate] + +# Get entire list of CRSP stocks with Exchange Ticker information + +_crsp_n1 = conn.raw_sql(""" 
select ticker, comnam, permno, ncusip, namedt, nameenddt + from crsp.stocknames """) + +_crsp_n1 = _crsp_n1.loc[_crsp_n1.ticker.notna()].sort_values(by=['permno','ticker','namedt']) + +# Arrange effective dates for link by Exchange Ticker + +_crsp_n1_namedt = _crsp_n1.groupby(['permno','ticker']).namedt.min().reset_index().rename(columns={'min':'namedt'}) +_crsp_n1_nameenddt = _crsp_n1.groupby(['permno','ticker']).nameenddt.max().reset_index().rename(columns={'max':'nameenddt'}) + +_crsp_n1_dt = pd.merge(_crsp_n1_namedt, _crsp_n1_nameenddt, how = 'inner', on=['permno','ticker']) + +_crsp_n1 = _crsp_n1.rename(columns={'namedt': 'namedt_ind', 'nameenddt':'nameenddt_ind'}) + +_crsp_n2 = pd.merge(_crsp_n1, _crsp_n1_dt, how ='left', on = ['permno','ticker']) + +_crsp_n2 = _crsp_n2.rename(columns={'ticker':'crsp_ticker'}) +_crsp_n2 = _crsp_n2.loc[_crsp_n2.nameenddt_ind == _crsp_n2.nameenddt].drop(['namedt_ind', 'nameenddt_ind'], axis=1) + +# Merge remaining unmatched cases using Exchange Ticker +# Note: Use ticker date ranges as exchange tickers are reused overtime + +_link2_1 = pd.merge(_nomatch3, _crsp_n2, how='inner', left_on=['oftic'], right_on=['crsp_ticker']) +_link2_1 = _link2_1.loc[(_link2_1.ldate>=_link2_1.namedt) & (_link2_1.fdate<=_link2_1.nameenddt)] + + +# Score using company name using 6-digit CUSIP and company name spelling distance +_link2_1['name_ratio'] = _link2_1.apply(lambda x: fuzz.token_set_ratio(x.comnam, x.cname), axis=1) + +_link2_2 = _link2_1 +_link2_2['cusip6'] = _link2_2.apply(lambda x: x.cusip[:6], axis=1) +_link2_2['ncusip6'] = _link2_2.apply(lambda x: x.ncusip[:6], axis=1) + +# Score using company name using 6-digit CUSIP and company name spelling distance + +def score2(row): + if (row['cusip6']==row['ncusip6']) & (row['name_ratio'] >= name_ratio_p10): + score = 0 + elif (row['cusip6']==row['ncusip6']): + score = 4 + elif row['name_ratio'] >= name_ratio_p10: + score = 5 + else: + score = 6 + return score + +# assign size portfolio 
+_link2_2['score']=_link2_2.apply(score2, axis=1) + +# Some companies may have more than one TICKER-PERMNO link +# so re-sort and keep the case (PERMNO & Company name from CRSP) +# that gives the lowest score for each IBES TICKER + +_link2_2 = _link2_2[['ticker','permno','cname','comnam', 'name_ratio', 'score']].sort_values(by=['ticker','score']) +_link2_2_score = _link2_2.groupby(['ticker']).score.min().reset_index() + +_link2_3 = pd.merge(_link2_2, _link2_2_score, how='inner', on=['ticker', 'score']) +_link2_3 = _link2_3[['ticker','permno','cname','comnam','score']].drop_duplicates() + +##################################### +# Step 3: Finalize LInks and Scores # +##################################### +# Combine the output from both linking procedures. Store the output data for future usage + +iclink = _link1_2.append(_link2_3) + +# Storing iclink for other program usage +import pickle as pkl + +with open('iclink.pkl', 'wb') as f: + pkl.dump(iclink, f) \ No newline at end of file diff --git a/pychars/impute_rank_output.py b/pychars/impute_rank_output.py new file mode 100755 index 0000000..5940b64 --- /dev/null +++ b/pychars/impute_rank_output.py @@ -0,0 +1,114 @@ +import pandas as pd +import pickle as pkl +import numpy as np +import wrds +from functions import * + +#################### +# All Stocks # +#################### + +with open('chars_a.pkl', 'rb') as f: + chars_a = pkl.load(f) + +chars_a = chars_a.dropna(subset=['permno']) +chars_a[['permno', 'gvkey']] = chars_a[['permno', 'gvkey']].astype(int) +chars_a['jdate'] = pd.to_datetime(chars_a['jdate']) +chars_a = chars_a.drop_duplicates(['permno', 'jdate']) + +with open('chars_q_raw.pkl', 'rb') as f: + chars_q = pkl.load(f) + +# use annual variables to fill na of quarterly variables +chars_q = fillna_atq(df_q=chars_q, df_a=chars_a) + +# adm is annual variable +adm = chars_a[['permno', 'jdate', 'adm']] +chars_q = pd.merge(chars_q, adm, how='left', on=['permno', 'jdate']) + +# impute missing values, you can 
choose different func form functions, such as ffi49/ffi10 +chars_q_impute = chars_q.copy() +chars_q_impute['sic'] = chars_q_impute['sic'].astype(int) +chars_q_impute['jdate'] = pd.to_datetime(chars_q_impute['jdate']) + +chars_q_impute['ffi49'] = ffi49(chars_q_impute) +chars_q_impute['ffi49'] = chars_q_impute['ffi49'].fillna(49) # we treat na in ffi49 as 'other' +chars_q_impute['ffi49'] = chars_q_impute['ffi49'].astype(int) + +# there are two ways to impute: industrial median or mean +chars_q_impute = fillna_ind(chars_q_impute, method='median', ffi=49) +# we use all stocks' mean or median to fill na that are not filled by value of ffi +chars_q_impute = fillna_all(chars_q_impute, method='median') +chars_q_impute['re'] = chars_q_impute['re'].fillna(0) # re use IBES database, there are lots of missing data + +chars_q_impute['year'] = chars_q_impute['jdate'].dt.year +chars_q_impute = chars_q_impute[chars_q_impute['year'] >= 1972] +chars_q_impute = chars_q_impute.drop(['year'], axis=1) + +with open('chars_q_impute.pkl', 'wb') as f: + pkl.dump(chars_q_impute, f, protocol=4) # BUGFIX: was `chars_impute`, an undefined name (NameError); the frame built above is `chars_q_impute` + +# standardize characteristics +chars_q_rank = standardize(chars_q) +chars_q_rank['year'] = chars_q_rank['jdate'].dt.year +chars_q_rank = chars_q_rank[chars_q_rank['year'] >= 1972] +chars_q_rank = chars_q_rank.drop(['year'], axis=1) + +with open('chars_q_rank.pkl', 'wb') as f: + pkl.dump(chars_q_rank, f, protocol=4) # BUGFIX: was `chars_rank`, an undefined name (NameError); the frame built above is `chars_q_rank` + +#################### +# SP1500 # +#################### +conn = wrds.Connection() + +# prepare S&P 1500 version, gvkeyx for sp600: 030824,for sp400: 024248,for sp500: 000003 +sp1500_index = conn.raw_sql('select * from comp.idxcst_his') +sp1500_index = sp1500_index[(sp1500_index['gvkeyx'] == '000003') | (sp1500_index['gvkeyx'] == '024248') + | (sp1500_index['gvkeyx'] == '030824')] + +sp1500_index = sp1500_index[['gvkey', 'from', 'thru']] +sp1500_index['gvkey'] = sp1500_index['gvkey'].astype(int) +sp1500_index['from'] = pd.to_datetime(sp1500_index['from']) +sp1500_index['thru'] =
pd.to_datetime(sp1500_index['thru']) +sp1500_index['thru'] = sp1500_index['thru'].fillna(pd.to_datetime('today')) + +chars_q = pd.merge(chars_q, sp1500_index, how='left', on=['gvkey']) +sp1500 = chars_q.dropna(subset=['from'], axis=0) +sp1500 = sp1500[(sp1500['jdate'] >= sp1500['from']) & (sp1500['jdate'] <= sp1500['thru'])] +sp1500 = sp1500.drop(['from', 'thru'], axis=1) +sp1500 = sp1500.drop_duplicates(['gvkey', 'jdate']) + +# for test +# test = sp1500.groupby(['jdate'])['gvkey'].nunique() + +# impute missing values, you can choose different func form functions, such as ffi49/ffi10 +sp1500_impute = sp1500.copy() +sp1500_impute['sic'] = sp1500_impute['sic'].astype(int) +sp1500_impute['jdate'] = pd.to_datetime(sp1500_impute['jdate']) + +sp1500_impute['ffi49'] = ffi49(sp1500_impute) +sp1500_impute['ffi49'] = sp1500_impute['ffi49'].fillna(49) # we treat na in ffi49 as 'other' +sp1500_impute['ffi49'] = sp1500_impute['ffi49'].astype(int) + +# there are two ways to impute: industrial median or mean +sp1500_impute = fillna_ind(sp1500_impute, method='median', ffi=49) +# we use all stocks' mean or median to fill na that are not filled by value of ffi +sp1500_impute = fillna_all(sp1500_impute, method='median') +sp1500_impute['re'] = sp1500_impute['re'].fillna(0) # re use IBES database, there are lots of missing data + +sp1500_impute['year'] = sp1500_impute['jdate'].dt.year +sp1500_impute = sp1500_impute[sp1500_impute['year'] >= 1972] +sp1500_impute = sp1500_impute.drop(['year'], axis=1) + +with open('sp1500_impute.pkl', 'wb') as f: + pkl.dump(sp1500_impute, f, protocol=4) + +# standardize characteristics +sp1500_rank = standardize(sp1500) +sp1500_rank['year'] = sp1500_rank['jdate'].dt.year +sp1500_rank = sp1500_rank[sp1500_rank['year'] >= 1972] +sp1500_rank = sp1500_rank.drop(['year'], axis=1) + +with open('sp1500_rank.pkl', 'wb') as f: + pkl.dump(sp1500_rank, f, protocol=4) \ No newline at end of file diff --git a/pychars/merge_chars.py b/pychars/merge_chars.py new file 
mode 100755 index 0000000..e042a3d --- /dev/null +++ b/pychars/merge_chars.py @@ -0,0 +1,86 @@ +import pandas as pd +import pickle as pkl +from pandas.tseries.offsets import * +import wrds + +with open('chars_q.pkl', 'rb') as f: + chars_q = pkl.load(f) + +chars_q = chars_q.dropna(subset=['permno']) +chars_q[['permno', 'gvkey']] = chars_q[['permno', 'gvkey']].astype(int) +chars_q['jdate'] = pd.to_datetime(chars_q['jdate']) +chars_q = chars_q.drop_duplicates(['permno', 'jdate']) + +with open('beta.pkl', 'rb') as f: + beta = pkl.load(f) + +beta['permno'] = beta['permno'].astype(int) +beta['jdate'] = pd.to_datetime(beta['date']) + MonthEnd(0) +beta = beta[['permno', 'jdate', 'beta']] +beta = beta.drop_duplicates(['permno', 'jdate']) + +chars_q = pd.merge(chars_q, beta, how='left', on=['permno', 'jdate']) + +with open('rvar_capm.pkl', 'rb') as f: + rvar_capm = pkl.load(f) + +rvar_capm['permno'] = rvar_capm['permno'].astype(int) +rvar_capm['jdate'] = pd.to_datetime(rvar_capm['date']) + MonthEnd(0) +rvar_capm = rvar_capm[['permno', 'jdate', 'rvar_capm']] +rvar_capm = rvar_capm.drop_duplicates(['permno', 'jdate']) + +chars_q = pd.merge(chars_q, rvar_capm, how='left', on=['permno', 'jdate']) + +with open('rvar_mean.pkl', 'rb') as f: + rvar_mean = pkl.load(f) + +rvar_mean['permno'] = rvar_mean['permno'].astype(int) +rvar_mean['jdate'] = pd.to_datetime(rvar_mean['date']) + MonthEnd(0) +rvar_mean = rvar_mean[['permno', 'jdate', 'rvar_mean']] +rvar_mean = rvar_mean.drop_duplicates(['permno', 'jdate']) + +chars_q = pd.merge(chars_q, rvar_mean, how='left', on=['permno', 'jdate']) + +with open('rvar_ff3.pkl', 'rb') as f: + rvar_ff3 = pkl.load(f) + +rvar_ff3['permno'] = rvar_ff3['permno'].astype(int) +rvar_ff3['jdate'] = pd.to_datetime(rvar_ff3['date']) + MonthEnd(0) +rvar_ff3 = rvar_ff3[['permno', 'jdate', 'rvar_ff3']] +rvar_ff3 = rvar_ff3.drop_duplicates(['permno', 'jdate']) + +chars_q = pd.merge(chars_q, rvar_ff3, how='left', on=['permno', 'jdate']) + +with open('sue.pkl', 'rb') 
as f: + sue = pkl.load(f) + +sue['permno'] = sue['permno'].astype(int) +sue['jdate'] = pd.to_datetime(sue['date']) + MonthEnd(0) +sue = sue[['permno', 'jdate', 'sue']] +sue = sue.drop_duplicates(['permno', 'jdate']) + +chars_q = pd.merge(chars_q, sue, how='left', on=['permno', 'jdate']) + +with open('re.pkl', 'rb') as f: + re = pkl.load(f) + +re['permno'] = re['permno'].astype(int) +re['jdate'] = pd.to_datetime(re['date']) + MonthEnd(0) +re = re[['permno', 'jdate', 're']] +re = re.drop_duplicates(['permno', 'jdate']) + +chars_q = pd.merge(chars_q, re, how='left', on=['permno', 'jdate']) + +with open('abr.pkl', 'rb') as f: + abr = pkl.load(f) + +abr['permno'] = abr['permno'].astype(int) +abr['jdate'] = pd.to_datetime(abr['date']) + MonthEnd(0) +abr = abr[['permno', 'jdate', 'abr']] +abr = abr.drop_duplicates(['permno', 'jdate']) + +chars_q = pd.merge(chars_q, abr, how='left', on=['permno', 'jdate']) + +# save data +with open('chars_q_raw.pkl', 'wb') as f: + pkl.dump(chars_q, f, protocol=4) \ No newline at end of file diff --git a/pychars/rvar_capm.py b/pychars/rvar_capm.py new file mode 100755 index 0000000..fa3a01c --- /dev/null +++ b/pychars/rvar_capm.py @@ -0,0 +1,168 @@ +# CAPM residual variance +# Note: Please use the latest version of pandas, this version should support returning to pd.Series after rolling +# To get a faster speed, we split the big dataframe into small ones +# Then using different process to calculate the variance +# We use 20 process to calculate variance, you can change the number of process according to your CPU situation +# You can use the following code to check your CPU situation +# import multiprocessing +# multiprocessing.cpu_count() + +import pandas as pd +import numpy as np +import datetime as dt +import wrds +from dateutil.relativedelta import * +from pandas.tseries.offsets import * +import datetime +import pickle as pkl +import multiprocessing as mp + +################### +# Connect to WRDS # +################### +conn = 
wrds.Connection() + +# CRSP Block +crsp = conn.raw_sql(""" + select a.permno, a.date, a.ret, (a.ret - b.rf) as exret, b.mktrf + from crsp.dsf as a + left join ff.factors_daily as b + on a.date=b.date + where a.date >= '01/01/1959' + """) + +# sort variables by permno and date +crsp = crsp.sort_values(by=['permno', 'date']) + +# change variable format to int +crsp['permno'] = crsp['permno'].astype(int) + +# Line up date to be end of month +crsp['date'] = pd.to_datetime(crsp['date']) + +# find the closest trading day to the end of the month +crsp['monthend'] = crsp['date'] + MonthEnd(0) +crsp['date_diff'] = crsp['monthend'] - crsp['date'] +date_temp = crsp.groupby(['permno', 'monthend'])['date_diff'].min() +date_temp = pd.DataFrame(date_temp) # convert Series to DataFrame +date_temp.reset_index(inplace=True) +date_temp.rename(columns={'date_diff': 'min_diff'}, inplace=True) +crsp = pd.merge(crsp, date_temp, how='left', on=['permno', 'monthend']) +crsp['sig'] = np.where(crsp['date_diff'] == crsp['min_diff'], 1, np.nan) + +# label every date of month end +crsp['month_count'] = crsp[crsp['sig'] == 1].groupby(['permno']).cumcount() + +# label numbers of months for a firm +month_num = crsp[crsp['sig'] == 1].groupby(['permno'])['month_count'].tail(1) +month_num = month_num.astype(int) +month_num = month_num.reset_index(drop=True) + +# mark the number of each month to each day of this month +crsp['month_count'] = crsp.groupby(['permno'])['month_count'].fillna(method='bfill') + +# crate a firm list +df_firm = crsp.drop_duplicates(['permno']) +df_firm = df_firm[['permno']] +df_firm['permno'] = df_firm['permno'].astype(int) +df_firm = df_firm.reset_index(drop=True) +df_firm = df_firm.reset_index() +df_firm = df_firm.rename(columns={'index': 'count'}) +df_firm['month_num'] = month_num + +###################### +# Calculate residual # +###################### + + +def get_res_var(df, firm_list): + """ + + :param df: stock dataframe + :param firm_list: list of firms matching stock 
dataframe + :return: dataframe with variance of residual + """ + for firm, count, prog in zip(firm_list['permno'], firm_list['month_num'], range(firm_list['permno'].count()+1)): + prog = prog + 1 + print('processing permno %s' % firm, '/', 'finished', '%.2f%%' % ((prog/firm_list['permno'].count())*100)) + for i in range(count + 1): + # if you want to change the rolling window, please change here: i - 2 means 3 months is a window. + temp = df[(df['permno'] == firm) & (i - 2 <= df['month_count']) & (df['month_count'] <= i)] + # if observations in last 3 months are less 21, we drop the rvar of this month + if temp['permno'].count() < 21: + pass + else: + rolling_window = temp['permno'].count() + index = temp.tail(1).index + X = pd.DataFrame() + X[['mktrf']] = temp[['mktrf']] + X['intercept'] = 1 + X = X[['intercept', 'mktrf']] + X = np.mat(X) + Y = np.mat(temp[['exret']]) + res = (np.identity(rolling_window) - X.dot(X.T.dot(X).I).dot(X.T)).dot(Y) + res_var = res.var(ddof=1) + df.loc[index, 'rvar'] = res_var + return df + + +def sub_df(start, end, step): + """ + + :param start: the quantile to start cutting, usually it should be 0 + :param end: the quantile to end cutting, usually it should be 1 + :param step: quantile step + :return: a dictionary including all the 'firm_list' dataframe and 'stock data' dataframe + """ + # we use dict to store different sub dataframe + temp = {} + for i, h in zip(np.arange(start, end, step), range(int((end-start)/step))): + print('processing splitting dataframe:', round(i, 2), 'to', round(i + step, 2)) + if i == 0: # to get the left point + temp['firm' + str(h)] = df_firm[df_firm['count'] <= df_firm['count'].quantile(i + step)] + temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', + on='permno').dropna(subset=['count']) + else: + temp['firm' + str(h)] = df_firm[(df_firm['count'].quantile(i) < df_firm['count']) & ( + df_firm['count'] <= df_firm['count'].quantile(i + step))] + temp['crsp' + str(h)] = pd.merge(crsp, 
temp['firm' + str(h)], how='left', + on='permno').dropna(subset=['count']) + return temp + + +def main(start, end, step): + """ + + :param start: the quantile to start cutting, usually it should be 0 + :param end: the quantile to end cutting, usually it should be 1 + :param step: quantile step + :return: a dataframe with calculated variance of residual + """ + df = sub_df(start, end, step) + pool = mp.Pool() + p_dict = {} + for i in range(int((end-start)/step)): + p_dict['p' + str(i)] = pool.apply_async(get_res_var, (df['crsp%s' % i], df['firm%s' % i],)) + pool.close() + pool.join() + result = pd.DataFrame() + print('processing pd.concat') + for h in range(int((end-start)/step)): + result = pd.concat([result, p_dict['p%s' % h].get()]) + return result + + +# calculate variance of residual through rolling window +# Note: please split dataframe according to your CPU situation. For example, we split dataframe to (1-0)/0.05 = 20 sub +# dataframes here, so the function will use 20 cores to calculate variance of residual. 
+if __name__ == '__main__': + crsp = main(0, 1, 0.05) + +# process dataframe +crsp = crsp.dropna(subset=['rvar']) # drop NA due to rolling +crsp = crsp.rename(columns={'rvar': 'rvar_capm'}) +crsp = crsp.reset_index(drop=True) +crsp = crsp[['permno', 'date', 'rvar_capm']] + +with open('rvar_capm.pkl', 'wb') as f: + pkl.dump(crsp, f) \ No newline at end of file diff --git a/pychars/rvar_ff3.py b/pychars/rvar_ff3.py new file mode 100755 index 0000000..36561a0 --- /dev/null +++ b/pychars/rvar_ff3.py @@ -0,0 +1,201 @@ +# Fama & French 3 factors residual variance +# Note: Please use the latest version of pandas, this version should support returning to pd.Series after rolling +# To get a faster speed, we split the big dataframe into small ones +# Then using different process to calculate the variance +# We use 20 process to calculate variance, you can change the number of process according to your CPU situation +# You can use the following code to check your CPU situation +# import multiprocessing +# multiprocessing.cpu_count() + +import pandas as pd +import numpy as np +import datetime as dt +import wrds +from dateutil.relativedelta import * +from pandas.tseries.offsets import * +import datetime +import pickle as pkl +import multiprocessing as mp + +################### +# Connect to WRDS # +################### +conn = wrds.Connection() + +# CRSP Block +crsp = conn.raw_sql(""" + select a.permno, a.date, a.ret, (a.ret - b.rf) as exret, b.mktrf, b.smb, b.hml + from crsp.dsf as a + left join ff.factors_daily as b + on a.date=b.date + where a.date > '01/01/1959' + """) + +# sort variables by permno and date +crsp = crsp.sort_values(by=['permno', 'date']) + +# change variable format to int +crsp['permno'] = crsp['permno'].astype(int) + +# Line up date to be end of month +crsp['date'] = pd.to_datetime(crsp['date']) + +# find the closest trading day to the end of the month +crsp['monthend'] = crsp['date'] + MonthEnd(0) +crsp['date_diff'] = crsp['monthend'] - crsp['date'] 
+date_temp = crsp.groupby(['permno', 'monthend'])['date_diff'].min() +date_temp = pd.DataFrame(date_temp) # convert Series to DataFrame +date_temp.reset_index(inplace=True) +date_temp.rename(columns={'date_diff': 'min_diff'}, inplace=True) +crsp = pd.merge(crsp, date_temp, how='left', on=['permno', 'monthend']) +crsp['sig'] = np.where(crsp['date_diff'] == crsp['min_diff'], 1, np.nan) + +# label every date of month end +crsp['month_count'] = crsp[crsp['sig'] == 1].groupby(['permno']).cumcount() + +# label numbers of months for a firm +month_num = crsp[crsp['sig'] == 1].groupby(['permno'])['month_count'].tail(1) +month_num = month_num.astype(int) +month_num = month_num.reset_index(drop=True) + +# mark the number of each month to each day of this month +crsp['month_count'] = crsp.groupby(['permno'])['month_count'].fillna(method='bfill') + +# crate a firm list +df_firm = crsp.drop_duplicates(['permno']) +df_firm = df_firm[['permno']] +df_firm['permno'] = df_firm['permno'].astype(int) +df_firm = df_firm.reset_index(drop=True) +df_firm = df_firm.reset_index() +df_firm = df_firm.rename(columns={'index': 'count'}) +df_firm['month_num'] = month_num + +###################### +# Calculate the beta # +###################### +# function that get multiple beta +'''' +rolling_window = 60 # 60 trading days +crsp['beta_mktrf'] = np.nan +crsp['beta_smb'] = np.nan +crsp['beta_hml'] = np.nan + + +def get_beta(df): + """ + The original idea of calculate beta is using formula (X'MX)^(-1)X'MY, + where M = I - 1(1'1)^{-1}1, I is a identity matrix. 
+ + """ + temp = crsp.loc[df.index] # extract the rolling sub dataframe from original dataframe + X = np.mat(temp[['mktrf', 'smb', 'hml']]) + Y = np.mat(temp[['exret']]) + ones = np.mat(np.ones(rolling_window)).T + M = np.identity(rolling_window) - ones.dot((ones.T.dot(ones)).I).dot(ones.T) + beta = (X.T.dot(M).dot(X)).I.dot((X.T.dot(M).dot(Y))) + crsp['beta_mktrf'].loc[df.index[-1:]] = beta[0] + crsp['beta_smb'].loc[df.index[-1:]] = beta[1] + crsp['beta_hml'].loc[df.index[-1:]] = beta[2] + return 0 # we do not need the rolling outcome since rolling cannot return different values in different columns + + +# calculate beta through rolling window +crsp_temp = crsp.groupby('permno').rolling(rolling_window).apply(get_beta, raw=False) +''' + +###################### +# Calculate residual # +###################### + + +def get_res_var(df, firm_list): + """ + + :param df: stock dataframe + :param firm_list: list of firms matching stock dataframe + :return: dataframe with variance of residual + """ + for firm, count, prog in zip(firm_list['permno'], firm_list['month_num'], range(firm_list['permno'].count()+1)): + prog = prog + 1 + print('processing permno %s' % firm, '/', 'finished', '%.2f%%' % ((prog/firm_list['permno'].count())*100)) + for i in range(count + 1): + # if you want to change the rolling window, please change here: i - 2 means 3 months is a window. 
+ temp = df[(df['permno'] == firm) & (i - 2 <= df['month_count']) & (df['month_count'] <= i)] + # if observations in last 3 months are less 21, we drop the rvar of this month + if temp['permno'].count() < 21: + pass + else: + rolling_window = temp['permno'].count() + index = temp.tail(1).index + X = pd.DataFrame() + X[['mktrf', 'smb', 'hml']] = temp[['mktrf', 'smb', 'hml']] + X['intercept'] = 1 + X = X[['intercept', 'mktrf', 'smb', 'hml']] + X = np.mat(X) + Y = np.mat(temp[['exret']]) + res = (np.identity(rolling_window) - X.dot(X.T.dot(X).I).dot(X.T)).dot(Y) + res_var = res.var(ddof=1) + df.loc[index, 'rvar'] = res_var + return df + + +def sub_df(start, end, step): + """ + + :param start: the quantile to start cutting, usually it should be 0 + :param end: the quantile to end cutting, usually it should be 1 + :param step: quantile step + :return: a dictionary including all the 'firm_list' dataframe and 'stock data' dataframe + """ + # we use dict to store different sub dataframe + temp = {} + for i, h in zip(np.arange(start, end, step), range(int((end-start)/step))): + print('processing splitting dataframe:', round(i, 2), 'to', round(i + step, 2)) + if i == 0: # to get the left point + temp['firm' + str(h)] = df_firm[df_firm['count'] <= df_firm['count'].quantile(i + step)] + temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', + on='permno').dropna(subset=['count']) + else: + temp['firm' + str(h)] = df_firm[(df_firm['count'].quantile(i) < df_firm['count']) & ( + df_firm['count'] <= df_firm['count'].quantile(i + step))] + temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', + on='permno').dropna(subset=['count']) + return temp + + +def main(start, end, step): + """ + + :param start: the quantile to start cutting, usually it should be 0 + :param end: the quantile to end cutting, usually it should be 1 + :param step: quantile step + :return: a dataframe with calculated variance of residual + """ + df = sub_df(start, end, step) 
+ pool = mp.Pool() + p_dict = {} + for i in range(int((end-start)/step)): + p_dict['p' + str(i)] = pool.apply_async(get_res_var, (df['crsp%s' % i], df['firm%s' % i],)) + pool.close() + pool.join() + result = pd.DataFrame() + print('processing pd.concat') + for h in range(int((end-start)/step)): + result = pd.concat([result, p_dict['p%s' % h].get()]) + return result + + +# calculate variance of residual through rolling window +# Note: please split dataframe according to your CPU situation. For example, we split dataframe to (1-0)/0.05 = 20 sub +# dataframes here, so the function will use 20 cores to calculate variance of residual. +if __name__ == '__main__': + crsp = main(0, 1, 0.05) + +# process dataframe +crsp = crsp.dropna(subset=['rvar']) # drop NA due to rolling +crsp = crsp.rename(columns={'rvar': 'rvar_ff3'}) +crsp = crsp.reset_index(drop=True) +crsp = crsp[['permno', 'date', 'rvar_ff3']] + +with open('rvar_ff3.pkl', 'wb') as f: + pkl.dump(crsp, f) \ No newline at end of file diff --git a/pychars/rvar_mean.py b/pychars/rvar_mean.py new file mode 100755 index 0000000..42297f4 --- /dev/null +++ b/pychars/rvar_mean.py @@ -0,0 +1,150 @@ +# RVAR mean + +import pandas as pd +import numpy as np +import datetime as dt +import wrds +from dateutil.relativedelta import * +from pandas.tseries.offsets import * +import datetime +import pickle as pkl +import multiprocessing as mp + +################### +# Connect to WRDS # +################### +conn = wrds.Connection() + +# CRSP Block +crsp = conn.raw_sql(""" + select permno, date, ret + from crsp.dsf + where date >= '01/01/1959' + """) + +# sort variables by permno and date +crsp = crsp.sort_values(by=['permno', 'date']) + +# change variable format to int +crsp['permno'] = crsp['permno'].astype(int) + +# Line up date to be end of month +crsp['date'] = pd.to_datetime(crsp['date']) + +# find the closest trading day to the end of the month +crsp['monthend'] = crsp['date'] + MonthEnd(0) +crsp['date_diff'] = crsp['monthend'] - 
crsp['date'] +date_temp = crsp.groupby(['permno', 'monthend'])['date_diff'].min() +date_temp = pd.DataFrame(date_temp) # convert Series to DataFrame +date_temp.reset_index(inplace=True) +date_temp.rename(columns={'date_diff': 'min_diff'}, inplace=True) +crsp = pd.merge(crsp, date_temp, how='left', on=['permno', 'monthend']) +crsp['sig'] = np.where(crsp['date_diff'] == crsp['min_diff'], 1, np.nan) + +# label every date of month end +crsp['month_count'] = crsp[crsp['sig'] == 1].groupby(['permno']).cumcount() + +# label numbers of months for a firm +month_num = crsp[crsp['sig'] == 1].groupby(['permno'])['month_count'].tail(1) +month_num = month_num.astype(int) +month_num = month_num.reset_index(drop=True) + +# mark the number of each month to each day of this month +crsp['month_count'] = crsp.groupby(['permno'])['month_count'].fillna(method='bfill') + +# crate a firm list +df_firm = crsp.drop_duplicates(['permno']) +df_firm = df_firm[['permno']] +df_firm['permno'] = df_firm['permno'].astype(int) +df_firm = df_firm.reset_index(drop=True) +df_firm = df_firm.reset_index() +df_firm = df_firm.rename(columns={'index': 'count'}) +df_firm['month_num'] = month_num + +###################### +# Calculate variance # +###################### + + +def get_ret_var(df, firm_list): + """ + + :param df: stock dataframe + :param firm_list: list of firms matching stock dataframe + :return: dataframe with variance of residual + """ + for firm, count, prog in zip(firm_list['permno'], firm_list['month_num'], range(firm_list['permno'].count()+1)): + prog = prog + 1 + print('processing permno %s' % firm, '/', 'finished', '%.2f%%' % ((prog/firm_list['permno'].count())*100)) + for i in range(count + 1): + # if you want to change the rolling window, please change here: i - 2 means 3 months is a window. 
+ temp = df[(df['permno'] == firm) & (i - 2 <= df['month_count']) & (df['month_count'] <= i)] + # if observations in last 3 months are less 21, we drop the rvar of this month + if temp['permno'].count() < 21: + pass + else: + index = temp.tail(1).index + ret_var = temp['ret'].var() + df.loc[index, 'rvar'] = ret_var + return df + +def sub_df(start, end, step): + """ + + :param start: the quantile to start cutting, usually it should be 0 + :param end: the quantile to end cutting, usually it should be 1 + :param step: quantile step + :return: a dictionary including all the 'firm_list' dataframe and 'stock data' dataframe + """ + # we use dict to store different sub dataframe + temp = {} + for i, h in zip(np.arange(start, end, step), range(int((end-start)/step))): + print('processing splitting dataframe:', round(i, 2), 'to', round(i + step, 2)) + if i == 0: # to get the left point + temp['firm' + str(h)] = df_firm[df_firm['count'] <= df_firm['count'].quantile(i + step)] + temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', + on='permno').dropna(subset=['count']) + else: + temp['firm' + str(h)] = df_firm[(df_firm['count'].quantile(i) < df_firm['count']) & ( + df_firm['count'] <= df_firm['count'].quantile(i + step))] + temp['crsp' + str(h)] = pd.merge(crsp, temp['firm' + str(h)], how='left', + on='permno').dropna(subset=['count']) + return temp + + +def main(start, end, step): + """ + + :param start: the quantile to start cutting, usually it should be 0 + :param end: the quantile to end cutting, usually it should be 1 + :param step: quantile step + :return: a dataframe with calculated variance of residual + """ + df = sub_df(start, end, step) + pool = mp.Pool() + p_dict = {} + for i in range(int((end-start)/step)): + p_dict['p' + str(i)] = pool.apply_async(get_ret_var, (df['crsp%s' % i], df['firm%s' % i],)) + pool.close() + pool.join() + result = pd.DataFrame() + print('processing pd.concat') + for h in range(int((end-start)/step)): + result = 
pd.concat([result, p_dict['p%s' % h].get()]) + return result + + +# calculate variance of residual through rolling window +# Note: please split dataframe according to your CPU situation. For example, we split dataframe to (1-0)/0.05 = 20 sub +# dataframes here, so the function will use 20 cores to calculate variance of residual. +if __name__ == '__main__': + crsp = main(0, 1, 0.05) + +# process dataframe +crsp = crsp.dropna(subset=['rvar']) # drop NA due to rolling +crsp = crsp.rename(columns={'rvar': 'rvar_mean'}) +crsp = crsp.reset_index(drop=True) +crsp = crsp[['permno', 'date', 'rvar_mean']] + +with open('rvar_mean.pkl', 'wb') as f: + pkl.dump(crsp, f) \ No newline at end of file diff --git a/qsub/check_crsp.sas b/qsub/check_crsp.sas new file mode 100755 index 0000000..5b30574 --- /dev/null +++ b/qsub/check_crsp.sas @@ -0,0 +1,2 @@ +proc contents data=crsp.dsf; +run; diff --git a/qsub/submit.sh b/qsub/submit.sh new file mode 100755 index 0000000..1c552fb --- /dev/null +++ b/qsub/submit.sh @@ -0,0 +1,11 @@ +#!/bin/bash +#$ -cwd +#$ -m abe +#$ -M xinhe9715@126.com +#R CMD BATCH my_program.r my_program.Output +#python3 PyProgram.py &> PyProgram.out +sas check_crsp.sas + +## if you need to add cpu and memory +##$ -pe onenode 8 +##$ -l m_mem_free=6G diff --git a/setup-wrds.py b/setup-wrds.py new file mode 100755 index 0000000..70d48d6 --- /dev/null +++ b/setup-wrds.py @@ -0,0 +1,11 @@ +# set up the .pgpass file +# then you don't need to type in password +import wrds +db = wrds.Connection(wrds_username='xinhe97') +# Enter your WRDS username [joe]: +# Enter your password: +db.create_pgpass_file() +db.close() +# check again +db = wrds.Connection(wrds_username='xinhe97') +db.close() From 0c92efbeda78b77f0b44f39d8d615766ffb8f74b Mon Sep 17 00:00:00 2001 From: velonisa Date: Sat, 6 Mar 2021 10:08:14 +0800 Subject: [PATCH 15/15] update 0306 --- char60/accounting_60.py | 0 char60/bid_ask_spread.py | 0 char60/functions.py | 0 char60/ill.py | 0 
char60/impute_rank_output_bchmk_60.py | 0 char60/maxret_d.py | 0 char60/merge_chars_60.py | 0 char60/pkl_to_csv.py | 0 char60/split_csv.py | 38 +++++++++++++++++++++++++++ char60/std_dolvol.py | 0 char60/std_turn.py | 0 char60/zerotrade.py | 0 12 files changed, 38 insertions(+) mode change 100755 => 100644 char60/accounting_60.py mode change 100755 => 100644 char60/bid_ask_spread.py mode change 100755 => 100644 char60/functions.py mode change 100755 => 100644 char60/ill.py mode change 100755 => 100644 char60/impute_rank_output_bchmk_60.py mode change 100755 => 100644 char60/maxret_d.py mode change 100755 => 100644 char60/merge_chars_60.py mode change 100755 => 100644 char60/pkl_to_csv.py create mode 100644 char60/split_csv.py mode change 100755 => 100644 char60/std_dolvol.py mode change 100755 => 100644 char60/std_turn.py mode change 100755 => 100644 char60/zerotrade.py diff --git a/char60/accounting_60.py b/char60/accounting_60.py old mode 100755 new mode 100644 diff --git a/char60/bid_ask_spread.py b/char60/bid_ask_spread.py old mode 100755 new mode 100644 diff --git a/char60/functions.py b/char60/functions.py old mode 100755 new mode 100644 diff --git a/char60/ill.py b/char60/ill.py old mode 100755 new mode 100644 diff --git a/char60/impute_rank_output_bchmk_60.py b/char60/impute_rank_output_bchmk_60.py old mode 100755 new mode 100644 diff --git a/char60/maxret_d.py b/char60/maxret_d.py old mode 100755 new mode 100644 diff --git a/char60/merge_chars_60.py b/char60/merge_chars_60.py old mode 100755 new mode 100644 diff --git a/char60/pkl_to_csv.py b/char60/pkl_to_csv.py old mode 100755 new mode 100644 diff --git a/char60/split_csv.py b/char60/split_csv.py new file mode 100644 index 0000000..c11fafe --- /dev/null +++ b/char60/split_csv.py @@ -0,0 +1,38 @@ +import pickle as pkl +import pandas as pd + +# with open('/Users/eric/WeDrive/Feng-CityUHK/Data/pychars/chars60/raw/chars_impute_60.pkl', 'rb') as f: +# chars = pkl.load(f) + +with 
open('/Users/eric/WeDrive/Feng-CityUHK/Data/pychars/chars60/rank/chars_rank_60.pkl', 'rb') as f: + chars = pkl.load(f) + +# chars = chars[['permno', 'gvkey', 'datadate', 'jdate', 'ffi49', 'sic', 'exchcd', 'shrcd', 'ret', 'retx', 'retadj', +# 'rank_abr', 'rank_acc', 'rank_adm', 'rank_agr', 'rank_alm', 'rank_ato', 'rank_baspread', 'rank_beta', +# 'rank_bm', 'rank_bm_ia', 'rank_cash', 'rank_cashdebt', 'rank_cfp', +# 'rank_chcsho', 'rank_chpm', 'rank_chtx', 'rank_cinvest', 'rank_depr', 'rank_dolvol', 'rank_dy', +# 'rank_ep', 'rank_gma', 'rank_grltnoa', 'rank_herf', 'rank_hire', +# 'rank_ill', 'rank_lev', 'rank_lgr', 'rank_maxret', 'rank_me_ia', 'rank_mom12m', 'rank_mom1m', +# 'rank_mom36m', 'rank_mom60m', 'rank_mom6m', 'rank_ni', 'rank_nincr', +# 'rank_noa', 'rank_op', 'rank_pctacc', 'rank_pm', 'rank_pscore', 'rank_rd_sale', 'rank_rdm', 'rank_re', +# 'rank_rna', 'rank_roa', 'rank_roe', 'rank_rsup', 'rank_rvar_capm', +# 'rank_rvar_ff3', 'rank_rvar_mean', 'rank_seas1a', 'rank_sgr', 'rank_sp', 'rank_std_dolvol', +# 'rank_std_turn', 'rank_sue', 'rank_turn', 'rank_zerotrade']] + +print(chars.columns.values) + +chars['jdate'] = pd.to_datetime(chars['jdate']) +chars['year'] = chars['jdate'].dt.year +chars_1970s = chars[chars['year'] < 1980] +chars_1980s = chars[(chars['year'] >= 1980) & (chars['year'] < 1990)] +chars_1990s = chars[(chars['year'] >= 1990) & (chars['year'] < 2000)] +chars_2000s = chars[(chars['year'] >= 2000) & (chars['year'] < 2010)] +chars_2010s = chars[(chars['year'] >= 2010) & (chars['year'] < 2020)] + +chars_1970s.to_csv('chars60_rank_1970s.csv', index=0) +chars_1980s.to_csv('chars60_rank_1980s.csv', index=0) +chars_1990s.to_csv('chars60_rank_1990s.csv', index=0) +chars_2000s.to_csv('chars60_rank_2000s.csv', index=0) +chars_2010s.to_csv('chars60_rank_2010s.csv', index=0) +# +# print(chars_2010s[chars_2010s['permno']==14593][['permno', 'jdate']]) diff --git a/char60/std_dolvol.py b/char60/std_dolvol.py old mode 100755 new mode 100644 diff --git
a/char60/std_turn.py b/char60/std_turn.py old mode 100755 new mode 100644 diff --git a/char60/zerotrade.py b/char60/zerotrade.py old mode 100755 new mode 100644