/*******************************************************************************************/ /* Program : ARC.sas */ /* Authors : Rani Hoitash; Udi Hoitash */ /* Date Created : March 2022 */ /* */ /* */ /* */ /* Description: Construct Accounting Reporting Complexity (ARC), and */ /* several other correlated measures using SEC data. */ /* */ /* */ /* */ /* The code demonstrates the creation of ARC using one calendar quarter of SEC filings. */ /* Users that wish to process more than one quarter/month of filings will need to create */ /* a Macro */ /* */ /* To obtain the data used to create ARC go to: */ /* https://www.sec.gov/dera/data/financial-statement-and-notes-data-set.html */ /* Download and extract the zip files for the period you wish to investigate. At this */ /* point (04/01/2022) there are over 50 files. Zip files are available for each */ /* quarter until Q3 2020 and for each month thereafter (The SEC switched permanently to */ /* monthly aggregation on 10/2020). There is no difference in the structure of the */ /* quarterly and */ /* monthly files. You will need to import into SAS each of the five file types */ /* (defined below). To best use the Macro, you should name each file with a */ /* suffix that captures the period. For example, the SUB file for Q3 2020 should be */ /* imported as "sub_q32020", the SUB file for 10/2020 should be saved as sub_m102020, etc.*/ /* the NUM file for Q3 2020 should be imported as "num_q32020". */ /* In this way, you can later call the Macro and insert the "period=" as "_q32020" */ /* or "_m102020" etc. */ /* */ /* The ARC data is frequently updated. To download the data directly, instead of running */ /* the code, go to www.xbrlresearch.com. This data source can also be used to validate */ /* your code. */ /* */ /* The code uses the following Five TSV file types from the Zip file: */ /* */ /* 1) "NUM" (numbers), which contains all numeric XBRL data as filed. */ /* 2) "TAG" (Tags), which contains standard Taxonomy tags and Extended (firm-created) */ /* custom tags defined in the submissions (extensions); */ /* 3) "PRE" (Presentation), which contains one row for each line of the financial */ /* statements tagged by the filer. */ /* 4) "TXT" (Plain Text), which contains non-numeric XBRL tags(e.g., text) data. */ /* 5) "SUB"(Submissions),which contains a summary about an entire EDGAR submission. */ /* */ /* Currently, the code DOES NOT use the following three TSV file types from the Zip file:*/ /* */ /* 6) "REN" (Rendering), which summarizes for each filing the data provided by filers */ /* about each presentation group. */ /* 7) "CAL" (Calculation), provides information to arithmetically relate tags in a filing.*/ /* 8) "DIM"(Dimension), contains XBRL dimensional tags,(e.g. segments, fair value level) */ /* */ /********************************************************************************************/ * Define a library called SEC that points to the location on your computer of the SAS files you imported. For example, if all the files are saved under the C drive in the XBRL folder, the following pass is used "libname sec 'C:\XBRL' Remember, the final processed data will also be saved to the same folder; libname sec 'C:\XBRL'; /*Start Macro. The Macro accepts one value, 'period', representing the quarter or month, e.g. "period=_q32020". At the bottom of the code we illustrate how to invoke the Macro to estimate ARC for two periods, _q32020 and _m102020 you will need to run the Macro code below including invoking the Macro at the end of this code*/ %macro run_arc (period=); PROC DATASETS LIB=work MEMTYPE=data kill; dm "out;clear;log;clear;"; * Start with the "NUM" (numbers) file and Merge with the "TAG" (Tags) * ARC only captures monetary tags that we identify using the field "datatype" in the dataset. * We also differentiate between Taxonomy and Extended(i.e., firm-created) tags using the "Custom" variable; * The code below merges these variables; * Version- The version of the taxonomy, which is updated annually by the FASB. If the tag is custom, the version accepts the value of the accession number.; proc sql; create table num as select a.*, b.datatype, b.custom, 1 as dummy from sec.num&period as a left join sec.tag&period as b on a.tag=b.tag and a.version=b.version; quit; * Calculate the total number of numerical disclosures * The primary key is Accession Number (ADSH). The 20-character string formed from the 18-digit number assigned by the SEC to each EDGAR submission.; proc sql; create table count_all_numbers as select adsh, count(dummy) as ARC_ALL_NUMS label="A count of all XBRL numbers", sum(custom) as ARC_ALL_NUMS_EXTENSIONS label="A count of all Numeric XBRL extended tags", mean(custom) as ARC_ALL_NUMS_PCT_EXTENDED label="Percent of extended tags of all XBRL numbers" from num group by adsh; quit; * ARC is based on monetary tags only, so the next procedure keeps only monetary tags. Note, numeric disclosures can include other amounts, such as percent or shares; data nummonetary; set num; where datatype="monetary"; run; * Count all monetary numbers(facts)in each submission; proc sql; create table count_all_monetary_numbers as select adsh, count(dummy) as ARC_ALL_MONETARY "A count of all XBRL monetary numbers", sum(custom) as ARC_ALL_MONETARY_EXTENSIONS label="A count of all Numeric XBRL extended tags", mean(custom) as ARC_ALL_MONETARY_PCT_EXTENDED label="Percent of extended tags of all XBRL monetary numbers" from nummonetary group by adsh; quit; /*Please note that tags repeat for several reasons (1) The same tag is reported over several periods (e.g., "Revenues" for 2020 and 2021) (2) The same tag is reported for the same period, but for a different dimension, e.g., goodwill for two separate acquisitions, or revenue for two different segments. Hence, tags can repeat even within a footnote, or a statement. ARC assumes that preparing and consuming the same tag does not change complexity. Nevertheless, the code also generates the total count of numbers without removing duplications. As reported in Hoitash and Hoitash (2018), results are similar when either measure is used. */ * Remove repeating tags by accession and in each report (e.g., table, note, statement) To achieve this we use the "PRE" (Presentation).Note that there may be more than one row in "PRE" per one entry in "NUM" because the same tag can appear in more than one statement (e.g., the tag "NetIncome", can appear in both the "Income Statement" and the "Cash Flows" statement, and the tag "Cash" may appear in both the "Balance Sheet" and statement of "Cash Flows").; proc sort nodupkey data=sec.pre&period out=pre; by adsh report tag; run; * Next, add all monetary numbers to the PRE file, as expected, this results in duplicates; proc sql; create table prenum as select a.*, b.* from nummonetary as a left join pre b on a.adsh=b.adsh and a.tag=b.tag; quit; * Next, use data in the PRE file to classify tags into specific financial statements. Tags NOT in any financial statement are classified into the financial statement notes; *(CP = Cover Page, BS = Balance Sheet, IS = Income Statement, CF = Cash Flow, EQ = Equity, CI = Comprehensive Income, UN = Unclassifiable Statement).; data prenum; set prenum; * Face is defined for tags in the financial statements=1, or for notes face=0; if stmt in ('BS' 'IS' 'CF' 'EQ' 'CI' 'UN') then face=1; else face=0; if face=0 then notes=1; else notes=0; if stmt="BS" then BS=1; else BS=0; if stmt="CF" then CF=1; else CF=0; if stmt="EQ" then EQ=1; else EQ=0; if stmt="IS" or stmt="CI" then IS=1; else IS=0; if stmt="UN" then UN=1; else UN=0; run; /*Because ARC captures unique tags within a statement/footnote/table, we remove duplicates. Recall, per ARC, tags can repeat, but not within the same disclosure */ proc sort nodupkey data=prenum; by adsh report tag; quit; * Aggregate several counts to construct: (1) ARC (2)number of extended (custom) tags (3) percent of extended tags (4) ARC in financial statements and notes (5) ARC in each financial statement; proc sql; create table arcsec as select adsh, count(custom) as ARC label="Accounting Reporting Complexity", sum(custom) as ARC_EXTENSIONS "Accounting Reporting Complexity Extended Tags", mean(custom) as ARC_PCT_EXTENDED label="Accounting Reporting Complexity Percent of extended tags", sum(face) as ARC_FACE "Accounting Reporting Complexity Financial Statements", sum(is) as ARC_IS "Accounting Reporting Complexity Income Statement", sum(bs) as ARC_BS "Accounting Reporting Complexity Balance Sheet Statement", sum(cf) as ARC_CF "Accounting Reporting Complexity Cash Flow Statement", sum(eq) as ARC_EQ "Accounting Reporting Complexity Equity Statement", sum(un) as ARC_UN "Accounting Reporting Complexity Unclassifiable Statement", sum(notes) as ARC_NOTES "Accounting Reporting Complexity Financial Statement Notes", sum (case when face=1 and custom=1 then face else 0 end) as ARC_FACE_EXTENSIONS "Accounting Reporting Complexity Face Extensions", sum (case when notes=1 and custom=1 then notes else 0 end) as ARC_NOTES_EXTENSIONS "Accounting Reporting Complexity Notes Extensions" from prenum group by adsh; quit; * Merge several counts by accession (ADSH): 1)the count of all numeric tags 2) all monetary tags 3) ARC; proc sql; create table arcsec as select a.*, b.* from arcsec a left join count_all_monetary_numbers b on a.adsh=b.adsh; quit; * Next, merge the count for all nums (not only monetary); proc sql; create table arcsec as select a.*, b.* from arcsec a left join count_all_numbers b on a.adsh=b.adsh; quit; * Create a count of unique tags within each filing; * keep unique tags; proc sort nodupkey data=prenum out=uniquetags; by adsh tag; quit; * calculate the count of unique tags; proc sql; create table arcsecunique as select adsh, count(custom) as ARC_UNIQUE_MONETARY_TAGS label = "Number of Unique Tags in A filing", sum(custom) as ARC_UNIQUE_MONETARY_EXTENSIONS label = "Number of Unique Extended Tags in A filing", mean(custom) as ARC_UNIQUE_MONETARY_PCT_EXTENDED label="Percent of Unique extended Tags" from uniquetags group by adsh; quit; * merge the count of unique tags into the "arcsec" file; proc sql; create table arcsec as select a.*, b.* from arcsec a left join arcsecunique b on a.adsh=b.adsh; quit; * create a count of all non-numeric (i.e., text) tags from: "TXT" (Plain Text); proc sql; create table count_text_tags as select adsh, count(adsh) as ARC_ALL_TEXTBLOCK_TAGS label = "Number of Textblock Tags in A filing" from sec.txt&period group by adsh; quit; * Merge text XBRL tags to arcsec; proc sql; create table arcsec as select a.*, b.* , (a.ARC_ALL_NUMS+b.ARC_ALL_TEXTBLOCK_TAGS) as ARC_ALL_FACTS label = "A count of all XBRL facts" from arcsec a left join count_text_tags b on a.adsh=b.adsh; quit; * Add several variables from the "SUB" file that contain important information about each filing (ADSH). For example, the CIK, the form type (10-K, 10-Q), the period end etc.; proc sql; create table arcsec as select a.*, b.cik, b.sic, b.afs as filer_status, b.form, b.period, b.filed, b.accepted, b.nciks from arcsec a left join sec.sub&period b on a.adsh=b.adsh; quit; * Create a date for filing period end. This date, a long with "CIK", can be used to match the data with the Compustat based on the following two Compustat variables: DATADATE and CIK; data arcsec; set arcsec; period_end= input(put(period, 8.), yymmdd8.); format period_end yymmdd8.; run; * Keep only annual and quarterly filings, as well as any associated amended filings. You can modify the code below to keep other forms; data arcsec; set arcsec; IF INDEX(form,"10-K") or INDEX(form,"10-K/A") or INDEX(form,"10-Q") or INDEX(form,"10-Q/A")or INDEX(form,"20-F") or INDEX(form,"20-F/A") or INDEX(form,"40-F") or INDEX(form,"40-F/A") or INDEX(form,"6-K") or INDEX(form,"6-K/A") or INDEX(form,"NT") Then keep=1; else keep=0; run; *Keep the final data set containing ARC as well as other measures created above. See Appendix of paper for detailed variable definitions; * Each dataset is named with a suffix, representing the period; data sec.arc. set arcsec; where keep=1; run; %mend run_arc; /* This is where users invoke the Macro. The Macro is run for each period separately. To run the Macro for a certain period, you should add a row calling for the Macro with the period you are interested in. In the illustration below, we invoke the Macro for two periods, the third quarter of 2020 and October 2020. The output files are: Arc_q32020 and Arc_m102020 , i.e., the ARC measures for the periods that we ran the Macro on. To get ARC for all periods you need to separately run the Macro for each period and then combine all the output files together*/ %run_arc (period=_q32020); %run_arc (period=_m102020);