/* Version 1.0 Beta - CalculateSurvivalTimeInMonths.sas                                     */
/* This SAS program is provided to calculate 4 fields which are described at               */
/* http://seer.cancer.gov/survivaltime/                                                     */
/* In addition to descriptions of the fields, you can find other documentation             */
/* and contact information on the web page.                                                 */
/* Please note this is a Beta version. It is still being tested for accuracy.              */
/* Additional comments will be added and the code will be reviewed for efficiency.         */
/*                                                                                          */
/* Input : fixed-width records read from fileref IN (3339 bytes per record).               */
/* Output: the same records written to fileref OUT with 4 fields overlaid:                 */
/*           @2593 surv_mon     (z4.)  survival months                                     */
/*           @2597 surv_flag    (1.)   survival flag                                       */
/*           @2598 pa_surv_mon  (z4.)  survival months, presumed alive version             */
/*           @2602 pa_surv_flag (1.)   survival flag, presumed alive version               */
/*         Records that cannot be calculated get 9999 / 9.                                 */

filename in "c:\mydata\myinfile.txt";
filename out "c:\mydata\myoutputfile.txt";

/* Assumptions:                                                                             */
/*   study cutoff will be 12/31/xxxx                                                        */
/*   no unknown years for dx or last contact                                                */
/*   setup to handle max of (MAX_DATES - 1) tumors per patient, 49 by default               */
/*   date of last contact is the same for all records for a patient                         */
/*   works for cases with dx thru study cutoff date, not beyond                             */
%LET STUDYCUTOFFYEAR = 2010;
%LET STUDYCUTOFFMONTH = 12;
%LET STUDYCUTOFFDAY = 31;
%LET DAYS_IN_MONTH = (365.24/12);

/* Size of the per-patient date arrays: one slot per diagnosis plus one slot for the        */
/* date of last contact.                                                                    */
%LET MAX_DATES = 50;


/* ======================================================================================== */
/* Macro: assign_temp_dates                                                                 */
/*   Rewrites data set &ds, adding tmp_year / tmp_month / tmp_day: a sort-only diagnosis    */
/*   date that has no missing components.  &ds must already be sorted by state pat_id       */
/*   and then by seq_num in the direction the caller wants.                                 */
/*   ds=         data set to rewrite in place                                               */
/*   fill_month= month used when the month is unknown and nothing can be carried forward    */
/*   fill_day=   day used when the day is unknown and nothing can be carried forward        */
/* ======================================================================================== */
%macro assign_temp_dates(ds=, fill_month=, fill_day=);
  data &ds;
    set &ds;
    by state pat_id;
    retain tmp_year tmp_month tmp_day;

    if first.pat_id then do;
      tmp_year  = year_dx;
      tmp_month = month_dx;
      tmp_day   = day_dx;
      if tmp_month = . then tmp_month = &fill_month;
      if tmp_day   = . then tmp_day   = &fill_day;
    end;
    else do; /* not the first tumor for the person */
      if month_dx = . then do;
        if year_dx ^= tmp_year then do; /* dx year differs from the prior tumor */
          tmp_month = &fill_month;
          tmp_day   = &fill_day;
        end;
        /* same year as the prior record: keep tmp_month and tmp_day from the prior tumor */
      end;
      else if day_dx = . then do;
        if year_dx ^= tmp_year or month_dx ^= tmp_month then do;
          tmp_month = month_dx;
          tmp_day   = &fill_day; /* may be an impossible day such as 2/31; only used for sorting */
        end;
        /* same year and month as the prior record: keep tmp_month and tmp_day from prior */
      end;
      else do; /* no missing components */
        tmp_month = month_dx;
        tmp_day   = day_dx;
      end;
      tmp_year = year_dx; /* this will be used for the next record for the patient */
    end;
  run;
%mend assign_temp_dates;


/* ======================================================================================== */
/* Macro: init_date_arrays  (DATA step fragment)                                            */
/*   Declares the retained per-patient working arrays and blanks them on the first          */
/*   record of each patient.  Requires BY state pat_id in the calling step.                 */
/* ======================================================================================== */
%macro init_date_arrays;
  retain record_order
         missing1-missing&MAX_DATES
         year1-year&MAX_DATES
         month1-month&MAX_DATES
         day1-day&MAX_DATES;
  array missings(&MAX_DATES) missing1-missing&MAX_DATES;
  array years(&MAX_DATES)    year1-year&MAX_DATES;
  array months(&MAX_DATES)   month1-month&MAX_DATES;
  array days(&MAX_DATES)     day1-day&MAX_DATES;

  if first.pat_id then do;
    do i = 1 to &MAX_DATES;
      missings(i) = .;
      years(i)    = .;
      months(i)   = .;
      days(i)     = .;
    end;
  end;
%mend init_date_arrays;


/* ======================================================================================== */
/* Macro: fill_missing_dates  (DATA step fragment)                                          */
/*   Stores the current record's diagnosis date in slot record_order of the arrays.         */
/*   On the last record of the patient it appends the date of last contact in slot          */
/*   numrecs+1 and then assigns values to every missing month/day:                          */
/*     pass 1 - day missing, month known                                                    */
/*     pass 2 - month (and therefore day) missing                                           */
/*   Each missing value is placed in the middle of the window bounded by the previous       */
/*   date and the next known date that fall in the same month (pass 1) or year (pass 2).    */
/*   Reads : record_order year_dx month_dx day_dx year_lc month_lc day_lc                   */
/*   Writes: missing_dx, and on the last record numrecs, missing_lc and the arrays          */
/* ======================================================================================== */
%macro fill_missing_dates;
  years(record_order) = year_dx;
  if year_dx = . then years(record_order) = 9999; /* set to unrealistically high value, so we get */
                                                   /* negative survival time - all will be 9 filled */
  months(record_order) = month_dx;
  days(record_order)   = day_dx;
  if month_dx = . or day_dx = . then missing_dx = 1;
  else missing_dx = 0;
  missings(record_order) = missing_dx;

  if last.pat_id then do;
    numrecs = record_order;
    years(numrecs+1) = year_lc;
    if year_lc = . then years(numrecs+1) = 1900; /* set to unrealistically low value, so we get   */
                                                  /* negative survival time - all will be 9 filled */
    months(numrecs+1) = month_lc;
    days(numrecs+1)   = day_lc;
    if month_lc = . or day_lc = . then missing_lc = 1;
    else missing_lc = 0;
    missings(numrecs+1) = missing_lc;

    /* pass 1, fix any missing days when month is known */
    day_start_constraint = 1;
    do i = 1 to numrecs+1;
      if months(i) ^= . then do;
        if months(i) in (1,3,5,7,8,10,12) then number_days_in_month = 31;
        else if months(i) in (4,6,9,11) then number_days_in_month = 30;
        else do;
          /* Feb - get last day of Feb in current year by looking at day before March 1 */
          number_days_in_month = day(mdy(3,1,years(i)) - 1);
        end;
      end;
      day_end_constraint = number_days_in_month;
      bdone = 0;
      j = i+1;
      if i > 1 then do;
        if years(i) = years(i-1) and months(i) = months(i-1) then day_start_constraint = days(i-1);
        else day_start_constraint = 1;
      end;
      if months(i) ^= . and days(i) = . then do; /* missing day but not month */
        /* Look forward for the next known day in the same year and month.  The loop is     */
        /* bounded by numrecs+1 so that when slot i is the date of last contact nothing     */
        /* past the last used slot (or past the end of the arrays) is ever indexed.         */
        do while (bdone = 0 and j <= numrecs+1);
          if years(i) ^= years(j) or months(i) ^= months(j) then bdone = 1;
          else if days(j) ^= . then do;
            day_end_constraint = days(j);
            bdone = 1;
          end;
          j = j+1;
        end;
        days(i) = floor((day_start_constraint + day_end_constraint)/2);
      end;
    end;

    /* pass 2, fix any missing months (and days) - all dates with known month will now have */
    /* a complete date                                                                      */
    day_start_constraint   = 1;
    month_start_constraint = 1;
    do i = 1 to numrecs+1;
      day_end_constraint   = 31;
      month_end_constraint = 12;
      bdone = 0;
      j = i+1;
      if i > 1 then do;
        if years(i) = years(i-1) then do;
          day_start_constraint   = days(i-1);
          month_start_constraint = months(i-1);
        end;
        else do;
          day_start_constraint   = 1;
          month_start_constraint = 1;
        end;
      end;
      if months(i) = . then do;
        /* Look forward for the next known month in the same year (bounded as in pass 1). */
        do while (bdone = 0 and j <= numrecs+1);
          if years(i) ^= years(j) then bdone = 1;
          else if months(j) ^= . then do;
            day_end_constraint   = days(j);
            month_end_constraint = months(j);
            bdone = 1;
          end;
          j = j+1;
        end;
        tempstart = mdy(month_start_constraint, day_start_constraint, years(i));
        tempend   = mdy(month_end_constraint, day_end_constraint, years(i));
        newdate   = floor((tempstart+tempend)/2);
        months(i) = month(newdate);
        days(i)   = day(newdate);
      end;
    end;
  end;
%mend fill_missing_dates;


/* ======================================================================================== */
/* Macro: spread_fixed_dates  (DATA step fragment)                                          */
/*   Expects the data sorted by state pat_id DESCENDING record_order, so that the first     */
/*   record read for a patient is the one carrying numrecs and the fully fixed arrays.      */
/*   Copies those arrays into retained n_ arrays and then overwrites the dx and last        */
/*   contact fields of every record of the patient with the fixed values.                   */
/*   The working arrays are dropped from the output data set.                               */
/* ======================================================================================== */
%macro spread_fixed_dates;
  retain index_lc
         missing1-missing&MAX_DATES year1-year&MAX_DATES
         month1-month&MAX_DATES day1-day&MAX_DATES
         n_missing1-n_missing&MAX_DATES n_year1-n_year&MAX_DATES
         n_month1-n_month&MAX_DATES n_day1-n_day&MAX_DATES
         year_lc month_lc day_lc;
  drop   missing1-missing&MAX_DATES year1-year&MAX_DATES
         month1-month&MAX_DATES day1-day&MAX_DATES
         n_missing1-n_missing&MAX_DATES n_year1-n_year&MAX_DATES
         n_month1-n_month&MAX_DATES n_day1-n_day&MAX_DATES;
  array missings(&MAX_DATES)   missing1-missing&MAX_DATES;
  array years(&MAX_DATES)      year1-year&MAX_DATES;
  array months(&MAX_DATES)     month1-month&MAX_DATES;
  array days(&MAX_DATES)       day1-day&MAX_DATES;
  array n_missings(&MAX_DATES) n_missing1-n_missing&MAX_DATES;
  array n_years(&MAX_DATES)    n_year1-n_year&MAX_DATES;
  array n_months(&MAX_DATES)   n_month1-n_month&MAX_DATES;
  array n_days(&MAX_DATES)     n_day1-n_day&MAX_DATES;

  if first.pat_id then do;
    index_lc = numrecs + 1;
    do i = 1 to numrecs+1;
      n_missings(i) = missings(i);
      n_years(i)    = years(i);
      n_months(i)   = months(i);
      n_days(i)     = days(i);
    end;
  end;

  missing_dx = n_missings(record_order);
  year_dx    = n_years(record_order);
  month_dx   = n_months(record_order);
  day_dx     = n_days(record_order);
  missing_lc = n_missings(index_lc);
  year_lc    = n_years(index_lc);
  month_lc   = n_months(index_lc);
  day_lc     = n_days(index_lc);
%mend spread_fixed_dates;


/* First read in the data and sort it such that the records are in order of diagnosis.      */
/* Can't just use sequence number, because sequence numbers 60+ are non-federally           */
/* reportable and would all come after the federally reportable tumors regardless of date   */
/* of dx.  The sort we want is by date dx and then sequence number, so that if multiple     */
/* tumors have the same date of diagnosis, we will use sequence number as the tie-breaker   */
/* for the sort.  In the event of ties, a federally reportable tumor will occur before the  */
/* non-federally reportable tumor.                                                          */
data fullrec (keep = record order)
     fed     (drop = record)
     nonfed  (drop = record);
  /* TRUNCOVER: a record shorter than 3339 bytes yields missing values for the fields it    */
  /* does not reach, instead of INPUT flowing onto the next line and misaligning records.   */
  infile in lrecl=3339 truncover;
  input @   1 record   $char3339. /* buffer to read full "incidence" file record - NAACCR type "I" */
        @  42 pat_id   8.         /* NAACCRItemNumber 20   */
        @ 145 state    $char2.    /* NAACCRItemNumber 80   */
        @ 528 seq_num  2.         /* NAACCRItemNumber 380  */
        @ 530 year_dx  4.         /* NAACCRItemNumber 390  */
        @ 534 month_dx 2.         /* NAACCRItemNumber 390  */
        @ 536 day_dx   2.         /* NAACCRItemNumber 390  */
        @2116 year_lc  4.         /* NAACCRItemNumber 1750 */
        @2120 month_lc 2.         /* NAACCRItemNumber 1750 */
        @2122 day_lc   2.         /* NAACCRItemNumber 1750 */
        @2126 vit_stat 1.         /* NAACCRItemNumber 1760 */
        ;
  order = _n_; /* original position in the input file, used to restore the order at the end */

  /* if date of last contact is beyond study cut-off, set to study cut-off */
  if year_lc > &STUDYCUTOFFYEAR then do;
    year_lc  = &STUDYCUTOFFYEAR;
    month_lc = &STUDYCUTOFFMONTH;
    day_lc   = &STUDYCUTOFFDAY;
  end;

  if month_dx = . then day_dx = .; /* if month is missing, then treat day as missing even if it was not */
  if month_lc = . then day_lc = .;

  /* retain original values, needed for 2nd set of variables (presumed alive version) and   */
  /* useful for inspecting changes                                                          */
  o_month_dx = month_dx;
  o_month_lc = month_lc;
  o_day_dx   = day_dx;
  o_day_lc   = day_lc;
  o_year_lc  = year_lc;

  if seq_num < 60 or seq_num >= 98 then output fed;
  else output nonfed;
  output fullrec;
run;

/* To insert the sequence # 60+ tumors in with the sequence # < 60 tumors, need a temporary */
/* date variable to sort by that will have no unknown values.  The assigned value will just */
/* need to preserve the order and make the < 60s come before 60+ when tied or unknown order.*/
/* To accomplish this, when there is unknown month or day for a tumor, we will assign the   */
/* earliest possible date to keep the sequence # order for < 60, and the latest date for    */
/* 60+.  The earliest and latest are based on other tumors of that type for the patient.    */
/* E.g. sequence # 1  - dx 99/99/2000 - using 99 in example, but would really be blank      */
/*      sequence # 2  - dx 4/1/2000,                                                        */
/*      sequence # 60 - dx 99/99/2000                                                       */
/*      temp date for sequence # 1  = 1/1/2000 - need it to come before sequence # 2        */
/*      temp date for sequence # 60 = 12/31/2000 - no other 60+ tumors in 2000, so use 12/31*/

/* sort fed so we can assign min possible date based on prior tumor */
proc sort data = fed;
  by state pat_id seq_num;
run;

/* sort nonfed with descending seq_num so we can assign max possible date based on later tumor */
proc sort data = nonfed;
  by state pat_id DESCENDING seq_num;
run;

/* Assign temporary dates to any missing dates such that sorting by date would preserve the */
/* sequence number sort.  Federally reportable tumors use the earliest date (1/1).          */
%assign_temp_dates(ds=fed, fill_month=1, fill_day=1)

/* Same for NON-federally reportable tumors, which use the latest date (12/31). */
%assign_temp_dates(ds=nonfed, fill_month=12, fill_day=31)

data all;
  set fed nonfed;
run;

/*
proc freq data = all;
  tables month_dx day_dx month_lc day_lc;
  title "All records pre-fix";
run;
*/

proc sort data = all;
  by state pat_id tmp_year tmp_month tmp_day seq_num;
run;

/* Calc fields without using presumed alive.                                                */
/* Prior to calculating survival time, we need to assign non-missing values to all date     */
/* components that are missing (month or day of diagnosis or last contact).                 */
/* Code written to handle up to MAX_DATES dates (by default 49 diagnoses and the date of    */
/* last contact).                                                                           */
/* This data step sets up arrays of years, months, days, and missing value flags.           */
/* At the end of the data step, the last record for a patient will have dates from all      */
/* diagnoses and the date of last contact filled into the arrays and all missing            */
/* components will be assigned non-missing values based on the following algorithm.         */
/* If any dates have missing day, but known month, assign day.  Day is assigned to the      */
/* middle of the "possible" time window.  If no other dates are in the same month, then     */
/* the middle of the month is selected; if other dates are in the same month, day is        */
/* placed in the middle of the possible time period.  Middle is calculated as               */
/* floor((earliest possible day + latest possible day)/2).                                  */
/* Then it makes a second pass, assigning value to month and day when month is missing.     */
/* The same method of picking the middle of the time window is used.  If multiple days are  */
/* missing in the same month or months are missing in the same year, the earliest missing   */
/* value is resolved first.  E.g. dx = 12/99/2004, lc = 12/99/2004, assign                  */
/* day dx = 16 (floor((1+31)/2)), then assign day lc = 23 (floor((16+31)/2)).               */
data all;
  set all;
  by state pat_id;
  if first.pat_id then record_order = 0;
  %init_date_arrays
  record_order = record_order + 1; /* first record will be 1 */
  %fill_missing_dates
run;

/* Sort such that patients records are reversed.  This is because the last record for the   */
/* patient has all of the corrected information.                                            */
proc sort data = all;
  by state pat_id DESCENDING record_order;
run;

/* This data set retains information from last record for patient (with all fixed dates)    */
/* and assigns the fixed dates to the appropriate tumor record.  Then calculates the        */
/* survival months and flag for the NON presumed alive version of the fields.  Survival     */
/* months is calculated as (date of last contact-date of dx)/DaysInAMonth.                  */
/* DaysInAMonth = 365.24/12.                                                                */
data all;
  set all;
  by state pat_id;
  %spread_fixed_dates

  surv_days = mdy(month_lc, day_lc, year_lc) - mdy(month_dx, day_dx, year_dx);
  surv_mon  = floor(surv_days/&DAYS_IN_MONTH);

  if missing_dx = 1 or missing_lc = 1 then do;
    if year_dx = year_lc and
       (o_month_dx = o_month_lc or o_month_dx = . or o_month_lc = .) then do;
      surv_flag = 2; /* some unknown - could be 0 days */
    end;
    else surv_flag = 3; /* some unknown - can't be 0 days */
  end;
  else do; /* no missing values */
    if surv_days = 0 then surv_flag = 0; /* complete dates, 0 days */
    else surv_flag = 1;                  /* complete dates, not 0 days */
  end;
run;

/* recalc fields using presumed alive */
proc sort data = all;
  by state pat_id record_order;
run;

/* Next block of code is identical to prior, except all alive patients have date of last    */
/* contact set to study cut-off date.  First step is to assign the last contact for alive   */
/* and then reset all imputed values to missing.  Then the logic is the same as above.      */
data all;
  set all;
  by state pat_id;
  %init_date_arrays

  if vit_stat = 1 then do;
    year_lc  = &STUDYCUTOFFYEAR;
    month_lc = &STUDYCUTOFFMONTH;
    day_lc   = &STUDYCUTOFFDAY;
  end;
  else do;
    month_lc = o_month_lc;
    day_lc   = o_day_lc;
  end;
  month_dx = o_month_dx;
  day_dx   = o_day_dx;

  %fill_missing_dates
run;

proc sort data = all;
  by state pat_id DESCENDING record_order;
run;

/* Presumed alive version of the survival months and flag. */
data all;
  set all;
  by state pat_id;
  %spread_fixed_dates

  pa_surv_days = mdy(month_lc, day_lc, year_lc) - mdy(month_dx, day_dx, year_dx);
  pa_surv_mon  = floor(pa_surv_days/&DAYS_IN_MONTH);

  if missing_dx = 1 or missing_lc = 1 then do;
    if year_dx = year_lc and vit_stat ^= 1 and
       (o_month_dx = o_month_lc or o_month_dx = . or o_month_lc = .)
      then pa_surv_flag = 2; /* some unknown - could be 0 days */
    else if year_dx = year_lc and vit_stat = 1 and
       (o_month_dx = month_lc or o_month_dx = .)
      then pa_surv_flag = 2; /* some unknown - could be 0 days - don't check orig lc date for alive */
    else pa_surv_flag = 3;   /* some unknown - can't be 0 days */
  end;
  else do;
    if pa_surv_days = 0 then pa_surv_flag = 0; /* complete dates, 0 days */
    else pa_surv_flag = 1;                     /* complete dates, not 0 days */
  end;
run;

proc sort data = all;
  by state pat_id DESCENDING record_order;
run;

/* Fix issue where person could have one dx with some missing coded as "could be 0 days"    */
/* followed by a tumor that could not be zero days (with or without some missing) -         */
/* therefore the earlier tumor can't be 0 days.  Records are read latest tumor first, so    */
/* the retained flags describe the tumors that come after the current one.                  */
data all;
  set all;
  by state pat_id;
  retain bAny1or3Flags bAny1or3PAFlags;
  if first.pat_id then do;
    bAny1or3Flags   = 0;
    bAny1or3PAFlags = 0;
  end;
  if surv_flag in(1,3) then bAny1or3Flags = 1;
  if pa_surv_flag in(1,3) then bAny1or3PAFlags = 1;
  if surv_flag = 2 and bAny1or3Flags = 1 then surv_flag = 3;
  if pa_surv_flag = 2 and bAny1or3PAFlags = 1 then pa_surv_flag = 3;
run;

/* sort by original input order - as read from file */
proc sort data = all;
  by order;
run;

/* For inspecting assignment of values for missings */
/*
data single mult;
  set all;
  if numrecs = 1 then output single;
  else output mult;
run;

proc freq data = all;
  tables month_dx day_dx month_lc day_lc;
  title "all records - post fix";
run;

proc print data = single;
  where (year_dx = year_lc) and ((o_month_lc = . or o_month_dx = .) or (month_lc = month_dx));
  var month_dx day_dx year_dx month_lc day_lc year_lc o_year_lc o_month_dx o_month_lc
      surv_days surv_mon surv_flag pa_surv_days pa_surv_mon pa_surv_flag missing_dx missing_lc;
  title "single record - where year dx = year lc or either month was missing";
run;

proc print data = mult;
  var pat_id record_order month_dx day_dx year_dx month_lc day_lc year_lc o_year_lc
      o_month_dx o_month_lc surv_days surv_mon surv_flag pa_surv_mon pa_surv_flag;
  title "multiple records";
run;
*/

/* Write the output file: each original record with the 4 new fields overlaid. */
data _null_;
  merge all fullrec;
  by order;
  file out lrecl=3339 pad;

  /* put out full record and 4 new fields */
  if year_dx > &STUDYCUTOFFYEAR then do; /* dx beyond study cut-off (or unknown dx year, set to 9999) */
    surv_mon     = 9999;
    surv_flag    = 9;
    pa_surv_mon  = 9999;
    pa_surv_flag = 9;
  end;
  if surv_mon < 0 or surv_mon = . then do;
    surv_mon  = 9999;
    surv_flag = 9;
  end;
  if pa_surv_mon < 0 or pa_surv_mon = . then do;
    pa_surv_mon  = 9999;
    pa_surv_flag = 9;
  end;

  put @   1 record       $char3339.
      @2593 surv_mon     z4.
      @2597 surv_flag    1.
      @2598 pa_surv_mon  z4.
      @2602 pa_surv_flag 1.
      ;
run;