/* ex6_13.do example 6.13 ROC GLM model for Etzioni longitudinal PSA data (from h:\d\scrmeth\rocregr\psadata\rocrgp4c.do) last modification: 01 May 2003 */ version 7 set more off cap semt_profile cap log close clear set mem 40m ************************************* * program to fit an ROC-GLM model on a single bootstrap sample: capture program drop glmfit program define glmfit version 7 if "`1'"=="?" { global S_1 "alpha0 alpha1 beta1 beta2 beta3 beta4" exit } drop id egen int ndb = sum(d==0), by(type) *********** /* steps 1 & 2 : calculate So_hat = the empirical survivor fxn of the residuals from the linear model for age. Models estimated separatetly for each test type) */ /* a) fit the linear model wrt age for each test type and get residuals */ reg y age if type==0 & d==0 predict epsilon if type==0, resid /* include out-of-sample d==1 in residual calc. */ reg y age if type==1 & d==0 predict eps1 if type==1, resid replace epsilon = eps1 if type==1 drop eps1 /* b) calculate the empirical survivor distribution for the residuals: and, simultaneously, the corresponding placement values for the diseased obs */ gsort -epsilon, gen(negsort) qui bys type (negsort): gen sumdb = sum(d==0) qui bys type y (sumdb): gen t = sumdb[_N] qui replace t = t/ndb gen plval = t if d==1 drop sumdb negsort *********** /* STEP 3a : restructure (expand) data to include nt observations (no. of distinct t's) for each diseased observation, within each covariate level. */ drop if t==1 & d==0 bys d type t: drop if d==0 & _n > 1 /* restrict to non-diseased records with distinct fpr's */ egen nt = sum(d==0), by(type) gen uid = _n /* unique id for each record */ quietly bys d type (y): gen id_t =_n if d==0 /* id_t: unique id each t within test type */ expand nt if d==1 bys uid: replace id_t = _n if d==1 /* assigns a set of t's to each dis. obs. */ bys type id_t (d): replace t = t[1] drop if d==0 * interaction term: gen time_type = time*type *********** /* STEP 3b : calculate the binary placement value indicators */ gen byte u_it = (plval <= t) *********** /* STEP 4 : calculate h_s(t) */ gen phiinvt = invnorm(t) *********** /* STEP 5 : fit the GLM model */ probit u_it phiinvt type time time_type age post `1' (_b[_cons]) (_b[phiinvt]) (_b[type]) (_b[time]) (_b[time_type]) (_b[age]) end ************************************************************* /* SET UP DATA */ use ${semt_data}psa2b,clear gen rid = _n /* record id */ /* generate a separate record for each test type: */ expand 2 qui bys rid: gen byte type = _n - 1 qui bys type: gen y = -ln(fpsa/tpsa) if type == 0 qui bys type: replace y = ln(tpsa) if type==1 la var type "test type" la def typelbl 0 "-ln(f/t psa)" 1 "ln(total psa)" /* time to diagnosis is a disease-specific covariate. will model it as a positive measure (i.e. t_dx - t_sample ) */ assert t < 0 if d==1 gen time = -t if d==1 * center age at 50 yrs replace age = age - 50 keep id y d type age time compress _all set seed 156385 log using ${semt_log}ex6_13, replace set rmsg on bstrap glmfit, cluster(id) reps(5000) saving(${semt_log}ex6_13) replace set rmsg off qui log close