%macro adaboost6h(boost=yes, /* Controls whether boosting is performed or not. The */ /* default is to perform boosting (BOOST=YES). However, */ /* the program performs various different kinds of data */ /* manipulations, including replicate reduction, */ /* filtering, and scoring of input datasets with the */ /* linear predictor. If BOOST=NO is specified, then the */ /* data manipulations may be performed without actually */ /* performing the boosting. */ /* Input datasets */ case=case, /* Input dataset of peak values for case observations */ control=control, /* Input dataset of peak values for control observations */ test=, /* Name of input dataset containing test data for scoring */ /* Data restrictions */ V1_lb=0, /* Lower bound on Daltons for current analysis */ V1_ub=1E7, /* Upper bound on Daltons for current analysis */ /* Data manipulations prior to execution of boosting process */ rep_spectra=Yes, /* Does the data contain replicate spectra for some samples*/ rep_reduction=Yes, /* Reduce replicated sample spectra peak values to mean */ reduc_method=median, /* Statistic to compute for replicate reduction */ rep_keyfile=, /* Name of file which contains information about which */ /* spectra belong to a common sample */ rep_sample_name=samp_ID, /* Name of variable on REP_KEYFILE which identifies master */ /* sample IDs */ rep_spectra_name=spec_ID, /* Name of variable on REP_KEYFILE which identifies slave */ /* (spectrum) IDs belonging to a master sample ID */ filter=No, /* Use only peaks which have best marginal performance */ /* by AUC and partial AUC criteria */ cv_fpr=0.2, /* Critical value for false positive rate partial AUC */ cv_tpr=0.7, /* Critical value for true positive rate partial AUC */ auc_pct=10, /* Top percentage of total AUC criteria included by filter */ auc1_pct=10, /* Top percentage of partial AUC criteria defined on FPR */ /* included by filter */ auc2_pct=10, /* Top percentage of partial AUC criteria defined on TPR */ /* included by filter */ /* Boosting method control */ method=discrete, /* Perform Real or Discrete ADABoost */ iters=0, /* Number of iterations to perform */ criterion=AUC, /* Criterion for selecting best variable at m-th iteration */ /* Criteria for method=discrete are MIN_ERR, AUC, */ /* PARTIAL_AUC, and SENSSPEC. Criteria for method=real */ /* are AUC and LOG_LIKE. */ partial_criterion=SPEC, /* If criterion=PARTIAL_AUC, then define area in terms of */ /* SENSitivity or SPECificity. */ partial_critval=.2, /* Cutoff value for PARTIAL_CRITERION */ sens_wt=1, /* Sensitivity weight */ spec_wt=1, /* Specificity weight */ wgt_anal=YES, /* Initialize weights for cases and controls to represent */ /* equal numbers of records */ /* created datasets available for output */ linpred=linpred, /* Output dataset which allows construction of linear */ /* predictor in new data. Note that if BOOST=NO is */ /* specified, then the dataset named to macro variable */ /* LINPRED will be employed as an input dataset to be used */ /* for scoring case, control, and test datasets. */ case_out=, /* Name of output dataset containing case data just prior */ /* to invocation of the boosting procedure. If there has */ /* been a lot of data preparation involved (replicate */ /* reduction, filtering) which you would like to not have */ /* to repeat, then you may avoid repeating the work */ /* necessary for creating the data by outputting the data */ /* just prior to the boosting process iterations. */ control_out=, /* Name of output dataset containing control data just */ /* prior to invocation of the boosting procedure. */ test_out=, /* Name of output dataset containing test data just prior */ /* to test data scoring. */ case_score=, /* Result of linear predictor applied to case data */ control_score=, /* Result of linear predictor applied to control data */ test_score=, /* Result of linear predictor applied to test data */ cverr_out=, /* Observations in error in the cross-validation set at */ /* the minimum cross-validation error. Also included are */ /* the case/control status and the weights used to define */ /* cross-validation subsets. */ cvweights=, /* Output initial weight matrix for cross-validation step. */ /* Note that the weights are structured with weights for */ /* control observations presented first followed by */ /* weights for cases. */ /* Crossvalidation */ cv=yes, /* Examine cross-validation error rate and determine */ /* boosting stopping rule from CV */ seed=0, /* Seed randomly allocates 10% of cases and controls to */ /* each of 10 crossvalidation sets */ /* Print control */ print=yes /* If set to yes, then print distributions of AUC and */ /* partial AUC statistics for the complete case/control */ /* data set. Also print the linear predictor and training */ /* and cross-validation error rate statistics for each */ /* boosting iteration. */ ); /**************************************************************************************/ /* */ /* Macro ADAboost implements both real (boosting logistic regression) and discrete */ /* (boosting decision tree) boosting in a single macro. See papers by Yinsheng Qu */ /* (discrete boosting) Yutaka Yasui (real boosting) to understand methods and */ /* applications. */ /* */ /* */ /* DATA CONSTRUCTION ISSUES: */ /* Two input datasets are required, one input dataset containing observations for */ /* cases and the second input dataset containing observations for controls. These */ /* data sets follow a different construction than data which are typically employed */ /* in that rows represent variables and columns represent observations (with one */ /* exception as indicated below.) Because rows represent variables, the case and */ /* control data sets must have the same number of rows. However, the number of */ /* columns in the case and control data sets may differ according to the number of */ /* case and control records. */ /* */ /* As indicated above, columns of the case and control data sets generally represent */ /* observations. However, the first column is used to indicate the predictor */ /* variable. Because this program was developed for a problem in which peak */ /* intensities from mass-spectroscopy instruments are used to differentiate case */ /* and control records, the first column is assumed to indicate a position on an */ /* x-axis. Thus, the first column is assumed (required) to be numeric. */ /* */ /* Thus, the data structure for case data must be contructed as: */ /* */ /* predictor case record */ /* variable -------------------------------------------- */ /* indicator 1 2 3 ... n1 */ /* ________________________________________________________ */ /* 1 x1_1 x1_2 x1_3 ... x1_n1 */ /* 2 x2_1 x2_2 x2_3 ... x2_n1 */ /* 3 x3_1 x3_2 x3_3 ... x3_n1 */ /* ... ... ... ... ... ... */ /* k xk_1 xk_2 xk_3 ... xk_n1 */ /* */ /* */ /* Note that the predictor variable indicator must be row-specific. It is not */ /* allowed to have multiple rows which have the same value of the predictor variable */ /* indicator. However, it is not necessary that the predictor variable indicator be */ /* integer valued. Any unique real-value can be employed as the predictor variable */ /* indicator. */ /* */ /* Control data are constructed in the same way. However, the number of variables */ /* in the control data set can differ from the number of variables in the case data */ /* because variables (with the exception of the first variable in case and control */ /* data sets) represent observations. */ /* */ /* In addition to case and control data, you may input a test dataset which may be */ /* a mixture of cases and controls. The test dataset should be constructed just as */ /* the case and control data are constructed. The test dataset can be scored by the */ /* linear predictor. You may use the scored test dataset to make predictions about */ /* case/control status in the test data. */ /* */ /* Note that it is critical to the use of this macro to use only records with */ /* complete information be named. If there are missing values for any of your data, */ /* the macro will fail. Deletion of missing data can be handled either by dropping */ /* rows in which missing values occur or columns in which missing values occur. */ /* Dropping of rows in which missing values occur would be appropriate if a large */ /* number of spectra all have missing values for some single predictor value. */ /* Dropping columns which have missing values would be recommended if there are */ /* observations which have a large number of missing values. If the number of */ /* missing values is small, one might consider an imputation strategy so that */ /* complete data might be presented to the boosting macro. */ /* */ /* */ /* REPLICATE REDUCTION: */ /* In some studies, there may be replicate observations (columns of data) for some */ /* samples. If you would like to reduce the replicate samples to a single median */ /* or mean value, then there must exist a replicate key file which has at least two */ /* fields of information. One field names spectra in your data. The other field */ /* names samples in your data. When replicate reduction is specified, a median or */ /* mean is computed over all spectra which have the same sample identification. */ /* */ /* By default, the sample median value is employed to represent a sample spectrum */ /* value. The user can choose to use a sample mean value instead. When there are */ /* only two replicates per spectrum, the sample median and sample mean values are */ /* identical. */ /* */ /* If replicate reduction is not specified but a replicate key file is named and if */ /* cross-validation error rates are generated and reported, then all spectra from a */ /* single sample are randomized to the same cross-validation subset. By randomizing */ /* replicate spectra to the same cross-validation subset, we avoid optimism in the */ /* cross-validation error rate which would arise from using as in the evaluation */ /* data which are correlated with data which were used to construct the classifier. */ /* */ /* */ /* */ /* ADABOOST METHOD SPECIFICATION: */ /* We previously mentioned that this macro implements both Real and Discrete */ /* boosting methods. Discrete boosting is generally faster than Real boosting. */ /* Real boosting requires fitting logistic regression for each row of the input */ /* data. Because logistic regression is iterative, this can take some time. */ /* Discrete boosting, as implemented here, requires that the data be sorted on the */ /* row values each time a new row is selected. After this sort, we can compute all */ /* statistics necessary for the discrete boosting with minimal cost. We update our */ /* criterion values based on whether the i-th column is a case or control. */ /* Because Discrete boosting is more computationally efficient, we have made that */ /* the default. Real boosting can be selected by specifying */ /* */ /* method=real */ /* */ /* If Discrete boosting is selected, there a number of additional macro variables */ /* which exert program control. The first of these is macro variable CRITERION. */ /* Macro variable CRITERION can take on values MIN_ERR, AUC, SENSSPEC, or */ /* PARTIAL_AUC. MIN_ERR minimizes a total incorrect error condition (maximizes the */ /* total correct decisions). AUC selects at each boosting step the variable which */ /* maximizes area under the ROC curve. SENSSPEC maximizes the (weighted) sensitivity*/ /* and specificity function. (Note that if weights for both sensitivity and */ /* specificity are both 1 and if there are an equal number of true positive and true */ /* negative cases, then the SENSSPEC criterion is identical to the MIN_ERR */ /* criterion.) Finally, method=PARTIAL_AUC returns the element which maximizes a */ /* partial area under the ROC curve. The area can be defined in terms of either */ /* specificity (PARTIAL_CRITERION=SPEC) in which case the area under the curve from */ /* 1-specificity=0 (specificity=1) to 1-specificity=PARTIAL_CRITVAL is obtained, or */ /* in terms of sensitivity (PARTIAL_CRITERION=SENS) in which case the area under the */ /* curve from the specificity at which 1-sensitivity achieves the value */ /* PARTIAL_CRITVAL through 1-sensitivity=0 is obtained. These areas are illustrated */ /* below: */ /* */ /* PARTIAL_CRITERION=SPEC finds the variable which maximizes the shaded area shown */ /* in the following figure: */ /* */ /* SENS 1.0 | | |||||||| */ /* | | ||||||||||||||| */ /* | | |||||||||||||||||||||| */ /* | | ||||||||||| */ /* | |||| */ /* | ||| */ /* | |XX| */ /* | |XXX| */ /* | |XXXX| */ /* | |XXXXX| */ /* | |XXXXX| */ /* | |XXXXXX| */ /* | |XXXXXX| */ /* | |XXXXXX| */ /* | |XXXXXX| */ /* | |XXXXXX| */ /* | |XXXXXXX| */ /* | |XXXXXXX| */ /* | |XXXXXXX| */ /* | |XXXXXXXX| */ /* | |XXXXXXXX| */ /* ||XXXXXXXXX| */ /* 0.0 -----------|----------------------------------------------------------- */ /* 0 cv 1 */ /* 1-SPECIFICITY */ /* */ /* */ /* Alternatively, specifying PARTIAL_CRITERION=SENS finds the variable which */ /* maximizes the shaded area shown in the following figure: */ /* */ /* SENS 1.0 | |||||||| */ /* | |||||||||||||||xxxxxxxx */ /* | ||||||||||||||||||||||xxxxxxxxxxxxxxxxxxxxxxx */ /* | |||||||||||xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx */ /* | ||||xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx */ /* cv |--------||xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx */ /* | ||xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx */ /* | | |xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx */ /* | | |xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx */ /* | | |xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx */ /* | | |xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx */ /* | | |xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx */ /* | | |xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx */ /* | | |xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx */ /* | | |xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx */ /* | | |xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx */ /* | | |xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx */ /* | | |xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx */ /* | | |xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx */ /* | | |xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx */ /* | | |xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx */ /* || |xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx */ /* 0.0 ----------------------------------------------------------------------- */ /* 0 1 */ /* 1-SPECIFICITY */ /* */ /* */ /* Note that this is the area under the curve from the specificity at which a */ /* given level of sensitivity is achieved through specificity of 0. The critical */ /* value for the PARTIAL_CRITERION=SENS is defined in terms of the sensitivity level */ /* which must be achieved before we start accumulating the area. */ /* */ /* If one specifies AUC, PARTIAL_AUC, or SENSSPEC as the maximization function, then */ /* the function */ /* */ /* sensitivity*SENS_WT + specificity*SPEC_WT */ /* */ /* is used to determine the cutpoint for the decision rule. */ /* */ /* */ /* CUTPOINT SELECTION: DISCRETE BOOSTING */ /* With discrete boosting employing the MIN_ERR function criterion, the value of the */ /* predictor which maximizes correct predictions is selected as the cutpoint. If */ /* one of the AUC criteria is selected instead of the MIN_ERR criterion or if the */ /* criterion is specified to be SENSSPEC, then a weighted sum of sensitivity and */ /* specificity is employed. The default is to give equal weight to both sensitivity */ /* and specificity so that the predictor value which maximizes the sum of both */ /* sensitivity and specificity is selected. When there are equal numbers of cases */ /* and controls, then the cutpoint which maximizes sensitivity + specificity */ /* (hereafter referred to as the unweighted sum) is the same value which maximizes */ /* total correct predictions. However, when the number of cases and controls are */ /* different, the cutpoint which maximizes the unweighted sum may be different from */ /* the cutpoint which maximizes the number of correct predictions. It should be */ /* noted that the minimum error criterion is integer valued, and there may be */ /* multiple values of the predictor variable which minimize prediction errors */ /* (maximize correct predictions). With unequal numbers of cases and controls, the */ /* unweighted sum is generally not integer valued and is expected to have fewer ties */ /* for the best cutpoint. In this respect, the unweighted sum is preferable to a */ /* minimum error criterion. */ /* */ /* However, it should be observed that when the number of cases and control are not */ /* balanced, the cutpoint which maximizes the unweighted sum may result in a */ /* prediction error rate which is larger than 0.5. That is, a cutpoint may be */ /* selected which yields more incorrect decisions than correct decisions. If this */ /* occurs, then the boosting process cannot progress. Whichever variable maximized */ /* AUC (or partial AUC or SENSSPEC) will be picked up as the best variable for the */ /* next boosting iteration and all subsequent iterations. If the cutpoint is */ /* selected as the value of the predictor which results in the most correct */ /* predictions (minimizes MIN_ERR), then the error rate will always be <=0.5 and */ /* updated weights will allow a different variable to be selected as the best */ /* predictor for the next stump. */ /* */ /* If you employ method=DISCRETE along with criterion=(AUC, PARTIAL_AUC, SENSSPEC) */ /* and observe that the term ALPHA (provided in the SAS list file for each boosting */ /* iteration) becomes negative such that the boosting process quits updating the */ /* linear predictor, then you may wish to supply weights other than the default */ /* values of 1 for both sensitivity and specificity. If we have cutpoint */ /* */ /* cutpoint = x*I[x](max(sens_wt*sensitivity + spec_wt*specificity)) */ /* = x*I[x](max(sens_wt*(correct case predictions/total cases) + */ /* spec_wt*(correct control precict/total controls) */ /* */ /* indicating the value of x which maximizes the specified weighted sum, and the */ /* weights are constructed as sens_wt=(total cases) and spec_wt=(total controls), */ /* then the cutpoint which is selected will maximize the total correct predictions. */ /* Case and control frequencies are available as variables NCASE and NCONTROL. */ /* For the first iteration, these case and control frequencies will guarantee that */ /* the selected cutpoint has an error rate <=0.5. After the first iteration */ /* In this case, the error rate will always be <=0.5 and the boosting process will */ /* only stall if the error rate is exactly 0.5. Now, if you know the number of */ /* cases and controls beforehand, you can specify those values for sens_wt and */ /* spec_wt, respectively. However, if you don't know the number of cases and */ /* controls beforehand, then you can employ variables */ /* */ /* */ /* */ /* FILTERING: */ /* If there are a large number of potential peaks, boosting could take a very long */ /* time. The time that it takes to perform the boosting process can be reduced by */ /* filtering the data to a set of peaks which have initial optimal properties. */ /* optimality is determined by area under the curve and partial areas under the */ /* curve for each of the two regions as described above. By default, the critical */ /* value for the area defined in terms of 1-specificity is 0.2, while the critical */ /* value for the area defined in terms of minimum sensitivity is 0.7. Peaks which */ /* meet or exceed the 90th percentile on any one of these areas are selected for */ /* possible inclusion in the boosting model. All other peak values are dropped from */ /* possible consideration. */ /* */ /* */ /* */ /* CROSSVALIDATION: */ /* This macro employs crossvalidation to determine how many boosting iterations to */ /* perform. Ten sets of weights are initially constructed, with 10% of observations */ /* initially having weight 0 on the first column of weights, a different 10% having */ /* weight 0 on the second column of weights, etc., such that all observations are */ /* assigned weight 0 on one and only one weight column. Thus, we construct 10 */ /* crossvalidation sets in which 90% of the observations are used to construct the */ /* classifier and the remaining 10% are used to compute crossvalidation error rates. */ /* The crossvalidation error rate is computed across the 10 crossvalidation sets at */ /* each boosting iteration. When there is evidence that the crossvalidation error */ /* rate has been rising over the recent interval, then boosting is terminated. */ /* Among iterations which have the lowest crossvalidation error rate, the iteration */ /* which has the lowest training error rate is selected as the number of iterations */ /* to employ for the entire training dataset. Macro variable SEED is used to */ /* randomly allocate columns of the data to each of the 10 crossvalidation sets. */ /* In order for the boosting results to be replicable on the crossvalidation error */ /* criterion, it is necessary to assign a positive seed value, where seeds are */ /* integer valued having maximum (2**31) - 1. The default seed value (0) employs */ /* time of day at the invocation of the random number function to construct the */ /* seed stream. Since time of day is not replicable, it is strongly advised that */ /* the user specify their own seed value. */ /* */ /* */ /* */ /* DATA SET SCORING */ /* Data set scoring allows the user to construct the value of the linear predictor */ /* at the last boosting iteration for case and control data used to derive the */ /* linear predictor, as well as for a test dataset in which case/control status is */ /* to be predicted on the basis of the linear predictor. The scored case and */ /* control data are useful for identifying badly predicted case and control data for */ /* further investigation. A scored test dataset containing a mixture of cases and */ /* controls may be used to verify the predictive ability of the selected model */ /* against data not used in the construction of the linear predictor. Column names */ /* on the original input datasets (prefixed with M for mean or median if replicate */ /* reduction has been employed) become values of a variable named ID in the scored */ /* dataset. The value of the linear predictor is contained in a variable LinPred. */ /**************************************************************************************/ /*********************************************************************************************/ /*********************************************************************************************/ /* */ /* Utility macros which will be referenced several times over (with modifications coded */ /* through macro variable specifications) are listed first. */ /* */ /*********************************************************************************************/ /*********************************************************************************************/ /* Utility macro DS_VARS */ %macro ds_vars(indat=, ds_type=); /****************************************************************************/ /* Purpose: Contruct a new CASE, CONTROL, or TEST data set with replicate */ /* reduction applied. */ /* */ /* Inputs: */ /* INDAT Name of macro variable holding the name of the original CASE, */ /* CONTROL, or TEST data set */ /* DS_TYPE specifies whether the input data set named in INDAT is of type */ /* CASE, CONTROL, or TEST */ /* */ /* Ouptut: A data set named __CASE, __CONTROL, or __TEST as appropriate */ /****************************************************************************/ %global gfirstvar grep_spectra_name grep_sample_name greduc_method gDATA_OK; /* Note that the macro variable INDAT refers to a macro variable naming */ /* a data set. If we just resolve the value of INDAT, we return the */ /* name of the macro variable referred to. If we want to refer to the */ /* value of the macro variable that INDAT refers to, then we must */ /* resolve macro variable INDAT twice, the first time returning the */ /* name of a macro variable, the second time resolving the macro */ /* variable that we just found the name of. */ %let ds_type=%upcase(&ds_type); %if &ds_type=CASE %then %let OK_TEST=1; %else %if &ds_type=CONTROL %then %let OK_TEST=2; %else %if &ds_type=TEST %then %let OK_TEST=3; %let data_ok&OK_TEST=1; /* List names of variables in specified CASE, CONTROL, or TEST data set */ /* List is named &grep_spectra_name in output data set &ds_type._spectra */ proc contents data=&&indat(drop=&gfirstvar) out=&ds_type._spectra(keep=name rename=(name=&grep_spectra_name)) noprint; run; /* Sort the list by &grep_spectra_name */ proc sort data=&ds_type._spectra; by &grep_spectra_name; run; /* Merge data set with replicate key file by &grep_spectra_name */ data &ds_type._spectra; merge &ds_type._spectra(in=a) rep_keyfile(in=b); by &grep_spectra_name; if a; if ^b then do; /* Found spectra in INDAT data set which was not on rep_keyfile */ put &grep_spectra_name= " found in &DS_TYPE data set &INDAT but not coded in the replicate key file"; call symput("data_ok&OK_TEST", "0"); end; run; %let gdata_ok=%sysfunc(min(&gdata_ok, &&data_ok&OK_TEST)); %if &&data_ok&OK_TEST=0 %then %do; %put Spectra found in the &DS_TYPE data set &INDAT which are not coded in the replicate; %put key file &rep_keyfile. THE boosting macro will abort.; %end; %else %do; /* Construct the &ds_type data set with replicate reduction applied. */ /* First, sort by sample names so that we get all observations from */ /* each sample in sequence. */ proc sort data=&ds_type._spectra; by &grep_sample_name; run; /* Now we have a data set &ds_type._SPECTRA which is structured something as follows: */ /* Variables: &grep_sample_name (e.g., SAMPLE_ID) */ /* &grep_spectra_name (e.g., SPECTRUM_ID) */ /* Values: SAMPLE_ID SPECTRUM_ID */ /* EV0628 EV0628_1 */ /* EV0628 EV0628_2 */ /* SA291 SA291_1 */ /* SA291 SA291_2 */ /* ... ... */ /* */ /* From the data values, we construct a new &ds_type data set employing the following */ /* code: */ /* data __case; */ /* set &case; */ /* EV0628 = median(EV0628_1, EV0628_2); */ /* SA291 = median(SA291_1, SA291_2); */ /* ... */ /* keep mz EV0628 SA291; * Note that mz is the value of macro var &gfirstvar; */ /* run; */ /* */ /* This writing of code from values of the data is what is accomplished in this next */ /* code block. */ /* First, construct a list of replicate spectra names. The replicate spectra will */ /* not be saved in the output data set. We will only keep the spectra which */ /* are obtained when we apply the reduction operation. */ proc sql noprint; select &grep_spectra_name into: spectra_names separated by " " from &ds_type._spectra; quit; options xsync noxwait; proc printto log='________.___'; run; data _null_; set &ds_type._spectra end=lastrec; by &grep_sample_name; if _n_=1 then do; call execute("data __&ds_type.;"); call execute(" set &&indat;"); end; if first.&grep_sample_name then call execute(" " || &grep_sample_name || " = &greduc_method.(" || &grep_spectra_name); else call execute(", " || &grep_spectra_name); if last.&grep_sample_name then call execute(");"); if lastrec then do; call execute(" drop &spectra_names;"); call execute("run;"); end; run; proc printto log=log; run; x 'del ________.___'; proc sort data=__&ds_type; by &gfirstvar; run; %end; %mend ds_vars; /* Utility macro CVsplit_type */ %macro CVsplit_type1(indat=, ds_type=); /**************************************************************/ /* Purpose: Code employed to split a file containing */ /* independent spectra into 10 equally sized */ /* segments. If the number of spectra is not a */ /* multiple of 10, perfect equality in the number */ /* of spectra per segment cannot be obtained. */ /* However, the differences in number of spectra */ /* for will be only 1 between the largest and */ /* smallest segments. */ /* The 10 segments will be employed in constructing */ /* cross-validation results */ /* */ /* Inputs: */ /* INDAT The name of a macro variable which resolves to */ /* the name of a data set containing spectral data */ /* DS_TYPE Specifies whether the data set is a CASE or */ /* CONTROL data set */ /* */ /* Outputs: Data sets __CASE1, __CASE2, ..., __CASE10 for */ /* DS_TYPE=CASE. Similar constructions for CONTROL */ /* DS_TYPE specification. Each of the data sets is */ /* of approximately equal size. In addition to the */ /* 10 data sets, 10 data sets C_CASE1, C_CASE2, */ /* ..., C_CASE10 are created. These sets are the */ /* complements to CASE1, CASE2, ..., CASE10 which */ /* means that they contain all the data in __CASE */ /* except for the data __CASEi for C_CASEi. */ /**************************************************************/ %global grep_sample_name grep_spectra_name gfirstvar gseed; /* First, generate a list of spectra in the input data */ /* set named in INDAT. */ proc contents data=&indat(drop=&gfirstvar) out=specnames(keep=name rename=(name=specnames)) noprint; run; /* Next, construct a split of the DS_TYPE data set __&DSTYPE. */ /* Attach a random number to each row of spectrum names file. */ data split; retain seed &gseed; set specnames end=lastrec; call ranuni(seed, x); if lastrec then call symput("gseed", put(seed,32.)); run; /* Sort by the random number */ proc sort data=split; by x; run; /* For randomized spectra names, identify first 10% */ /* as group 1, next 10% as group 2, etc. */ data split; set split nobs=nobs; %do i=1 %to 10; %if &i^=1 %then else; if round(%eval(&i-1)*nobs/10) < _n_ <= round(&i*nobs/10) then group=&i; %end; run; /* Construct macro variables listing spectra in each group */ proc sql noprint; %do i=1 %to 10; select specnames into: group_&i separated by " " from split where group=&i; %end; quit; /* Split the data set __case into 10 groups where each group */ /* has only the variables associated with that group */ data %do i=1 %to 10; __&ds_type.&i.(keep=&gfirstvar &&group_&i)%end;; set __&ds_type.; %do i=1 %to 10; output __&ds_type.&i; %end; run; /* For each of the cross-validation data sets, construct a */ /* corresponding complement data set which sets data for */ /* all cross-validation sets except the one named. */ %do i=1 %to 10; data C_&ds_type.&i; merge %do j=1 %to 10; %if &j^=&i %then __&ds_type.&j; %end;; run; %end; %mend CVsplit_type1; /* Utility macro CVsplit_type2 */ %macro CVsplit_type2(indat=, ds_type=); /**************************************************************/ /* Purpose: The purpose is the same as for CVsplit_type1, */ /* but here we have non-independent spectra which */ /* must be randomized together in a single segment */ /* of the cross-validation data. */ /* */ /* Inputs: */ /* INDAT The name of a macro variable which resolves to */ /* the name of a data set containing spectral data */ /* DS_TYPE Specifies whether the data set is a CASE or */ /* CONTROL data set */ /* */ /* Outputs: Data sets __CASE1, __CASE2, ..., __CASE10 for */ /* DS_TYPE=CASE. Similar constructions for CONTROL */ /* DS_TYPE specification. Each of the data sets is */ /* of approximately equal size. In addition to the */ /* 10 data sets, 10 data sets C_CASE1, C_CASE2, */ /* ..., C_CASE10 are created. These sets are the */ /* complements to CASE1, CASE2, ..., CASE10 which */ /* means that they contain all the data in __CASE */ /* except for the data __CASEi for C_CASEi. */ /**************************************************************/ %global grep_sample_name grep_spectra_name gfirstvar gDATA_OK gseed; %let ds_type=%upcase(&ds_type); %if &ds_type=CASE %then %let OK_TEST=1; %else %if &ds_type=CONTROL %then %let OK_TEST=2; %let data_ok&OK_TEST=1; /* First, generate a list of spectra in the input data */ /* set named in INDAT. */ proc contents data=&&indat(drop=&gfirstvar) out=specnames(keep=name rename=(name=&grep_spectra_name)) noprint; run; /* Merge those spectra with the replicate key file to obtain */ /* a list of associated spectra. */ proc sort data=specnames; by &grep_spectra_name; run; data specnames; merge specnames(in=a) rep_keyfile(in=b); by &grep_spectra_name; if a; if ^b then call symput("data_ok&OK_TEST", "0"); run; %if &data_ok&OK_TEST=0 %then %do; %put Spectra found in the &dstype data set &&indat which are not coded in the replicate key file; %put &rep_keyfile. THE boosting macro will abort.; %end; %else %do; proc sort data=specnames out=specnames_samples nodupkey; by &grep_sample_name; run; data specnames_samples; retain seed &gseed; set specnames_samples end=lastrec; call ranuni(seed,x); if lastrec then call symput("gseed", put(seed,16.)); run; proc sort data=specnames_samples; by x; run; data specnames_samples; set specnames_samples nobs=nobs; %do i=1 %to 10; %if &i^=1 %then else; if round(%eval(&i-1)*nobs/10) < _n_ <= round(&i*nobs/10) then group=&i; %end; run; proc sort data=specnames_samples; by &grep_sample_name; run; data specnames; merge specnames(in=a) specnames_samples(in=b); by &grep_sample_name; run; proc sql noprint; %do i=1 %to 10; select &grep_spectra_name into: group_&i separated by " " from specnames where group=&i; %end; quit; /* Split the data set __&ds_type. into 10 groups where each */ /* group has only the variables associated with that group */ data %do i=1 %to 10; __&ds_type.&i.(keep=&gfirstvar &&group_&i)%end;; set __&ds_type.; %do i=1 %to 10; output __&ds_type.&i; %end; run; /* For each of the cross-validation data sets, construct a */ /* corresponding complement data set which sets data for */ /* all cross-validation sets except the one named. */ %do i=1 %to 10; data C_&ds_type.&i; merge %do j=1 %to 10; %if &j^=&i %then __&ds_type.&j; %end;; run; %end; %end; %let gDATA_OK=%sysfunc(min(&gDATA_OK, &data_ok&OK_TEST)); %mend CVsplit_type2; /* Utility macro FILTER */ %macro filter(ds_case=, ds_control=, ds_test1=, ds_test2=, cv=); /********************************************************************/ /* Purpose: Determine which rows of the input case and control */ /* data have sufficient marginal utility to be */ /* considered in the boosting classifier. */ /* */ /* If cross-validation is requested as a stopping */ /* rule, then the filter should be applied to each */ /* cross-validation set to be employed. */ /* */ /* Inputs: */ /* DS_CASE Names the data set containing case observations */ /* DS_CONTROL Names the data set containing control observations */ /* DS_TEST1 Names a data set that tests a classifier */ /* constructed from DS_CASE and DS_CONTROL data */ /* DS_TEST2 Names a second data set that tests a classifier */ /* constructed from DS_CASE and DS_CONTROL data */ /* CV Cross-validation set (CV=YES) or complete case/ */ /* control data set (CV=NO)? */ /* */ /* */ /* Two test data sets are required to accomodate the cross- */ /* validation data sets __case1 and __control1 where we do want to */ /* keep case and control information separately. We use only one */ /* test data set when there is a test data set consisting of both */ /* case and control information without label as to which spectrum */ /* is of which status. */ /********************************************************************/ %global gfirstvar; proc contents data=&ds_case(drop=&gfirstvar) out=case_names(keep=name) noprint; run; proc contents data=&ds_control(drop=&gfirstvar) out=control_names(keep=name) noprint; run; proc sql noprint; select name into: case_names separated by " " from case_names; select name into: control_names separated by " " from control_names; quit; data peak_stack / view=peak_stack; merge &ds_case(in=a) &ds_control(in=b); by &gfirstvar; array case_vars {*} &case_names; array cont_vars {*} &control_names; case_cont=1; case_n = dim(case_vars); cont_n = dim(cont_vars); do i=1 to case_n; %if &wgt_anal=YES %then %do; wgt = cont_n / case_n; %end; X = case_vars{i}; output; end; case_cont=0; do i=1 to cont_n; %if &wgt_anal=YES %then %do; wgt = 1; %end; X = cont_vars{i}; output; end; keep &gfirstvar case_cont X %if &wgt_anal=YES %then %do; wgt %end; ; run; *options mprint; /* Fit logistic regression, outputting ROC curve values */ options nonotes; ods listing close; ods output parameterestimates=parms; * ods output globaltests=LRtest(where=(Test='Likelihood Ratio')); proc logistic data=peak_stack descending; %if &wgt_anal=YES %then %do; weight wgt; %end; by &gfirstvar; model case_cont = X / outroc=roc roceps=1E-128; run; ods listing; options notes; /* Compute AUC and partial AUC where partial AUC areas */ /* are defined as the area to the left of (1-SPEC)<=c1 */ /* and area to the right of (1-SPEC) at point where */ /* SENS>c2. Currently, c1 and c2 are hard coded as */ /* 0.2 and 0.7, respectively. */ data auc; retain auc auc1 auc2 l_1mspec_ l_sensit_ area1 area2 _1mspec_at_pt7 max_sensspec cp_sensit cp_spec; set roc; by &gfirstvar descending _prob_; if first.&gfirstvar then do; area1=1; area2=0; l_1mspec_=0; l_sensit_=0; auc = 0; auc1 = 0; auc2 = 0; end; sensspec=_sensit_ - _1mspec_; max_sensspec = max(max_sensspec,sensspec); if sensspec=max_sensspec then do; cp_sensit = _sensit_; cp_spec = 1 - _1mspec_; end; auc + ((_sensit_+l_sensit_)/2)*(_1mspec_-l_1mspec_); if area1 then auc1 + ((_sensit_+l_sensit_)/2)*(_1mspec_-l_1mspec_); if area2 then auc2 + ((_sensit_+l_sensit_)/2)*(_1mspec_-l_1mspec_); if _1mspec_>=&cv_fpr & area1=1 then do; auc1 + -((_sensit_+l_sensit_)/2)*(_1mspec_-l_1mspec_); propdist = (&cv_fpr-l_1mspec_)/(_1mspec_-l_1mspec_); _sensit_at_pt2 = l_sensit_ + propdist*(_sensit_-l_sensit_); auc1 + ((_sensit_at_pt2+l_sensit_)/2)*(&cv_fpr-l_1mspec_); area1=0; end; if l_sensit_<=&cv_tpr & _sensit_>&cv_tpr then do; propdist = (&cv_tpr-l_sensit_)/(_sensit_-l_sensit_); _1mspec_at_pt7 = l_1mspec_ + propdist*(_1mspec_-l_1mspec_); auc2 + ((_sensit_+&cv_tpr)/2)*(_1mspec_-_1mspec_at_pt7); area2=1; end; l_1mspec_=_1mspec_; l_sensit_=_sensit_; if last.&gfirstvar then output; keep &gfirstvar auc auc1 auc2 _1mspec_at_pt7 /*cutpoint*/ max_sensspec cp_sensit cp_spec /*rule*/; label auc='Total area under the curve' auc1="Partial area under the curve to FPR=&cv_fpr" auc2="Partial AUC to the right of FPR at which TPR=&cv_tpr" _1mspec_at_pt7="FPR at TPR=&cv_tpr"; run; %if &print=YES & &cv=NO %then %do; title "Distributions of AUC and partial AUC values across all candidate predictors"; title2 "Case data set: &case Control data set: &control"; proc univariate data=auc plot; var auc auc1 auc2; run; %end; /* If we want the best J percent of one or more of these statistics, */ /* then determine the (100-J) percentiles of AUC, AUC1, and AUC2. */ proc sort data=auc(keep=auc) out=auc_filter; by descending auc; run; data auc_filter(rename=(auc=auc_filter)); if 0 then set auc_filter nobs=nobs; _filter=int(&auc_pct*nobs/100); if _filter>0 then set auc_filter(keep=auc) point=_filter; else auc=1.0; output; stop; run; proc sort data=auc(keep=auc1) out=auc1_filter; by descending auc1; run; data auc1_filter(rename=(auc1=auc1_filter)); if 0 then set auc1_filter nobs=nobs; _filter=int(&auc1_pct*nobs/100); if _filter>0 then set auc1_filter(keep=auc1) point=_filter; else auc1=1.0; output; stop; run; proc sort data=auc(keep=auc2) out=auc2_filter; by descending auc2; run; data auc2_filter(rename=(auc2=auc2_filter)); if 0 then set auc2_filter nobs=nobs; _filter=int(&auc2_pct*nobs/100); if _filter>0 then set auc2_filter(keep=auc2) point=_filter; else auc2=1.0; output; stop; run; /* Identify observations which are above the Jth */ /* percentile on any one of the AUC statistics. */ data auc; retain auc_filter auc1_filter auc2_filter; set auc; by &gfirstvar; if _n_=1 then do; set auc_filter; set auc1_filter; set auc2_filter; end; filter_auc = (auc>=auc_filter); filter_auc1 = (auc1>=auc1_filter); filter_auc2 = (auc2>=auc2_filter); filter = (sum(filter_auc, filter_auc1, filter_auc2)>=1); label filter_auc="AUC in top &auc_pct.%" filter_auc1="Partial AUC for high specificity in top &auc1_pct.%" filter_auc2="Partial AUC for high sensitivity in top &auc2_pct.%"; keep &gfirstvar filter filter_auc filter_auc1 filter_auc2; run; /* Reconstruct case and control datasets using */ /* only those points which meet the filter. */ data &ds_case; merge &ds_case auc(keep=filter &gfirstvar); by &gfirstvar; if filter; output; drop filter; run; data &ds_control; merge &ds_control auc(keep=filter &gfirstvar); by &gfirstvar; if filter; output; drop filter; run; %if &ds_test1^=%str() %then %do; data &ds_test1; merge &ds_test1 auc(keep=filter &gfirstvar); by &gfirstvar; if filter; output; drop filter; run; %end; %if &ds_test2^=%str() %then %do; data &ds_test2; merge &ds_test2 auc(keep=filter &gfirstvar); by &gfirstvar; if filter; output; drop filter; run; %end; %mend filter; /* Utility macro ERR */ %macro err(direction=); /* Need to describe */ do k=1 to ncol(cp_assign); j = cp_assign[,k]; %if &direction=INCREASING %then %do; predicted = (peakvals<=cp[,j]); %end; %else %do; predicted = (peakvals>=cp[,j]); %end; err[,j] = (predicted^=posrspns)`*peakwts[,j]; err[,j] = err[,j]/peakwts[+,j]; if err[,j]=0 then err[,j] = 1/(2*nvec[,j]); end; %mend err; %global gfirstvar grep_sample_name grep_spectra_name greduc_method gDATA_OK gseed; %let grep_spectra_name=&rep_spectra_name; %let grep_sample_name=&rep_sample_name; %let greduc_method=&reduc_method; %let gseed=&seed; %local i j k l; %let boost=%upcase(&boost); %let rep_spectra=%upcase(&rep_spectra); %let rep_reduction=%upcase(&rep_reduction); %let reduc_method=%upcase(&reduc_method); %let method=%upcase(&method); %let criterion=%upcase(&criterion); %let partial_criterion=%upcase(&partial_criterion); %let wgt_anal=%upcase(&wgt_anal); %let filter=%upcase(&filter); %let cv=%upcase(&cv); %let print=%upcase(&print); %if &rep_reduction=YES %then %do; %if &reduc_method^=MEAN & &reduc_method^=MEDIAN %then %do; %put If you wish to perform replicate reduction, then you must specify a valid; %put reduction method, either MEAN or MEDIAN. The boosting macro will abort.; %goto stop_mac; %end; %end; %if &cv=NO & &iters=0 %then %let iters=100; /* Initial data integrity tests */ /* Determine existence of case and control data sets as well */ /* as test data set if a test data set has been named. */ /* If the data do not exist, write a message and abort. */ %let DATA_OK=1; %if %sysfunc(exist(&case))=0 %then %do; %put The case data set &CASE does not exist. The boosting macro will abort.; %let DATA_OK=0; %end; %if %sysfunc(exist(&control))=0 %then %do; %put The control data set &CONTROL does not exist. The boosting macro will abort.; %let DATA_OK=0; %end; %if &test^=%str() %then %do; %if %sysfunc(exist(&test))=0 %then %do; %put The test data set &TEST does not exist. The boosting macro will abort.; %let DATA_OK=0; %end; %end; %if &DATA_OK=0 %then %goto stop_mac; /* Identify the name of the variable in */ /* the first column of the case data. */ proc contents data=&case out=varnames(keep=name varnum) noprint; run; proc sort data=varnames; by varnum; run; data _null_; i=1; set varnames point=i; call symput("firstvar", name); stop; run; /* Create global version of FIRSTVAR. Global */ /* version is accessible to other macros. */ %let gfirstvar=&firstvar; /* Verify that the name of the first variable in control */ /* and test data sets are the same as for the case data. */ proc contents data=&control out=varnames(keep=name varnum) noprint; run; proc sort data=varnames; by varnum; run; data _null_; i=1; set varnames point=i; call symput("firstvar_cont", name); stop; run; %if &firstvar_cont^=&firstvar %then %do; %put The first variable in the case and control data sets should have exactly the same; %put interpretation. The name of the first variable in the case data set %upcase(&case) is; %put %upcase(&firstvar) while the name of the first variable in the control data set %upcase(&control); %put is %upcase(&firstvar_cont). Please construct your data so that the first column in each data; %put set is identical and use a consistent name for this first column.; %put ; %put The boosting macro has been aborted.; %let DATA_OK=0; %end; %if &test^=%str() %then %do; proc contents data=&test out=varnames(keep=name varnum) noprint; run; proc sort data=varnames; by varnum; run; data _null_; i=1; set varnames point=i; call symput("firstvar_test", name); stop; run; %if &firstvar_test^=&firstvar %then %do; %put The first variable in the case and test data sets should have exactly the same; %put interpretation. The name of the first variable in the case data set %upcase(&case) is; %put %upcase(&firstvar) while the name of the first variable in the test data set %upcase(&test); %put is %upcase(&firstvar_cont). Please construct your data so that the first column in each data; %put set is identical and use a consistent name for this first column.; %put ; %put The boosting macro has been aborted.; %let DATA_OK=0; %end; %end; %if &DATA_OK=0 %then %goto stop_mac; /* Now check that the number of records in the input */ /* data sets CASE, CONTROL, and TEST are the same. */ data _null_; if 0 then set &case nobs=nobs; call symput("Nobs_case", put(nobs, 10.)); stop; run; data _null_; if 0 then set &control nobs=nobs; call symput("Nobs_cont", put(nobs, 10.)); stop; run; %if &Nobs_case^=&Nobs_cont %then %do; %put The number of records in the input data sets represents the number of candidate variables for; %put classifier construction. Therefore, the number of records in the case data set %upcase(&case) and; %put the control data set %upcase(&control) must match. There are %sysfunc(compress(&NOBS_CASE)) records in the case data set; %put and %sysfunc(compress(&NOBS_CONT)) records in the control data set. The boosting macro will abort.; %let DATA_OK=0; %end; %if &test^=%str() %then %do; data _null_; if 0 then set &test nobs=nobs; call symput("Nobs_test", put(nobs, 10.)); stop; run; %if &Nobs_case^=&Nobs_cont %then %do; %put The number of records in the input data sets represents the number of candidate variables for; %put classifier construction. Therefore, the number of records in the case data set %upcase(&case) and; %put the test data set %upcase(&test) must match. There are %sysfunc(compress(&NOBS_CASE)) records in the case data set; %put and %sysfunc(compress(&NOBS_test)) records in the test data set. The boosting macro will abort.; %let DATA_OK=0; %end; %end; %if &DATA_OK=0 %then %goto stop_mac; /* Determine whether the user indicated that there are replicate spectra */ /* in the data. If there are replicate spectra and replicate reduction */ /* is to be conducted, construct new CASE, CONTROL, and TEST data set */ /* with mean or median values as required. */ %if &rep_spectra=YES & &rep_reduction=YES %then %do; %if &rep_keyfile=%str() %then %do; %put Replicate reduction is specified by the macro variable specification; %put REP_SPECTRA=YES. However, no replicate key file was named. You must; %put specify a replicate key file in order to perform replicate reduction.; %put The boosting macro will abort.; %let DATA_OK=0; %end; %else %do; /* First determine existence of replicate key file. If */ /* it does not exist, then write a message and abort. */ %if %sysfunc(exist(&rep_keyfile))=0 %then %do; %put The replicate key file &rep_keyfile does not exist. The boosting macro will abort.; %let DATA_OK=0; %end; %else %do; /* Replicate key file exists. Now, check existence of */ /* replicate sample and spectra names on REP_KEYFILE */ %let dsid=%sysfunc(open(&rep_keyfile,i)); %let varnum=%sysfunc(varnum(&dsid,&rep_sample_name)); %if &varnum=0 %then %do; %put The sample name variable &rep_sample_name does not exist in the replicate key file.; %put The boosting macro will abort.; %let DATA_OK=0; %end; %let varnum=%sysfunc(varnum(&dsid, &rep_spectra_name)); %if &varnum=0 %then %do; %put The spectrum name variable &rep_spectra_name does not exist in the replicate key file.; %put The boosting macro will abort.; %let DATA_OK=0; %end; %let rc=(close(&dsid)); %if &DATA_OK=0 %then %goto stop_mac; /* Sample and spectra name variables exist on REP_KEYFILE. */ /* Construct a list of spectra names from CASE, CONTROL and */ /* the TEST data set (if TEST was named) and identify which */ /* spectra in REP_SPECTRA_NAMES belong to which data sets. */ proc sort data=&rep_keyfile out=rep_keyfile; by &rep_spectra_name; run; %let gDATA_OK=&DATA_OK; %ds_vars(indat=&case, ds_type=case) %let DATA_OK=&gDATA_OK; %ds_vars(indat=&control, ds_type=control) %let DATA_OK=&gDATA_OK; %if &test^=%str() %then %do; %ds_vars(indat=&test, ds_type=test) %let DATA_OK=&gDATA_OK; %end; %end; %end; %end; %if &DATA_OK=0 %then %goto stop_mac; /* If replicate reduction was not done, construct a local copy of the */ /* CASE, CONTROL, and TEST data sets named __CASE, __CONTROL, and __TEST. */ %if &rep_reduction=NO %then %do; proc sort data=&case out=__case; by &firstvar; run; proc sort data=&control out=__control; by &firstvar; run; %if &test^=%str() %then %do; proc sort data=&test out=__test; by &firstvar; run; %end; %end; /* If we are to use a cross-validation stopping rule, then */ /* construct partions of the CASE and CONTROL data sets */ /* to use for the 10-fold cross-validation sample sets. */ %if &cv=YES %then %do; %if &rep_reduction=YES %then %do; /* Split the already existing case data __CASE */ %CVsplit_type1(indat=__case, ds_type=case) /* Split the already existing control data __CONTROL */ %CVsplit_type1(indat=__control, ds_type=control) %end; /* Alternative constructions if replicate reduction is turned off */ %else %if &rep_reduction=NO %then %do; %if &rep_keyfile=%str() %then %do; /****************************************************************/ /* Case 1: There is no replicate key file, so all spectra */ /* are treated as completely independent. */ /****************************************************************/ /* Now invoke the cross-validation split for independent type data */ %CVsplit_type1(indat=__case, ds_type=case) %CVsplit_type1(indat=__control, ds_type=control) %end; %else %do; /****************************************************************/ /* Case 2: There is a replicate key file, and replicates */ /* must be randomized together. */ /****************************************************************/ /* We will merge spectra names from case and control data sets */ /* with spectra names on the replicate key file. Sort the */ /* replicate key file first for merges with case and control */ /* data sets at a later time. */ proc sort data=&rep_keyfile out=rep_keyfile; by &rep_spectra_name; run; %CVsplit_type2(indat=&case, ds_type=case) %CVsplit_type2(indat=&control, ds_type=control) %end; %end; %end; /* If peak filtering has been requested, then we apply those peak filters now */ %if &filter=YES %then %do; /**************************************************************/ /* */ /* First step in constructing filters is to rewrite the data */ /* such that */ /* 1) Case and control data are combined in a single set */ /* with an indicator of case/control status */ /* 2) Columns of the original input data (which represent */ /* spectra or subjects) are written as rows in the data */ /* set passed to the logistic procedure. */ /* */ /* That is, our original data are as follows: */ /* */ /* Case data set Control data set */ /* -------------------------- --------------------------- */ /* m/z case1 case2 ... caseK m/z cont1 cont2 ... contL */ /* mz1 ca1_1 ca2_1 caK_1 mz1 co1_1 co2_1 coL_1 */ /* mz2 ca1_2 ca2_2 caK_2 mz2 co1_2 co2_2 coL_2 */ /* ... */ /* mzN ca1_N ca2_N caK_N mzN co1_N co2_N coL_N */ /* */ /* */ /* These data need to be rewritten as */ /* */ /* m/z case_cont X */ /* mz1 1 ca1_1 */ /* mz1 1 ca2_1 */ /* mz1 ... ... */ /* mz1 1 caK_1 */ /* mz1 0 co1_1 */ /* mz1 0 co2_1 */ /* mz1 ... ... */ /* mz1 0 coL_1 */ /* mz2 1 ca1_2 */ /* mz2 1 ca2_2 */ /* mz2 ... ... */ /* mz2 1 caK_2 */ /* mz2 0 co1_2 */ /* mz2 0 co2_2 */ /* mz2 ... ... */ /* mz2 0 coL_2 */ /* ... ... ... */ /* mzN 1 ca1_N */ /* mzN 1 ca2_N */ /* mzN ... ... */ /* mzN 1 caK_N */ /* mzN 0 co1_N */ /* mzN 0 co2_N */ /* mzN ... ... */ /* mzN 0 coL_N */ /* */ /* */ /* Thus, we just need to write out each row of the data with */ /* an output record for each column of the original case and */ /* control data sets. We also need to write an indicator */ /* variable which records whether the column was a case or */ /* control data set. For that, we need the names of all the */ /* columns in a particular data set. */ /**************************************************************/ %if &test=%str() %then %do; %filter(ds_case=__case, ds_control=__control, cv=NO); %end; %else %do; %filter(ds_case=__case, ds_control=__control, ds_test1=__test, cv=NO); %end; %if &CV=YES %then %do; %do i=1 %to 10; %filter(ds_case=C_case&i, ds_control=C_control&i, ds_test1=__case&i, ds_test2=__control&i, cv=yes); %end; %end; %end; /* Write out the data obtained from applying any data */ /* reduction and filtering step, if requested. */ %if &case_out^=%str() %then %do; data &case_out; set __case; run; %end; %if &control_out^=%str() %then %do; data &control_out; set __control; run; %end; %if &test_out^=%str() %then %do; data &test_out; set __test; run; %end; *options mprint; /* Construct the list of case, control, and test spectra. */ /* Only these variables will be read into the design. */ proc contents data=__case(drop=&firstvar) out=vars_case(keep=name varnum) noprint; run; proc sort data=vars_case; by varnum; run; data _null_; set vars_case end=lastrec; call symput("Case_name"||put(_n_,5.-L), name); if lastrec then call symput("NCase_names", left(put(_n_,16.))); run; proc contents data=__control(drop=&firstvar) out=vars_control(keep=name varnum) noprint; run; proc sort data=vars_control; by varnum; run; data _null_; set vars_control end=lastrec; call symput("Cont_name"||put(_n_,5.-L), name); if lastrec then call symput("NControl_names", left(put(_n_,16.))); run; %if &test^=%str() %then %do; proc contents data=__test(drop=&firstvar) out=vars_test(keep=name varnum) noprint; run; proc sort data=vars_test; by varnum; run; data _null_; set vars_test end=lastrec; call symput("test_name"||put(_n_,5.-L), name); if lastrec then call symput("NTest_names", left(put(_n_,16.))); run; %end; %if &cv=YES %then %do; %do i=1 %to 10; proc contents data=__case&i(drop=&firstvar) out=vars__case&i.(keep=name varnum) noprint; run; proc sort data=vars__case&i; by varnum; run; proc contents data=C_case&i(drop=&firstvar) out=varsC_case&i(keep=name varnum) noprint; run; proc sort data=varsC_case&i; by varnum; run; proc contents data=__control&i(drop=&firstvar) out=vars__control&i.(keep=name varnum) noprint; run; proc sort data=vars__control&i; by varnum; run; proc contents data=C_control&i(drop=&firstvar) out=varsC_control&i(keep=name varnum) noprint; run; proc sort data=varsC_control&i; by varnum; run; %end; %end; proc sql noprint; select name into: vars_case separated by " " from vars_case; select name into: vars_control separated by " " from vars_control; %if &test^=%str() %then %do; select name into: vars_test separated by " " from vars_test; %end; %if &cv=YES %then %do; %do i=1 %to 10; select name into: vars__case&i separated by " " from vars__case&i; select name into: varsC_case&i separated by " " from varsC_case&i; %end; %end; %if &cv=YES %then %do; %do i=1 %to 10; select name into: vars__control&i separated by " " from vars__control&i; select name into: varsC_control&i separated by " " from varsC_control&i; %end; %end; quit; data _null_; if 0 then set __case nobs=nobs; call symput("NCandidates", put(nobs,10.)); stop; run; %if &method=DISCRETE %then %do; %if &criterion=AUC %then title "Boosting method: decision tree maximizing total AUC"; %else %if &criterion=PARTIAL_AUC & &partial_criterion=SPEC %then title "Boosting method: decision tree maximizing partial AUC to FPR &partial_critval"; %else %if &criterion=PARTIAL_AUC & &partial_criterion=SENS %then title "Boosting method: decision tree maximizing partial AUC after TPR=&partial_critval"; %else %if &criterion=MIN_ERR | &criterion=MINERR %then title "Boosting method: decision tree maximizing total correctly predicted"; %else %if &criterion=SENSSPEC %then title "Boosting method: decision tree maximizing sensitivity + specificity"; ; %end; %else %do; title "Boosting method: logistic regression"; %end; title2 "Case data set: &case Control data set: &control"; title3 "Number of candidate variables to employ in classifier: &NCandidates"; %if &test^=%str() & &test_out^=%str() %then %do; title4 "Linear predictor for test data &test written to data set &test_out"; %end; %if &filter=YES %then %do; title5 "Total AUC filtering: best &auc_pct percent "; title6 "Partial AUC filtering(1): best &auc1_pct percent of area below FPR=&cv_fpr "; title7 "Partial AUC filtering(2): best &auc2_pct percent of area above FPR at TPR=&cv_tpr"; %end; %if &wgt_anal=YES %then %do; %if &filter=YES %then %do; title8 "Weighted to achieve equal case/control population frequency"; %end; %else %do; title5 "Weighted to achieve equal case/control population frequency"; %end; %end; %if &boost=YES %then %do; proc iml; /************************************************/ /* */ /* Code to execute if our objective function is */ /* the (weighted) maximum correct predictions. */ /* Need to consider predictions under rules */ /* */ /* PREDi = (x>=Xi) */ /* PREDi = (x<=Xi) */ /* */ /************************************************/ start min_err; /* Compute minimum error rate */ correct_inc = (yx[,1]=0)`*peakwts; correct_dec = (yx[,1]=1)`*peakwts; max_correct_dec = correct_dec; max_correct_inc = correct_inc; prev_wts = zero_vec; prev_y = 0; cp_inc = missing; cp_dec = missing; do i=1 to nobs; correct_dec = correct_dec + prev_wts*((prev_y=0) - (prev_y=1)); correct_inc = correct_inc + peakwts[i,]*((posrspns[i,]=1) - (posrspns[i,]=0)); prev_wts = peakwts[i,]; prev_y = posrspns[i,]; max_correct_dec = (max_correct_dec // correct_dec)[<>,]; max_correct_inc = (max_correct_inc // correct_inc)[<>,]; cp_assign = loc((correct_dec=max_correct_dec)#(peakwts[i,]>0)); if nrow(cp_assign)>0 then do; cp_dec[,cp_assign] = peakvals[i,]; end; cp_assign = loc((correct_inc=max_correct_inc)#(peakwts[i,]>0)); if nrow(cp_assign)>0 then do; cp_inc[,cp_assign] = peakvals[i,]; end; end; finish; start auc_inc; auc_inc = auc_inc + ((sens+sens_last)/2)#(spec_last - spec); %if &criterion=PARTIAL_AUC %then %do; %if &partial_criterion=SPEC %then %do; lpart_area = loc(part_area); if ncol(lpart_area)>0 then do; part_auc_inc[,lpart_area] = auc_inc[,lpart_area]; lpart_area_off = loc((1-spec)>&partial_critval.); if ncol(lpart_area_off)>0 then do; part_area[,lpart_area_off]=0; do k=1 to ncol(wts); if spec_last[,k]=spec[,k] then propdist[,k]=0; else propdist[,k] = ((&partial_critval.-(1-spec[,k]))/(spec_last[,k] - spec[,k])); end; sens_at_pcv = (sens_last + propdist#(sens - sens_last)); part_auc_inc[,lpart_area_off] = ( part_auc_inc - ((sens+sens_last)/2)#(spec_last - spec) + ((sens_at_pcv+sens_last)/2)#(&partial_critval.-(1-spec_last)) )[,lpart_area_off]; end; end; %end; %else %do; lpart_area = loc(part_area); if ncol(lpart_area)>0 then do; part_auc_inc[,lpart_area] = (part_auc_inc + ((sens+sens_last)/2)#(spec_last - spec)) [,lpart_area]; end; if ncol(lpart_area)&partial_critval.)); part_area[,assign_part_area]=1; do k=1 to ncol(wts); if sens_last[,k]=sens[,k] then propdist[,k]=0; else propdist[,k] = ((&partial_critval.-(1-sens[,k]))/(sens_last[,k] - sens[,k])); end; _1mspec_at_pcv = (1-spec_inc_last) + propdist#(spec_inc_last-spec); part_auc_inc[,assign_part_area] = ( part_auc_inc + ((sens+&partial_critval.)/2)#((1-spec)-_1mspec_at_pcv) )[,assign_part_area]; end; %end; %end; sens_last = sens; spec_last = spec; finish; start auc_dec; auc_dec = auc_dec + ((sens+sens_last)/2)#(spec_last - spec); %if &criterion=PARTIAL_AUC %then %do; %if &partial_criterion=SPEC %then %do; lpart_area = loc(part_area); if ncol(lpart_area)>0 then do; part_auc_dec[,lpart_area] = auc_dec[,lpart_area]; lpart_area_off = loc((1-spec)>&partial_critval.); if ncol(lpart_area_off)>0 then do; part_area[,lpart_area_off]=0; do k=1 to ncol(wts); if spec_last[,k]=spec[,k] then propdist[,k]=0; else propdist[,k] = ((&partial_critval.-(1-spec[,k]))/(spec_last[,k] - spec[,k])); end; sens_at_pcv = (sens_last + propdist#(sens - sens_last)); part_auc_dec[,lpart_area_off] = ( part_auc_dec - ((sens+sens_last)/2)#(spec_last - spec) + ((sens_at_pcv+sens_last)/2)#(&partial_critval.-(1-spec_last)) )[,lpart_area_off]; end; end; %end; %else %do; lpart_area = loc(part_area); if ncol(lpart_area)>0 then do; part_auc_dec[,lpart_area] = (part_auc_dec + ((sens+sens_last)/2)#(spec_last - spec)) [,lpart_area]; end; if ncol(lpart_area)&partial_critval.)); part_area[,assign_part_area]=1; do k=1 to ncol(wts); if sens_last[,k]=sens[,k] then propdist[,k]=0; else propdist[,k] = ((&partial_critval.-(1-sens[,k]))/(sens_last[,k] - sens[,k])); end; _1mspec_at_pcv = (1-spec_dec_last) + propdist#(spec_dec_last-spec); part_auc_dec[,assign_part_area] = ( part_auc_dec + ((sens+&partial_critval.)/2)#((1-spec)-_1mspec_at_pcv) )[,assign_part_area]; end; %end; %end; sens_last = sens; spec_last = spec; finish; /**************************************************************/ /* */ /* Code to execute if our objective function is the */ /* (weighted) a function of sensitivity and specificity. */ /* These functions include: */ /* */ /* Total area under the curve (AUC) */ /* Partial area under the curve (PARTIAL_AUC) */ /* Weighted sum of sensitivity and specificity (SENSSPEC) */ /* */ /* Partial area under the curve can be specified as the */ /* area under the curve for 1-specificity between 0 and */ /* partial area criterion value (PARTIAL_CRITVAL) where */ /* 0<=PARTIAL_CRITVAL)<1. Alternatively, partial area */ /* under the curve can be specified as the area from the */ /* point at which sensitivity achieves the value */ /* PARTIAL_CRITVAL to sensitivity=1. The macro variable */ /* PARTIAL_CRITERION determines whether the partial area is */ /* defined in terms of 1-specificity (PARTIAL_CRITERION=SPEC) */ /* or minimum sensitivity (PARTIAL_CRITERION=SENS). */ /* */ /* Need to consider predictions under rules */ /* */ /* PREDi = (x>=Xi) */ /* PREDi = (x<=Xi) */ /* */ /**************************************************************/ start auc; /* Compute denominators for sensitivity and specificity */ tot_pos = posrspns`*peakwts; tot_neg = (1-posrspns)`*peakwts; /* Initialization for computing sensitivity and specificity */ /* for prediction rule PREDi = (x<=Xi) */ pos = zero_vec; neg = zero_vec; max_sensspec_inc = zero_vec; max_func_inc = zero_vec; cp_inc = missing; propdist = zero_vec; sens_last = zero_vec; spec_last = J(1,ncol(wts),1); %if &criterion=AUC %then %do; auc_inc = zero_vec; %end; %else %if &criterion=PARTIAL_AUC %then %do; auc_inc = zero_vec; part_auc_inc = zero_vec; %if &partial_criterion=SPEC %then %str(part_area = J(1,ncol(wts),1);); %else %str(part_area = zero_vec;); %end; max_sensspec_inc = spec_last; cp_inc = J(1,ncol(wts), peakvals[1,]); do i=1 to nobs; /* Compute sensitivity and specificity at Xi */ pos = pos + posrspns[i,]#(peakwts[i,]); neg = neg + (1-posrspns[i,])#(peakwts[i,]); sens = pos/tot_pos; spec = 1 - neg/tot_neg; /* Always need to determine cutpoint based on sensitivity and specificity */ wtd_sensspec = sens#&sens_wt + spec#&spec_wt ; if i,]; cp_assign = loc((max_sensspec_inc=wtd_sensspec)#(peakwts[i,]^=0)); if nrow(cp_assign)>0 then do; cp_inc[,cp_assign]=peakvals[i,]; end; /* If maximization function is based on AUC then compute AUC */ %if &criterion=AUC | &criterion=PARTIAL_AUC %then %do; run auc_inc; %end; end; end; else if i=nobs then do; max_sensspec_inc = (max_sensspec_inc // wtd_sensspec)[<>,]; cp_assign = loc((max_sensspec_inc=wtd_sensspec)#(peakwts[i,]^=0)); if nrow(cp_assign)>0 then do; cp_inc[,cp_assign]=peakvals[i,]; end; /* If maximization function is based on AUC then compute AUC */ %if &criterion=AUC | &criterion=PARTIAL_AUC %then %do; run auc_inc; %end; end; end; /* Initialization for computing sensitivity and specificity */ /* for prediction rule PREDi = (x>=Xi) */ pos = zero_vec; neg = zero_vec; max_sensspec_dec = zero_vec; max_func_dec = zero_vec; cp_dec = missing; sens_last = zero_vec; spec_last = J(1,ncol(wts),1); %if &criterion=AUC %then %do; auc_dec = zero_vec; %end; %else %if &criterion=PARTIAL_AUC %then %do; auc_dec = zero_vec; part_auc_dec = zero_vec; %if &partial_criterion=SPEC %then %str(part_area = J(1,ncol(wts),1);); %else %str(part_area = zero_vec;); %end; max_sensspec_dec = spec_last; cp_dec = J(1,ncol(wts), peakvals[1,]); do i=nobs to 1 by -1; /* Compute sensitivity and specificity at Xi */ pos = pos + posrspns[i,]#(peakwts[i,]); neg = neg + (1-posrspns[i,])#(peakwts[i,]); sens = pos/tot_pos; spec = 1 - neg/tot_neg; /* Always need to determine cutpoint based on sensitivity and specificity */ wtd_sensspec = sens#&sens_wt + spec#&spec_wt ; if i>1 then do; if peakvals[i,]^=peakvals[i-1,] then do; max_sensspec_dec = (max_sensspec_dec // wtd_sensspec)[<>,]; cp_assign = loc((max_sensspec_dec=wtd_sensspec)#(peakwts[i,]^=0)); if nrow(cp_assign)>0 then do; cp_dec[,cp_assign]=peakvals[i,]; end; %if &criterion=AUC | &criterion=PARTIAL_AUC %then %do; run auc_dec; %end; end; end; else if i=1 then do; max_sensspec_dec = (max_sensspec_dec // wtd_sensspec)[<>,]; cp_assign = loc((max_sensspec_dec=wtd_sensspec)#(peakwts[i,]^=0)); if nrow(cp_assign)>0 then do; cp_dec[,cp_assign]=peakvals[i,]; end; %if &criterion=AUC | &criterion=PARTIAL_AUC %then %do; run auc_dec; %end; end; end; finish; start binest; x = J(nrow(posrspns),1,1) || peakvals; do j=1 to ncol(peakwts); b = repeat(0,ncol(x),1); oldb = b+1; z = x*b; do binest_iter=1 to 20 while(max(abs(b-oldb))>1e-8); oldb=b; *print iter peak j; p = 1/(1+exp(-z)); f=p#exp(-z)#p; loglik=sum(((posrspns=1)#log(p) + (posrspns=0)#log(1-p))#peakwts[,j]); w=peakwts[,j]/(p#(1-p)); xx=f#x; xpxi=ginv(xx`*(w#xx)); b = b + xpxi*(xx`*(w#(posrspns-p))); z = x*b; min_z = z[><,]; if min_z<(-32) then do; /* Note that exp(710) cannot be computed. */ /* If there are any terms in Z which are */ /* less than -709, then the coefficient */ /* vector must be rescaled to prevent z */ /* from becoming too negative. */ lz_ltn32 = loc(z<-32); b = b/(-min_z/32); z = x*b; end; min_z = z[><,]; max_z = z[<>,]; if max_z>(32) then do; /* Note that exp(710) cannot be computed. */ /* We perform operations which are */ /* complementary to those performed above */ /* even though exp(-710) could be */ /* obtained. */ lz_gt32 = loc(z>32); b = b/(max_z/32); z = x*b; end; min_z = z[><,]; max_z = z[<>,]; end; b0[,j]=b[1,]; b1[,j]=b[2,]; loglike[,j]=loglik; end; finish; zero_vec=0; missing=.; /* Read case and control data into IML */ use __case; read all var {&vars_case} into x_case; close __case; X_CASE = t(X_case); N_case = nrow(X_case); use __control; read all var {&vars_control} into x_control; close __control; X_CONTROL = t(X_control); N_control = nrow(X_control); nobs = N_case + N_control; yx = (J(N_case,1,1) // J(N_control,1,0)) || ( x_case // x_control ); use vars_case; read all var {name} into id_case; close vars_case; use vars_control; read all var {name} into id_control; close vars_control; ids = id_case // id_control; wts = J( N_control + N_case, 1, 1); %if &wgt_anal=YES %then %do; wts = N_control/N_case * wts[1:nobs, 1]; %end; /* Construct the weighted data count */ nvec = sum(wts); ncase = (yx[,1])`*wts; ncontrol = nvec - ncase; store yx wts; free x_case x_control yx wts; %if &cv=YES %then %do; /* Initialize vector recording which of the nobs in the */ /* cross-validation data sets (combined) are in error */ cverr = J(nobs,1,0); cv_err_rate = J(1,10,0); cv_sens = cv_err_rate; cv_spec = cv_err_rate; train_err = cv_err_rate; /* Read and store all of the cross-validation data sets */ /* including the complements to the C-V data sets. Also, */ /* construct initial weights for each C-V complement. */ %do i=1 %to 10; use __case&i; read all var {&&vars__case&i} into x__case&i; close __case&i; N__case&i = ncol(X__case&i); use C_case&i; read all var {&&varsC_case&i} into xC_case&i; close C_case&i; NC_case&i = ncol(XC_case&i); use __control&i; read all var {&&vars__control&i} into x__control&i; close __control&i; N__control&i = ncol(X__control&i); use C_control&i; read all var {&&varsC_control&i} into xC_control&i; close C_control&i; NC_control&i = ncol(XC_control&i); /* Construct response and predictors for i-th CV data set */ /* as well as for the i-th CV complement */ cv_yx&i = ( J(N__case&i,1,1) // J(N__control&i,1,0) ) || ( t(X__case&i) // t(X__control&i) ); yxC&i = ( J(NC_case&i,1,1) // J(NC_control&i,1,0) ) || ( t(XC_case&i) // t(XC_control&i) ); nobs&i = NC_case&i + NC_control&i; npeaks&i = ncol(yxC&i)-1; /* Construct initial boosting weight vector */ wts&i=J( NC_control&i + NC_case&i, 1, 1); %if &wgt_anal=YES %then %do; wts&i = NC_control&i/NC_case&i * wts&i[1:nobs&i, 1]; %end; /* Construct the weighted data count */ nvec&i = sum(wts&i); ncase&i = sum(wts&i[1:NC_case&i]); ncontrol&i = nvec&i - ncase&i; /* initialize the i-th linear predictor for both the CV data set */ /* as well as for the i-th CV complement */ lp&i = J(nrow(yxC&i),1,0); cv_lp&i = J(nrow(cv_yx&i),1,0); store x__case&i xC_case&i x__control&i xC_control&i cv_yx&i yxC&i wts&i lp&i cv_lp&i nvec&i ncase&i ncontrol&i nobs&i npeaks&i; free x__case&i xC_case&i x__control&i xC_control&i cv_yx&i yxC&i wts&i lp&i cv_lp&i nvec&i ncase&i ncontrol&i nobs&i npeaks&i; %end; cv_err_last21 = J(21,3,1); cv_err_last21[,3] = (1:21)`; xpx = (cv_err_last21[,2:3])`*cv_err_last21[,2:3]; xpx_inv = solve(xpx,I(2)); increasing_7 = J(7,1,0); /* Used to terminate boosting iterations */ ntest_cv = 0; increasing_7_test = J(7,1,1); min_npairs_inc=0; %if &iters>0 %then %do; do iter=1 to &iters; %end; %else %do; do iter=1 to 250 while(max(abs(increasing_7-increasing_7_test))>1E-16); %end; cv_err_end = 0; do cv_set=1 to 10; peak_ind = 0; %if &method=DISCRETE %then %do; max_func = 0; cp = 0; direction = 0; err = 0; alpha = 0; max_sensspec=0; %end; %else %do; max_func = .; loglike = .; b0 = 0; b1 = 0; b0_mll = 0; b1_mll = 0; %end; min_marg = -1; %do i=1 %to 10; %if &i>1 %then else; if cv_set=&i then do; load cv_yx&i yxC&i wts&i lp&i cv_lp&i nvec&i ncase&i ncontrol&i nobs&i npeaks&i; cv_yx = cv_yx&i; yx = yxC&i; wts = wts&i; lp = lp&i; cv_lp = cv_lp&i; nvec = nvec&i; ncase = ncase&i; ncontrol = ncontrol&i; nobs = nobs&i; npeaks = npeaks&i; free cv_yx&i yxC&i wts&i lp&i cv_lp&i nvec&i ncase&i ncontrol&i nobs&i npeaks&i; end; %end; %if &method=REAL & &criterion=AUC %then %do; case_recs = loc(yx[,1]=1); cont_recs = loc(yx[,1]=0); wts_case = sum(wts[case_recs,1]); wts_cont = sum(wts[cont_recs,1]); %end; cv_err_start = cv_err_end+1; cv_err_end = cv_err_end + nrow(cv_set); npeaks = ncol(yx) - 1; do peak=1 to npeaks; posrspns = yx[,1]; peakvals = yx[,peak+1]; peakwts = wts; %if &method=DISCRETE %then %do; peakvals_ranks = rank(peakvals); posrspns[peakvals_ranks,] = yx[,1]; peakvals[peakvals_ranks,] = yx[,peak+1]; peakwts[peakvals_ranks,] = wts; %end; /********************************************************************/ /* */ /* Run appropriate minimization function for i-th peak. If peak */ /* maximizes the appropriate objective function, then assign peak */ /* indicators, cut point, and rule direction. The latter two are */ /* not needed for METHOD=BLR (boosting logistic regression) but are */ /* required for METHOD=BDT (boosting decision tree). */ /* */ /********************************************************************/ %if &method=DISCRETE %then %do; %if &criterion=MIN_ERR | &criterion=MINERR %then %do; run min_err; max_func = (max_func // max_correct_inc // max_correct_dec)[<>,]; cp_assign = loc(max_func=max_correct_inc); if ncol(cp_assign)>0 then do; cp[,cp_assign] = cp_inc[,cp_assign]; direction[,cp_assign] = 1; peak_ind[,cp_assign] = peak; %err(direction=INCREASING) end; cp_assign = loc(max_func=max_correct_dec); if ncol(cp_assign)>0 then do; cp[,cp_assign] = cp_dec[,cp_assign]; direction[,cp_assign] = -1; peak_ind[,cp_assign] = peak; %err(direction=DECREASING) end; %end; %else %if &criterion=AUC | &criterion=PARTIAL_AUC | &criterion=SENSSPEC %then %do; run auc; max_sensspec=(max_sensspec // max_sensspec_inc // max_sensspec_dec)[<>,]; %if &criterion=AUC %then %do; max_func = (max_func // auc_inc // auc_dec)[<>,]; cp_assign = loc(max_func=auc_inc); if ncol(cp_assign)>0 then do; cp[,cp_assign] = cp_inc[,cp_assign]; direction[,cp_assign] = 1; peak_ind[,cp_assign] = peak; %err(direction=INCREASING) end; cp_assign = loc(max_func=auc_dec); if ncol(cp_assign)>0 then do; cp[,cp_assign] = cp_dec[,cp_assign]; direction[,cp_assign] = -1; peak_ind[,cp_assign] = peak; %err(direction=DECREASING) end; %end; %else %if &criterion=PARTIAL_AUC %then %do; max_func = (max_func // part_auc_inc // part_auc_dec)[<>,]; cp_assign = loc(max_func=part_auc_inc); if ncol(cp_assign)>0 then do; cp[,cp_assign] = cp_inc[,cp_assign]; direction[,cp_assign] = 1; peak_ind[,cp_assign] = peak; %err(direction=INCREASING) end; cp_assign = loc(max_func=part_auc_dec); if ncol(cp_assign)>0 then do; cp[,cp_assign] = cp_dec[,cp_assign]; direction[,cp_assign] = -1; peak_ind[,cp_assign] = peak; %err(direction=DECREASING) end; %end; %else %do; max_func = (max_func // max_sensspec_inc // max_sensspec_dec)[<>,]; cp_assign = loc(max_func=max_sensspec_inc); if ncol(cp_assign)>0 then do; cp[,cp_assign] = cp_inc[,cp_assign]; direction[,cp_assign] = 1; peak_ind[,cp_assign] = peak; %err(direction=INCREASING) end; cp_assign = loc(max_func=max_sensspec_dec); if ncol(cp_assign)>0 then do; cp[,cp_assign] = cp_dec[,cp_assign]; direction[,cp_assign] = -1; peak_ind[,cp_assign] = peak; %err(direction=DECREASING) end; %end; %end; %end; %else %do; run binest; %if &criterion=LOG_LIKE %then %do; max_func = (max_func // loglike)[<>,]; nll_assign = (max_func = loglike); %end; %else %do; lp = b0 + b1*yx[,peak+1]; prob = exp(lp) / (1 + exp(lp)); prob_sort = prob; wts_sort = wts; yx_sort = yx; lp_sort = lp; sort_seq = rank(-prob); prob_sort[sort_seq,] = prob; wts_sort[sort_seq,] = wts; yx_sort[sort_seq,] = yx[,]; lp_sort[sort_seq,] = lp; sens = 0; _1mspec = 0; p_sens=0; p_1mspec=0; p_lp=.; auc=0; event_case = 0; event_control = 0; do i=1 to nrow(lp); if yx_sort[i,1]=1 then do; event_case = event_case + wts_sort[i,]; sens = sens + wts_sort[i,]/wts_case; end; else do; event_control = event_control + wts_sort[i,]; _1mspec = _1mspec + wts_sort[i,]/wts_cont; end; if lp_sort[i,]^=p_lp then do; if _1mspec^=p_1mspec then auc = auc + ((sens + p_sens)/2) * (_1mspec - p_1mspec); p_lp = lp[i,]; p_sens = sens; p_1mspec = _1mspec; end; end; max_func = (max_func // auc)[<>,]; nll_assign = (max_func = auc); %end; if nll_assign>0 then do; %if &criterion=LOG_LIKE %then %do; ll_assign = loc(max_func = loglike); %end; %else %do; ll_assign = loc(max_func = auc); %end; peak_ind[,ll_assign] = peak; b0_mll[,ll_assign] = b0[,ll_assign]; b1_mll[,ll_assign] = b1[,ll_assign]; end; %end; end; /* do peak=1 to npeaks; */ %if &method=DISCRETE %then %do; err_min = 1/(2*nobs); err = (err // err_min)[<>,]; alpha = log((1-err)/err); if direction=1 then predict=-1 + 2*(yx[,peak_ind+1]<=cp); else predict=-1 + 2*(yx[,peak_ind+1]>=cp); lp = lp + alpha*predict; if direction=1 then pred_cv = -1 + 2*(cv_yx[,peak_ind+1]<=cp); else pred_cv = -1 + 2*(cv_yx[,peak_ind+1]>=cp); cv_lp = cv_lp + alpha*pred_cv; %if &wgt_anal=YES %then %do; err_wts=(wts>0); err_wts[loc(yx[,1]=1),]=n_control/n_case; train_err[,cv_set] = (yx[,1]^=(lp>0))`*err_wts/err_wts[+,]; cv_wts = J(nrow(cv_yx),1,1); cv_wts[loc(cv_yx[,1]=1),] = cv_wts[loc(cv_yx[,1]=1),]*n_control/n_case; cverr = (cv_yx[,1] ^= (cv_lp>0) ); cv_err_rate[,cv_set] = cverr`*cv_wts/sum(cv_wts); cv_sens[,cv_set] = ((cv_yx[,1]=1)#(cv_yx[,1]=(cv_lp>0)))`*(cv_yx[,1]=1) / sum(cv_yx[,1]=1); cv_spec[,cv_set] = ((cv_yx[,1]=0)#(cv_yx[,1]=(cv_lp>0)))`*(cv_yx[,1]=0) / sum(cv_yx[,1]=0); %end; %else %do; train_err[,cv_set] = (yx[,1]^=(lp>0))`*(wts>0)/nobs; cverr = ( cv_yx[,1] ^= (cv_lp>0) ); cv_err_rate[,cv_set] = cverr[+,] / nrow(cverr); cv_sens[,cv_set] = ((cv_yx[,1]=1)#(cv_yx[,1]=(cv_lp>0)))`*(cv_yx[,1]=1) / sum(cv_yx[,1]=1); cv_spec[,cv_set] = ((cv_yx[,1]=0)#(cv_yx[,1]=(cv_lp>0)))`*(cv_yx[,1]=0) / sum(cv_yx[,1]=0); %end; if err^=err_min then do; k = nvec / (nvec * (1 - err) + nvec*exp(alpha)*err); wts = k*wts#exp(alpha*((-1+2*yx[,1])^=predict)); end; else do; wts = wts#exp(alpha*((-1+2*yx[,1])^=predict)); k = nvec / wts[+,]; wts = k*wts; end; min_marg = (lp # (-1+2*yx[,1]))[><,]; %end; %else %do; eta = b0_mll + b1_mll * yx[,peak_ind+1]; phat = 1 / (1 + exp(-eta)); predict = 0.5 * log(phat/(1-phat)); lp = lp + predict; cv_eta = b0_mll + b1_mll * cv_yx[,peak_ind+1]; cv_phat = 1 / (1 + exp(-cv_eta)); if cv_phat=0 then cv_phat=1E-10; else if cv_phat=1 then cv_phat=1-1E-10; cv_predict = 0.5 * log(cv_phat/(1-cv_phat)); cv_lp = cv_lp + cv_predict; %if &wgt_anal=YES %then %do; err_wts=(wts>0); err_wts[loc(yx[,1]=1),]=n_control/n_case; train_err[,cv_set] = (yx[,1]^=(lp>0))`*err_wts/err_wts[+,]; cv_wts = J(nrow(cv_yx),1,1); cv_wts[loc(cv_yx[,1]=1),] = cv_wts[loc(cv_yx[,1]=1),]*n_control/n_case; cverr = (cv_yx[,1] ^= (cv_lp>0) ); cv_err_rate[,cv_set] = cverr`*cv_wts/sum(cv_wts); cv_sens[,cv_set] = ((cv_yx[,1]=1)#(cv_yx[,1]=(cv_lp>0)))`*(cv_yx[,1]=1) / sum(cv_yx[,1]=1); cv_spec[,cv_set] = ((cv_yx[,1]=0)#(cv_yx[,1]=(cv_lp>0)))`*(cv_yx[,1]=0) / sum(cv_yx[,1]=0); %end; %else %do; train_err[,cv_set] = (yx[,1]^=(lp>0))`*(wts>0)/nobs; cverr = ( cv_yx[,1] ^= (cv_lp>0) ); cv_err_rate[,cv_set] = cverr[+,] / nrow(cverr); cv_cases = loc(cv_yx[,1]=1); cv_controls = loc(cv_yx[,1]=0); cv_sens[,cv_set] = (cv_yx[cv_cases,1])`*(cv_lp[cv_cases,1]>0) / ncol(cv_cases); cv_spec[,cv_set] = (1-cv_yx[cv_controls,1])`*(cv_lp[cv_controls,1]<=0) / ncol(cv_controls); cv_sens[,cv_set] = ((cv_yx[,1]=1)#(cv_yx[,1]=(cv_lp>0)))`*(cv_yx[,1]=1) / sum(cv_yx[,1]=1); cv_spec[,cv_set] = ((cv_yx[,1]=0)#(cv_yx[,1]=(cv_lp>0)))`*(cv_yx[,1]=0) / sum(cv_yx[,1]=0); %end; wts = exp(-(-1 + 2*(yx[,1]))#predict)#wts; k = nvec / wts[+,]; wts = k * wts; %end; %do i=1 %to 10; %if &i>1 %then else; if cv_set=&i then do; cv_yx&i = cv_yx ; yxC&i = yx ; wts&i = wts ; lp&i = lp ; cv_lp&i = cv_lp ; nvec&i = nvec ; ncase&i = ncase ; ncontrol&i = ncontrol; nobs&i = nobs; npeaks&i = npeaks; store cv_yx&i yxC&i wts&i lp&i cv_lp&i nvec&i ncase&i ncontrol&i nobs&i npeaks&i; end; %end; end; if iter=1 then do; cv_err_out = cv_err_rate; cv_sens_out = cv_sens; cv_spec_out = cv_spec; train_err_out = train_err; end; else do; cv_err_out = cv_err_out // cv_err_rate; cv_sens_out = cv_sens_out // cv_sens; cv_spec_out = cv_spec_out // cv_spec; train_err_out = train_err_out // train_err; * min_marg_out = min_marg_out // min_marg; end; if iter<=21 then cv_err_last21[iter,1] = cv_err_rate[,:]; else cv_err_last21[,1] = cv_err_last21[2:21,1] // cv_err_rate[,:]; if iter>=21 then do; xpy = (cv_err_last21[,2:3])`*cv_err_last21[,1]; increasing = ((xpx_inv*xpy)[2,]>0); ntest_cv = ntest_cv + 1; if ntest_cv<=7 then increasing_7[ntest_cv,]=increasing; else increasing_7 = increasing_7[2:7,] // increasing; end; %if &cverr_out^=%str() %then %do; if iter=1 then cverr_out=cverr; else cverr_out=cverr_out || cverr; %end; end; %if &iters>0 %then %do; cv_err_rate = cv_err_out[,:]; cv_sens = cv_sens_out[,:]; cv_spec = cv_spec_out[,:]; niter=&iters; %end; %else %do; cv_err_marg = cv_err_out[,:]; cv_sens_marg = cv_sens_out[,:]; cv_spec_marg = cv_spec_out[,:]; min_cv_err = cv_err_marg[><,]; lmin_cv_err = loc(cv_err_marg=min_cv_err); mean_train_err = (train_err_out[lmin_cv_err,])[,:]; min_train_err = mean_train_err[><,1]; lmin_train_err = loc(mean_train_err[,1]=min_train_err); niter = lmin_cv_err[,lmin_train_err]; niter = niter[,><]; cv_err_rate = cv_err_marg[1:niter,]; cv_sens = cv_sens_marg[1:niter,]; cv_spec = cv_spec_marg[1:niter,]; %end; %if &cverr_out^=%str() %then %do; case = yx[,1]; predict=case; case_loc = loc(case); predict[case_loc,] = 1 - cverr_out[case_loc,niter]; cont_loc = loc(1-case); predict[cont_loc,] = cverr_out[cont_loc,niter]; /* case_errloc = loc((predict=1) # (cverr_out[,niter]=1)); predict[case_errloc,]=0; cont_errloc = loc((predict=0) # (cverr_out[,niter]=1)); predict[cont_errloc,]=1; */ cv_subset = ({1 2 3 4 5 6 7 8 9 10}#(wts=0))[,+]; colname={"Case" "Predict" "CV_errors" "CV_subset"}; cverr = yx[,1] || predict || cverr_out[,niter] || cv_subset; create &cverr_out from cverr[colname=colname rowname=ID]; append from cverr[rowname=ID]; %end; %end; /* end of cross-validation iteration */ %else %do; niter = &iters; %end; /* Now fit the same ADAboost model employing all of the data stopping */ /* at the iteration determined in the cross-validation step. The */ /* only real difference between this and the previous section is that */ /* in this section the weight matrix is a column vector rather than */ /* a matrix with 10 columns. We compute only one function per */ /* iteration, and stop at the iteration determined in the previous */ /* cross-validation step. */ load yx; N_case = sum(yx[,1]=1); N_control = sum(yx[,1]=0); nobs = N_case + N_control; npeaks = ncol(yx)-1; wts = J(nobs,1,1); case_obs = loc(yx[,1]=1); %if &wgt_anal=YES %then %do; wts[case_obs,]=wts[case_obs,]*n_control/n_case; %end; nvec = wts[+,]; ncase = (yx[,1])`*wts; ncontrol = nvec - ncase; zero_vec = 0; missing = .; lp = J(nobs,1,0); /* linear predictor sum(alpha{m}*f{m}(x)) */ do iter=1 to niter; peak_ind = zero_vec; %if &method=DISCRETE %then %do; max_func = zero_vec; cp = zero_vec; direction = zero_vec; err = zero_vec; alpha = zero_vec; max_sensspec = zero_vec; %end; %else %do; max_func = missing;; loglike = missing; b0 = zero_vec; b1 = zero_vec; b0_mll = zero_vec; b1_mll = zero_vec; %end; min_marg = J(1,1,-1); train_err = J(1,1,1); %if &method=REAL & &criterion=AUC %then %do; case_recs = loc(yx[,1]=1); cont_recs = loc(yx[,1]=0); wts_case = sum(wts[case_recs,1]); wts_cont = sum(wts[cont_recs,1]); %end; do peak=1 to npeaks; posrspns = yx[,1]; peakvals = yx[,peak+1]; peakwts = wts; %if &method=DISCRETE %then %do; peakvals_ranks = rank(peakvals); posrspns[peakvals_ranks,] = yx[,1]; peakvals[peakvals_ranks,] = yx[,peak+1]; peakwts[peakvals_ranks,] = wts; %end; /********************************************************************/ /* */ /* Run appropriate minimization function for i-th peak. If peak */ /* maximizes the appropriate objective function, then assign peak */ /* indicators, cut point, and rule direction. The latter two are */ /* not needed for METHOD=BLR (boosting logistic regression) but are */ /* required for METHOD=BDT (boosting decision tree). */ /* */ /********************************************************************/ %if &method=DISCRETE %then %do; %if &criterion=MIN_ERR | &criterion=MINERR %then %do; run min_err; max_func = (max_func // max_correct_inc // max_correct_dec)[<>,]; cp_assign = loc(max_func=max_correct_inc); if ncol(cp_assign)>0 then do; cp[,cp_assign] = cp_inc[,cp_assign]; direction[,cp_assign] = 1; peak_ind[,cp_assign] = peak; %err(direction=INCREASING) end; cp_assign = loc(max_func=max_correct_dec); if ncol(cp_assign)>0 then do; cp[,cp_assign] = cp_dec[,cp_assign]; direction[,cp_assign] = -1; peak_ind[,cp_assign] = peak; %err(direction=DECREASING) end; %end; %else %if &criterion=AUC | &criterion=PARTIAL_AUC | &criterion=SENSSPEC %then %do; run auc; max_sensspec=(max_sensspec // max_sensspec_inc // max_sensspec_dec)[<>,]; %if &criterion=AUC %then %do; max_func = (max_func // auc_inc // auc_dec)[<>,]; cp_assign = loc(max_func=auc_inc); if ncol(cp_assign)>0 then do; cp[,cp_assign] = cp_inc[,cp_assign]; direction[,cp_assign] = 1; peak_ind[,cp_assign] = peak; %err(direction=INCREASING) end; cp_assign = loc(max_func=auc_dec); if ncol(cp_assign)>0 then do; cp[,cp_assign] = cp_dec[,cp_assign]; direction[,cp_assign] = -1; peak_ind[,cp_assign] = peak; %err(direction=DECREASING) end; %end; %else %if &criterion=PARTIAL_AUC %then %do; max_func = (max_func // part_auc_inc // part_auc_dec)[<>,]; cp_assign = loc(max_func=part_auc_inc); if ncol(cp_assign)>0 then do; cp[,cp_assign] = cp_inc[,cp_assign]; direction[,cp_assign] = 1; peak_ind[,cp_assign] = peak; %err(direction=INCREASING) end; cp_assign = loc(max_func=part_auc_dec); if ncol(cp_assign)>0 then do; cp[,cp_assign] = cp_dec[,cp_assign]; direction[,cp_assign] = -1; peak_ind[,cp_assign] = peak; %err(direction=DECREASING) end; %end; %else %do; max_func = (max_func // max_sensspec_inc // max_sensspec_dec)[<>,]; cp_assign = loc(max_func=max_sensspec_inc); if ncol(cp_assign)>0 then do; cp[,cp_assign] = cp_inc[,cp_assign]; direction[,cp_assign] = 1; peak_ind[,cp_assign] = peak; %err(direction=INCREASING) end; cp_assign = loc(max_func=max_sensspec_dec); if ncol(cp_assign)>0 then do; cp[,cp_assign] = cp_dec[,cp_assign]; direction[,cp_assign] = -1; peak_ind[,cp_assign] = peak; %err(direction=DECREASING) end; %end; %end; %end; %else %do; run binest; %if &criterion=LOG_LIKE %then %do; max_func = (max_func // loglike)[<>,]; nll_assign = (max_func = loglike); %end; %else %do; eta = b0 + b1*yx[,peak+1]; prob = exp(eta) / (1 + exp(eta)); prob_sort = prob; wts_sort = wts; yx_sort = yx; eta_sort = eta; sort_seq = rank(-prob); prob_sort[sort_seq,] = prob; wts_sort[sort_seq,] = wts; yx_sort[sort_seq,] = yx[,]; eta_sort[sort_seq,] = eta; sens = 0; _1mspec = 0; p_sens=0; p_1mspec=0; p_eta=.; auc=0; event_case = 0; event_control = 0; do i=1 to nrow(eta); if yx_sort[i,1]=1 then do; event_case = event_case + wts_sort[i,]; sens = sens + wts_sort[i,]/wts_case; end; else do; event_control = event_control + wts_sort[i,]; _1mspec = _1mspec + wts_sort[i,]/wts_cont; end; if eta_sort[i,]^=p_eta then do; if _1mspec^=p_1mspec then auc = auc + ((sens + p_sens)/2) * (_1mspec - p_1mspec); p_eta = eta[i,]; p_sens = sens; p_1mspec = _1mspec; end; end; max_func = (max_func // auc)[<>,]; nll_assign = (max_func = auc); %end; if nll_assign>0 then do; %if &criterion=LOG_LIKE %then %do; ll_assign = loc(max_func = loglike); %end; %else %do; ll_assign = loc(max_func = auc); %end; peak_ind[,ll_assign] = peak; b0_mll[,ll_assign] = b0[,ll_assign]; b1_mll[,ll_assign] = b1[,ll_assign]; end; %end; end; /* do peak=1 to npeaks; */ %if &method=DISCRETE %then %do; err_min = 1/(2*nobs); err = (err // err_min)[<>,]; alpha = log((1-err)/err); if direction=1 then predict=-1 + 2*(yx[,peak_ind+1]<=cp); else predict=-1 + 2*(yx[,peak_ind+1]>=cp); lp = lp + alpha*predict; %if &wgt_anal=YES %then %do; err_wts=(wts>0); err_wts[loc(yx[,1]=1),]=n_control/n_case; train_err = (yx[,1]^=(lp[,j]>0))`*err_wts/err_wts[+,]; %end; %else %do; train_err = (yx[,1]^=(lp>0))`*(wts>0)/nobs; %end; sensitivity = 1 - ((yx[,1]=1)#(yx[,1]^=(lp>0)))`*(wts>0)/((yx[,1]=1)[+,]); specificity = 1 - ((yx[,1]=0)#(yx[,1]^=(lp>0)))`*(wts>0)/((yx[,1]=0)[+,]); if err^=err_min then do; k = nobs/(nobs*(1 - err) + nobs*exp(alpha)*err); wts = k*wts#exp(alpha*((-1+2*yx[,1])^=predict)); end; else do; wts = wts#exp(alpha*((-1+2*yx[,1])^=predict)); k = nobs/wts[+,]; wts = k*wts; end; min_marg = (lp#(-1+2*yx[,1])#(wts>0))[><,]; %end; %else %do; eta = b0_mll + b1_mll * yx[,peak_ind+1]; phat = 1 / (1 + exp(-eta)); predict = 0.5 * log(phat/(1-phat)); lp = lp + predict; %if &wgt_anal=YES %then %do; err_wts=(wts>0); err_wts[loc(yx[,1]=1),]=n_control/n_case; train_err = (yx[,1]^=(lp>0))`*err_wts/err_wts[+,]; %end; %else %do; train_err = (yx[,1]^=(lp>0))`*(wts>0)/nobs; %end; sensitivity = 1 - ((yx[,1]=1)#(yx[,1]^=(lp>0)))`*(wts>0)/((yx[,1]=1)[+,]); specificity = 1 - ((yx[,1]=0)#(yx[,1]^=(lp>0)))`*(wts>0)/((yx[,1]=0)[+,]); wts = exp(-(-1 + 2*(yx[,1]))#predict)#wts; k = nobs/wts[+,]; wts = k*wts; %end; if iter=1 then do; max_func_out = max_func; peak_ind_out = peak_ind; train_err_out = train_err; sensitivity_out = sensitivity; specificity_out = specificity; %if &method=DISCRETE %then %do; cp_out = cp; direction_out = direction; err_out = err; alpha_out = alpha; %end; %else %do; b0_mll_out = b0_mll; b1_mll_out = b1_mll; %end; end; else do; max_func_out = max_func_out // max_func; peak_ind_out = peak_ind_out // peak_ind; train_err_out = train_err_out // train_err; sensitivity_out = sensitivity_out // sensitivity; specificity_out = specificity_out // specificity; %if &method=DISCRETE %then %do; cp_out = cp_out // cp; direction_out = direction_out // direction; err_out = err_out // err; alpha_out = alpha_out // alpha; %end; %else %do; b0_mll_out = b0_mll_out // b0_mll; b1_mll_out = b1_mll_out // b1_mll; %end; end; end; %if &method=DISCRETE %then %do; colname="Func_max" || "Peak_ind" || "Train_err" || "Train_Sens" || "Train_Spec" || "Cut_point" || "Direction" || "Alpha" %if &cv^=NO %then || "CV_err" || "CV_Sens" || "CV_spec"; ; result = max_func_out || peak_ind_out || train_err_out || sensitivity_out || specificity_out || cp_out || direction_out || alpha_out %if &cv^=NO %then || cv_err_rate || cv_sens || cv_spec; ; %end; %else %do; colname = "Func_max" || "Peak_ind" || "Train_err" || "Train_Sens" || "Train_Spec" || "B0" || "B1" %if &cv^=NO %then || "CV_err" || "CV_Sens" || "CV_spec"; ; result = max_func_out || peak_ind_out || train_err_out || sensitivity_out || specificity_out || b0_mll_out || b1_mll_out %if &cv^=NO %then || cv_err_rate || cv_sens || cv_spec; ; %end; %if &linpred^=%str() %then %do; create &linpred from result[colname=colname]; append from result; %end; %if &case_score^=%str() | &control_score^=%str() %then %do; wts_case = wts[case_obs,]; wts_control = wts[1:ncontrol,]; %if &case_score^=%str() %then %do; create case_wts from wts_case[colname={"Weight"}]; append from wts_case; %end; %if &control_score^=%str() %then %do; create control_wts from wts_control[colname={"Weight"}]; append from wts_control; %end; %end; *print wts; quit; ods listing; %if &linpred^=%str() %then %do; proc format; value direcshn 1 = 'X<=critval' -1 = 'X>=critval'; run; proc contents data=__case(keep=&firstvar) noprint out=v1_chars; run; data _null_; set v1_chars; if type=1 then call symput("type", " "); else call symput("type", "$"); call symput("length", left(put(length,5.))); run; data &linpred; retain &firstvar &type &length; set &linpred end=lastrec; pointer=peak_ind; set __case(keep=&firstvar) point=pointer; %if &method=DISCRETE %then %do; format direction direcshn.; %end; if lastrec then call symput("n_iters", put(_n_,5.-L)); drop peak_ind; run; %if &print^=NO %then %do; proc print data=&linpred; run; %end; %end; %if &case_score^=%str() & &linpred^=%str() %then %do; proc sort data=__case; by &firstvar; run; proc sort data=&linpred out=lp_sort; by &firstvar; run; data score(rename=(%do i=1 %to %eval(&ncase_names-1); lp&&case_name&i=&&case_name&i %end;)); merge __case(in=a) lp_sort(in=b); by &firstvar; if b; nrecs+1; /* array score_vars {%eval(&ncase_names-1)} %do i=1 %to %eval(&ncase_names-1); m&&case_name&i %end; ;*/ array score_vars {%eval(&ncase_names-1)} %do i=1 %to %eval(&ncase_names-1); &&case_name&i %end; ; array scored_vars {%eval(&ncase_names-1)} %do i=1 %to %eval(&ncase_names-1); lp&&case_name&i %end; ; do i=1 to %eval(&ncase_names-1); %if &method=DISCRETE %then %do; if direction=1 then scored_vars{i} + alpha*(-1 + 2*(score_vars{i}<=cut_point)); else scored_vars{i} + alpha*(-1 + 2*(score_vars{i}>=cut_point)); %end; %else %do; scored_vars{i} + b0 + b1*score_vars{i}; %end; end; %let NVars=%eval(&Ncase_names-1); keep lp&case_name1--lp&&case_name&Nvars..; if nrecs=&n_iters then output; run; proc transpose data=score out=&case_score(rename=(_name_=ID col1=LinPred)); quit; data &case_score; merge &case_score case_wts; run; %end; %if &control_score^=%str() & &linpred^=%str() %then %do; proc sort data=__control; by &firstvar; run; proc sort data=&linpred out=lp_sort; by &firstvar; run; data score(rename=(%do i=1 %to %eval(&ncontrol_names-1); lp&&cont_name&i=&&cont_name&i %end;)); merge __control(in=a) lp_sort(in=b); by &firstvar; if b; nrecs+1; /* array score_vars {%eval(&ncont_names-1)} %do i=1 %to %eval(&ncont_names-1); m&&cont_name&i %end; ;*/ array score_vars {%eval(&ncontrol_names-1)} %do i=1 %to %eval(&ncontrol_names-1); &&cont_name&i %end; ; array scored_vars {%eval(&ncontrol_names-1)} %do i=1 %to %eval(&ncontrol_names-1); lp&&cont_name&i %end; ; do i=1 to %eval(&ncontrol_names-1); %if &method=DISCRETE %then %do; if direction=1 then scored_vars{i} + alpha*(-1 + 2*(score_vars{i}<=cut_point)); else scored_vars{i} + alpha*(-1 + 2*(score_vars{i}>=cut_point)); %end; %else %do; scored_vars{i} + b0 + b1*score_vars{i}; %end; end; %let NVars=%eval(&Ncontrol_names-1); keep lp&cont_name1--lp&&cont_name&Nvars..; if nrecs=&n_iters then output; run; proc transpose data=score out=&control_score(rename=(_name_=ID col1=LinPred)); quit; data &control_score; merge &control_score control_wts; run; %end; %if &test^=%str() & &test_score^=%str() & &linpred^=%str() %then %do; proc sort data=__test; by &firstvar; run; proc sort data=&linpred out=lp_sort; by &firstvar; run; data score(rename=(%do i=1 %to %eval(&ntest_names-1); lp&&test_name&i=&&test_name&i %end;)); merge __test(in=a) lp_sort(in=b); by &firstvar; if b; nrecs+1; /* array score_vars {%eval(&ntest_names-1)} %do i=1 %to %eval(&ntest_names-1); m&&test_name&i %end; ;*/ array score_vars {%eval(&ntest_names-1)} %do i=1 %to %eval(&ntest_names-1); &&test_name&i %end; ; array scored_vars {%eval(&ntest_names-1)} %do i=1 %to %eval(&ntest_names-1); lp&&test_name&i %end; ; do i=1 to %eval(&ntest_names-1); %if &method=DISCRETE %then %do; if direction=1 then scored_vars{i} + alpha*(-1 + 2*(score_vars{i}<=cut_point)); else scored_vars{i} + alpha*(-1 + 2*(score_vars{i}>=cut_point)); %end; %else %do; scored_vars{i} + b0 + b1*score_vars{i}; %end; end; %let NVars=%eval(&Ntest_names-1); keep lp&test_name1--lp&&test_name&NVars..; if nrecs=&n_iters then output; run; proc transpose data=score out=&test_score(rename=(_name_=ID col1=LinPred)); quit; %end; %end; /* %if &boost=YES %then %do; */ %stop_mac:; %mend;