################################ #This script reproduces the results of the paper. There are #three sections. We first reproduce the inference performed #and the sensitivity analysis for 1-naphthol and 2-naphthol. #We then recreate the match described in the paper. Finally, #we give code for conducting the simulation studies of section 5. ################################ #Note: GUROBI 6.5.0 is required to reproduce the results of this paper. The functions use quadratically constrained programming, which #is directly supported for R as of Gurobi 6.5.0. Previous versions of Gurobi allowed for quadratically constrained programming #in R by converting the constraints to second order cone constraints. ##You will need to download Gurobi 6.5.0 or higher and install its R package. Instructions are available at www.gurobi.com; Gurobi is free for academics. #You will also need the following R packages: # slam, Matrix, MASS, optmatch, as well as their dependencies ####First, download the code and place everything in the working #directory #load the data set and the required functions dat = read.csv("naphthalene.csv") source("multiCompareFunctions.R") #treatment vector treatment = dat[,1] #indices resulting from full match. Individuals in the same matched set have the same index number index = dat[,2] #the responses, log 1-naphthol and 2-naphthol concentrations lnaph = log(dat[,3:4]) #the covariates covariates = dat[,-(1:4)] ########################### #1) Inference and sensitivity analysis ###################### #Get HL estimates for multiplicative effect (additive on log scale) HL = rep(0,2) for(aa in 1:2) { outcome = lnaph[,aa] taugrid=seq(1, 3, .01); pvalgrid=rep(0,length(taugrid)); for(i in 1:length(taugrid)){ adjusted.response=outcome-treatment*taugrid[i]; pvalgrid[i]=alignedranktest(adjusted.response,index,treatment); } HL[aa] = taugrid[which.max(pvalgrid)] } ###to get the multiplicative factors in the paper, take exp{} of answer estimates = exp(HL) #Now for confidence intervals: cibound.func=function(treatment.effect,outcome,matchedset,treated,alternative="greater"){ adjusted.outcome=outcome-treatment.effect*treated; alignedranktest(adjusted.outcome,matchedset,treated, alternative)-.025; } CI = matrix(0,2,2) #invert a series of tests to get the confidence intervals on log scale, then exponentiate for(aa in 1:2) { # Confidence bounds lci.bound=uniroot(cibound.func,c(-2,5),lnaph[,aa],index, treatment, alternative="greater") uci.bound=uniroot(cibound.func,c(-2,5),lnaph[,aa],index, treatment, alternative="less") l = lci.bound\$root u = uci.bound\$root CI[aa,] = exp(c(l, u)) } CI ################ #Now, the sensitivity analysis. We use the function multipleComparisons for all of these. It requires the gurobi optimization suite, which is freely available for academics ############### #First, define the aligned ranks: PO = matrix(0, nrow(lnaph), ncol(lnaph)) nostratum = length(unique(index)) for(i in 1:ncol(PO)) { alignedval = rep(0, nostratum) out = lnaph[,i] for(j in 1:nostratum) { ind = which(index == j) alignedval[ind] = out[ind] - mean(out[ind]) } ranks = rank(alignedval) PO[,i] = ranks } ##The uncorrected maximal Gammas G105 = uniroot(multipleComparisonsRoot, c(1, 9), index, PO[,1], treatment, alpha = 0.05, alternative = "G")\$root G205 = uniroot(multipleComparisonsRoot, c(1, 9), index, PO[,2], treatment, alpha = 0.05, alternative = "G")\$root #Now correcting G1025 = uniroot(multipleComparisonsRoot, c(1, 9), index, PO[,1], treatment, alpha = 0.025, alternative = "G")\$root G2025 = uniroot(multipleComparisonsRoot, c(1, 9), index, PO[,2], treatment, alpha = 0.025, alternative = "G")\$root #Now finding the maximal Gamma for overall null through the minimax procedure G12 = uniroot(multipleComparisonsRoot, c(1, 11), index, PO, treatment, alpha = 0.05, alternative = "G")\$root #Now, let's show the results in Table 3 res1 = multipleComparisons(index, PO[,1], treatment, alternative = "G", Gamma = 10) res2 = multipleComparisons(index, PO[,2], treatment, alternative = "G", Gamma = 10) res12 = multipleComparisons(index, PO, treatment, alternative = "G", Gamma = 10) #u gives the worst case confounder, and rho gives the conditional probabilities under the worst case u at the Gamma being tested rho_{ij} = (exp(Gamma*u_{ij})/sum_{i=1}^{n_i}(exp(Gamma*u_{ij})) rho1pair = res1\$rho[index==348] u1pair = res1\$u[index==348] rho2pair = res2\$rho[index==348] u2pair = res2\$u[index==348] POpair = PO[index==348,] rho12pair = res12\$rho[index==348] u12pair = res12\$u[index==348] #worst case U's u1pair u2pair u12pair #worst expectations - individ sum(rho1pair*POpair[,1]) sum(rho2pair*POpair[,2]) #worst expectations - minimax sum(rho12pair*POpair[,1]) sum(rho12pair*POpair[,2]) ################ #2) The following code returns the indices found in column 2 of the data set naphthalene.csv by conducting a full match. It also allows the user to create Figure 1 of the manuscript ################ ########### library(optmatch) #Get the covaraites from the data set covariates = dat[,-(1:4)] X = covariates names.vars = colnames(X) dat.new = as.data.frame(X) dat.new.miss = dat.new names.miss = c() missing.indicators = c() ####Make the indicators for missing values, and fill in the overall mean for the NA's. This does not affect the propensity score fits for the variables themselves, as discussed in the references in this part of the manuscript MISS1 = matrix(0, nrow(dat.new), ncol(dat.new)) for(i in 1:ncol(dat.new)) { MISS1[,i] = is.na(dat.new[,i])*1 if(any(is.na(dat.new[,i]))) { miss = is.na(dat.new[,i])*1 temp = dat.new[,i] temp[is.na(dat.new[,i])] = mean(dat.new[,i], na.rm = T) dat.new.miss[,i] = temp missing.indicators = cbind(missing.indicators, miss) names.miss = c(names.miss, paste(names.vars[i], "MISS", sep = "")) } } #Since we are treating the missingness indicators as covariates, we also include a block of ``missingness indicators" for the missingness indicators (all zeroes of course). This gets used down the line for calculating standardized differences MISS1a = cbind(MISS1, matrix(0, nrow(MISS1), ncol(missing.indicators))) treat1 = 1*treatment colnames(missing.indicators) = names.miss dat.temp= cbind(dat.new.miss, missing.indicators)#[!row.remove,] dat.final = lm(treat1~., data = dat.temp, x=T)\$x[,-1] dat.final = as.data.frame(dat.final) model = glm(treat1~., data = dat.final, family = binomial, x = T) #remove the race category "other" to get the final covariate matrix for matching (for singularity reasons). Also remove the intercept X = model\$x[,-c(1,23)] logit.propscore1 = predict(model) logit.propscore = logit.propscore1 propscore = exp(logit.propscore1)/(1+exp(logit.propscore1)) subject.index=seq(1,length(treat1),1) rownames(X) = subject.index #Make the rank based mahalanobis distances distmat = smahal(treat1, X) #Add the propensity score caliper distmat2 = addcaliper(distmat, treat1, logit.propscore, calipersd = .08) rownames(distmat2)=subject.index[treat1==1] colnames(distmat2)=subject.index[treat1==0] treated=(treat1==1) #conduct the full match matchvec=fullmatch(distmat2,min.controls = 1/9, max.controls = 9) ############This code strips the resulting match vector, defines indices for the matched sets, and associates the proper indices with the corresponding row in the original dataset matchedset.index=substr(matchvec,start=3,stop=10) matchedset.index.numeric=as.numeric(matchedset.index) subjects.match.order=as.numeric(names(matchvec)) # Create a numeric variable for which stratum each unit belongs to # 0 denotes that the unit was not matched stratum.short=substr(matchvec,start=3,stop=10); stratum.numeric=as.numeric(stratum.short); # Reassign numbers to each stratum that go from 1,..., no. of stratum sort.unique.stratum=sort(unique(stratum.numeric)); stratum.myindex.matchvecorder=rep(0,length(stratum.numeric)); for(i in 1:length(sort.unique.stratum)){ stratum.myindex.matchvecorder[stratum.numeric==sort.unique.stratum[i]]=i; } ##makes stratum.myindex in the proper order stratum.myindex=rep(0,length(stratum.myindex.matchvecorder)) stratum.myindex[subjects.match.order]=stratum.myindex.matchvecorder ####check that this aligns with index used for inference in the first bit of this code all(stratum.myindex==dat[,2]) #add "other" back into the X matrix (it got removed to avoid singularity issues) Xfinal = cbind(X[,1:21], covariates[,22], X[,22:31]) MISSfinal = MISS1a std.diff.before=rep(0,ncol(Xfinal)); std.diff.after=rep(0,ncol(Xfinal)); names(std.diff.before)=colnames(Xfinal); names(std.diff.after)=colnames(Xfinal); #define the standardized differences for(i in 1:ncol(Xfinal)){ temp.stand.diff=standardized.diff.func(Xfinal[,i],treatment,stratum.myindex, MISSfinal[,i]); std.diff.before[i]=temp.stand.diff\$std.diff.before.matching; std.diff.after[i]=temp.stand.diff\$std.diff.after.matching; print(i) } #Check the max absolute standardized difference. It is quite low (below 0.1), so we declare the match successful in reducing overt biases max(abs(std.diff.after)) covname = c("Age", "Gender", "Poverty:Income Ratio (PIR)", "Education", "Weight", "Height", "Recreational Exercise", "Regular Walking", "Moderate Workplace Exertion", "Drinks per Day", "Any Drinks this Year?", "Urinary Creatine", "Mineral Dust Exposure", "Organic Dust Exposure", "Exhaust Fume Exposure", "Other Fume Exposure", "Charred Meat Consumption", "Mexican American", "Other Hispanic", "White", "Black", "Other Race", "PIR MISS", "Weight MISS", "Height MISS", "Drinks per Day MISS", "Any Drinks this Year? MISS", "Mineral Dust MISS", "Organic Dust MISS", "Exhaust Fumes MISS", "Other Fumes MISS", "Charred Meats MISS") #####Make the balance plot (Figure 1 of the manuscript) plotBalancesign(std.diff.before, std.diff.after, covname, titleOfPlot = "Standardized Differences Before and After Matching" ) ############## # 3) Simulation Study ################ #1: Five outcomes, look at overall null library(MASS) nsim = 10000 rbonf = rep(0, nsim) RBONF = rep(0, nsim) RMULT = rep(0, nsim) rmulti = rep(0, nsim) timecode = rep(0, nsim) K=5 mu4 = c(0.3, 0, 0, 0, 0) mu3 = c(0.3, 0.3, 0, 0, 0) mu1 = c(.25, .25, .25, .25, .25) mu2 = c(.25, .25, .25, .25, 0) Sig1 = diag(5) Sig2 = matrix(.5, 5, 5) diag(Sig2) = 1 temp = rep(0, 4) mlist = list(mu1, mu1, mu2, mu2, mu3, mu3, mu4, mu4, mu1, mu1, mu2, mu2, mu3, mu3, mu4, mu4, mu1, mu1, mu2, mu2, mu3, mu3, mu4, mu4) Slist = list(Sig1, Sig2,Sig1, Sig2,Sig1, Sig2,Sig1, Sig2,Sig1, Sig2,Sig1, Sig2,Sig1, Sig2,Sig1, Sig2,Sig1, Sig2, Sig1, Sig2, Sig1, Sig2, Sig1, Sig2) Gvec = c(rep(1.25, 8), rep(1.5,8), rep(1.75, 8)) RBlist = list(RBONF,RBONF,RBONF,RBONF,RBONF,RBONF,RBONF,RBONF,RBONF,RBONF,RBONF,RBONF,RBONF,RBONF,RBONF,RBONF,RBONF,RBONF, RBONF,RBONF,RBONF,RBONF,RBONF,RBONF) RMlist = RBlist for(aa in 1:24) { RBONF = rep(0, nsim) RMULT = rep(0, nsim) Gam = Gvec[aa] I = 250 Sig = Slist[[aa]] mu = mlist[[aa]] for(mm in 1:nsim) { #simulate pairs mat = mvrnorm(I, mu, Sig) PO = rbind(mat, -mat) treatment = rep(c(T,F), each = I) index = rep(1:I, 2) OUT = PO QQ = OUT DIFF = QQ ns = rep(2, I) nostratum = I #Define Huber's M statistic for(j in 1:nostratum) { ind = which(index==j) DIFF[ind[1],] = OUT[ind[1],] - OUT[ind[2],] DIFF[ind[2],] = -DIFF[ind[1],] } sk = apply(abs(DIFF), 2, median) SK = matrix(sk, 2, K, byrow = T) for(j in 1:nostratum) { ind = which(index==j) for(jj in 1:length(ind)) { sg = sign(DIFF[ind,]) ad = abs(DIFF[ind,]/SK) ad[ad > 2.5] = 2.5 QQ[ind,] = sg*ad } } PO = QQ MG = rep(0, K) #Figure out if we can reject the overall null by separately conducting sensitivity analysis for(k in 1:K) { MG[k] = multipleComparisons(index, PO[,k], treatment, alpha = .05/K, Gamma = Gam)\$Reject[1] } if(any(MG ==1)) { RBONF[mm] = 1 } #While timing code, figure out if we can reject the overall null by considering all outcomes simultaneously ptm = proc.time() rm = multipleComparisons(index, PO, treatment, Gamma = Gam)\$Reject[1] if(rm == 1) { RMULT[mm] = 1 } timecode[mm] = (proc.time() - ptm)[3] print(c(aa,mm)) } RBlist[[aa]] = mean(RBONF) RMlist[[aa]] = mean(RMULT) } ###RBlist and RMlist Give the proportions of rejections for both procedures in the order given in Table 1 ############Simulation 2: Individual Tests, K=3 outcomes nsim = 10000 rbonf = rep(0, nsim) RBONF = matrix(0, nsim, 4) RMULT = matrix(0, nsim, 4) rmulti = rep(0, nsim) timecode = rep(0, nsim) K = 3 mu4 = c(0.15, .2, .35) mu3 = c(0.2, .25, .35) mu2 = c(.25, .3, .35) mu1 = c(.2, .225, .25) Sig1 = diag(3) Sig2 = matrix(.5, 3, 3) diag(Sig2) = 1 mlist = list(mu1, mu1, mu2, mu2, mu3, mu3, mu4, mu4, mu1, mu1, mu2, mu2, mu3, mu3, mu4, mu4, mu1, mu1, mu2, mu2, mu3, mu3, mu4, mu4) Slist = list(Sig1, Sig2,Sig1, Sig2,Sig1, Sig2,Sig1, Sig2,Sig1, Sig2,Sig1, Sig2,Sig1, Sig2,Sig1, Sig2,Sig1, Sig2, Sig1, Sig2, Sig1, Sig2, Sig1, Sig2) Gvec = c(rep(1.25, 8), rep(1.375,8), rep(1.5, 8)) RBlist = list(RBONF,RBONF,RBONF,RBONF,RBONF,RBONF,RBONF,RBONF,RBONF,RBONF,RBONF,RBONF,RBONF,RBONF,RBONF,RBONF,RBONF,RBONF, RBONF,RBONF,RBONF,RBONF,RBONF,RBONF) RMlist = RBlist for(aa in 1:24) { RBONF = matrix(0, nsim, 4) RMULT = matrix(0, nsim, 4) Gam = Gvec[aa] I = 250 Sig = Slist[[aa]] mu = mlist[[aa]] for(mm in 1:nsim) { mat = mvrnorm(I, mu, Sig) PO = rbind(mat, -mat) treatment = rep(c(T,F), each = I) index = rep(1:I, 2) OUT = PO QQ = OUT DIFF = QQ ns = rep(2, I) nostratum = I #Define Huber's M statistic for(j in 1:nostratum) { ind = which(index==j) DIFF[ind[1],] = OUT[ind[1],] - OUT[ind[2],] DIFF[ind[2],] = -DIFF[ind[1],] } sk = apply(abs(DIFF), 2, median) SK = matrix(sk, 2, K, byrow = T) for(j in 1:nostratum) { ind = which(index==j) for(jj in 1:length(ind)) { sg = sign(DIFF[ind,]) ad = abs(DIFF[ind,]/SK) ad[ad > 2.5] = 2.5 QQ[ind,] = sg*ad } } PO = QQ MG = rep(0, K) ####Here we use a different approach for a sensitivity analysis, described in a submitted paper which is included as supplementary material. It allows us to get p-values for the sensitivity analysis without using asymptotic separability. Equivalently, one could have coded this based on our function "multipleComparison" as a closed testing procedure. for(k in 1:K) { MG[k] = sensitivity(index, PO[,k], treatment, alpha = .05/K, alternative = "two.sided", Gamma.vec = Gam, calculate.pval = T)\$pval } MGsort = sort(MG) ord = order(MG) if(MGsort[1] < .05/3) { RBONF[mm,ord[1]] = 1 RBONF[mm, 4] = 1 if(MGsort[2] < .05/2) { RBONF[mm,ord[2]] = 1 if(MGsort[3] < .05) { RBONF[mm,ord[3]] = 1 } } } ptm = proc.time() rm = multipleComparisons(index, PO, treatment, Gamma = Gam)\$Reject[1] if(rm == 1) { RMULT[mm,4] = 1 rm12 = multipleComparisons(index, PO[,-3], treatment, Gamma = Gam)\$Reject[1] rm23 = multipleComparisons(index, PO[,-1], treatment, Gamma = Gam)\$Reject[1] rm13 = multipleComparisons(index, PO[,-2], treatment, Gamma = Gam)\$Reject[1] if(rm12 == 1 & rm13 == 1) { RMULT[mm,1] = (MG[1] < .05) } if(rm12 == 1 & rm23 == 1) { RMULT[mm,2] = (MG[2] < .05) } if(rm13 == 1 & rm23 == 1) { RMULT[mm,3] = (MG[3] < .05) } } timecode[mm] = (proc.time() - ptm)[3] print(c(aa,mm)) } RBlist[[aa]] = colMeans(RBONF) RMlist[[aa]] = colMeans(RMULT) } ###RBlist and RMlist contain the estimated power for each situation, in the order of table 2