## Preamble, getting the data ready: ## Pulling out the first 82 SNPS : zcat CEU.low_coverage.2010_09.genotypes.vcf.gz | head -n 100 > temp.txt & ## Converting first 82 to IMPUTE format: vcftools --IMPUTE --vcf temp.txt ## Output files: ## out.impute.hap ## out.impute.hap.indv ## out.impute.legend ## (out.log) if(0) { source("runTreesimForPaper.r") reload() results = timeTreesim(machine="juggler") # reload(); results = timeTreesim(machine="juggler", run=TRUE, pause=FALSE) } #overRides = c("files$hapFname = \"/home/niall/Desktop/Documents/refData/1000genomes/subsetted.impute.haps_inds1-15.txt\"") # reload(); results = runTreesim(machine=machine, run=TRUE, pause=FALSE, calcLikelihood=FALSE, overRides=overRides, model="SMC", numRuns="1", firstNonrec="false", displayEvents="false", maxNumHaps = 2000, treesimVerbose=1.9, backGround = FALSE , fTag1=fTag1, fTag2=fTag2, numTreesimCalls=1, lowerPos=1e7, upperPos=1.001e7) # reload(); results = timeTreesim(machine="juggler") timeTreesim=function(machine="joels", likelihoodCurve=FALSE, largerData=TRUE, run=TRUE, pause=FALSE, upperPositions = makeRunDetails()$upperPositions, numHapsList = makeRunDetails()$numHapsList ) { files = getFileInfo(machine=machine, chromosome=NA) # Want to use small data, made the total genMap Dist = 1, kept things on the physical distance map. if(largerData) { fTag1 = "thouGenChr21HapGenSim.out.controls." fTag2 = "" #haps overRides = c("files$hapFname = paste(files$baseDir, \"refData/simDat/thouGenChr21HapGenSim.out.controls.haps\", sep=\"\")", "files$legFname = paste(files$baseDir, \"refData/1000genomes/allChrom21CEU.impute.legend\" , sep=\"\")", "files$genMapFname = paste(files$mapDir , \"hapmapFormatGenMap_thousandGmap_chr21.txt.gz\" , sep=\"\")" ) rho=0.1 results=c() for(model in c("SMC", "Coal")) { for(upperPos in upperPositions) { for(firstNonrec in c("true", "false")) { # Decided that I don't need to distinguish coalFast+FirstNonrec from coalFast-FirstNonrec for the paper coalFast = firstNonrec for(numHaps in numHapsList) { cat(" Running treesim on: ", model, "_", upperPos, "_fnr=", firstNonrec, "_", numHaps, "haps...", sep="") results = c(results, runTreesim(machine=machine, run=TRUE, pause=pause, calcLikelihood=FALSE, overRides=overRides , model=model, numRuns="1", firstNonrec=firstNonrec , coalFast=coalFast , displayEvents="false" , maxNumHaps = numHaps , rho=rho , treesimVerbose=1.9 , backGround = FALSE , buffer=0, fTag1=fTag1, fTag2=fTag2, numTreesimCalls=1, lowerPos="10000000", upperPos=upperPos) , sep="_nextrun_") cat("Done.\n") } } } } } if(likelihoodCurve) { overRides = c("files$hapFname = paste(files$datDir, \"subsetted.impute.haps_inds1-15_sites1-10.txt\", sep=\"\")", "files$legFname = paste(files$datDir, \"subsetted.impute.leg_sites1-10.txt\" , sep=\"\")", "files$genMapFname = paste(files$datDir, \"fakeGenMapFile_10sites.txt\" , sep=\"\")" ) runTreesim(machine=machine, run=FALSE, pause=FALSE, calcLikelihood=TRUE, overRides=overRides, model="SMC", numRuns="100000") } return(results) } # reload(); summariseTreeSimTimes(machine="juggler") summariseTreeSimTimes=function(machine="joels", seed=1, numRuns=1, verbose=1, upperPositions=makeRunDetails()$upperPositions, numHapsList = makeRunDetails()$numHapsList ) { #timeSummaries=matrix(NA, ncol=length(cols), nrow=length(rows)) coalFastAndNonRecs = c("true", "false") timeSummaries=array(NA, dim=c("coalFastAndNonRec"=length(coalFastAndNonRecs), #"Hap1st" =2, #"coalFast" =2, #"nonRec" =2, #"model" =2, "upperPos" =length(upperPositions), "numHaps" =length(numHapsList) )) dimnames(timeSummaries) = list("coalFastAndNonRec"=coalFastAndNonRecs, "upperPositions" =upperPositions , "numHapsList" =numHapsList ) #Also: could have loops for coalFast (though see below at (!)), and Hap1st for(model in c("SMC")) #c("SMC", "Coal")) { for(upperPos in upperPositions) { for(firstNonrec in coalFastAndNonRecs) { coalFast = firstNonrec #(!) for(maxNumHaps in numHapsList) { if(verbose>1.5) cat(" Running treesim on: ", model, "_", upperPos, "_fnr=", firstNonrec, "_", maxNumHaps, "haps...", sep="") files = getFileInfo(machine=machine, chromosome=NA, fTag1=NA, fTag2=NA) fileDetails = list("treeDir"=files$treeDir, "seed"=seed, "model"=model, "numRuns"=numRuns, "upperPos"=upperPos, "firstNonrec"=firstNonrec, "coalFast"=coalFast, "maxNumHaps"=maxNumHaps) oFiles = makeOutFilenames(details=fileDetails) outFile = oFiles$outFile timeOut = oFiles$timeOut timeLines = readLines(timeOut) # grep(pattern="User time (", timeLines) # !! <-- causes a weird error, post this? wantedLine = grep(pattern="User time ", timeLines) time = as.numeric(strsplit(timeLines[wantedLine], split=":")[[1]][2]) timeSummaries[firstNonrec, upperPos, as.character(maxNumHaps)] = time #d = fileDetails cols = c("n"=maxNumHaps, "u"=upperPos, model) rows = c("f="=firstNonrec) shortName = paste( paste(names(cols), cols, sep="", collapse=","), paste(names(rows), rows, sep="", collapse=","), "s", seed , ",", sep="") if(verbose>1.5) { cat("Done\n", sep="") print(aperm(timeSummaries, c(3,2,1))) } matResults = rbind(timeSummaries[1,,], timeSummaries[2,,]) rownames(matResults) = paste(rep(x=coalFastAndNonRecs, each=length(upperPositions)), names(upperPositions), sep="_") #rownames(matResults) } } } } # inData = matrix(1:9, 3); colnames(inData) = c("frank", "barry", "white") inData = cbind(rownames(matResults), matResults) makeLatexTable(header=colnames(matResults), inData=inData, tableFn=paste("/home/niall/Documents/myPapers/treesimPaper/treesAndResults/", "treesimTimingResults5Jan_LaTeXtable.tex", sep=""), pos="l") browser() } runTreesim=function(machine="juggler", run=TRUE, pause=TRUE, DEBUG=2, calcLikelihood=FALSE, buffer=10, numTreesimCalls=4, firstNonrec="false", coalFast="true", displayEvents="false", fTag1=NULL, fTag2=NULL, maxNumHaps=-1, rho=1e-9, treesimVerbose=2, outToFile=TRUE, overRides=NULL, model="SMC", numRuns=NULL, backGround=numTreesimCalls>1, lowerPos=1e7, upperPos=1.001e7) { results = "no results returned" for(seed in 1:numTreesimCalls) { com = prepareTreeSimLine(machine=machine, DEBUG=DEBUG, calcLikelihood=calcLikelihood, buffer=buffer, firstNonrec=firstNonrec, coalFast=coalFast, displayEvents=displayEvents, maxNumHaps=maxNumHaps, rho=rho, treesimVerbose=treesimVerbose, overRides=overRides, fTag1=fTag1, fTag2=fTag2, model=model, numRuns=numRuns, seed=seed, lowerPos=lowerPos, upperPos=upperPos) files = getFileInfo(machine=machine, chromosome=NA, fTag1=NA, fTag2=NA) fileDetails = list("treeDir"=files$treeDir, "seed"=seed, "model"=model, "numRuns"=numRuns, "upperPos"=upperPos, "firstNonrec"=firstNonrec, "maxNumHaps"=maxNumHaps) oFiles = makeOutFilenames(details=fileDetails) outFile = oFiles$outFile timeOut = oFiles$timeOut if(outToFile) com = paste(com, " > ", outFile, sep="") if(backGround) { com = paste(com, " & ", sep="") intern=FALSE } else { timeOut = paste(outFile, ".time", sep="") com = paste("time -v -o ", timeOut, " ", com, sep="") intern=TRUE } if(run) { results = system(com, intern=intern) } else { cat("\n", com, "\n\n") } } if(pause) browser() return(results) } makeOutFilenames=function(details) { d=details outFile = paste(d$treeDir, "treeSimProgress_seed", d$seed , "_" , d$model , "_numRuns" , d$numRuns , "_upperPos", d$upperPos , "_firstNonrec_" , d$firstNonrec, "_haps_" , d$maxNumHaps, ".out", sep="") timeOut = paste(outFile, ".time", sep="") rv = list("outFile"=outFile, "timeOut"=timeOut) return(rv) } makeRunDetails=function() { upperPositions = c("6SNPs"="10005000", "66SNPs"="10050000", "178SNPs"="10500000") numHapsList = c(20, 120, 2000) rv = list("upperPositions" = upperPositions, "numHapsList" = numHapsList ) return(rv) } ## (!!) # Does treesim read in the edge sites? ie the SNP at 124 if 124 is the lower bound? ## treesim line to make trees for this data: prepareTreeSimLine=function(machine="juggler", DEBUG=1.5, verbose=1.5, calcLikelihood = FALSE, buffer=10, pause=DEBUG>2, firstNonrec="false", coalFast="true", chooseNonRec=firstNonrec, #Don't need to distinguish chooseNonRec from firstNonrec here displayEvents="false", treesimVerbose=2, maxNumHaps=-1, rho=1e-9, overRides=NULL, model="SMC", fTag1=NULL, fTag2=NULL, numRuns=NULL, seed=NULL, lowerPos = 1000000 -0 , upperPos = 1010000 +0 ) { # Settign random seed: if(is.null(seed)) { cat(" Warning, no seed given so setting to 1, don't do multiple identical runs!\n") seed = 1 } # data details: chromosome = 1 #required to identify the right genetic map file. # File locations files = getFileInfo(machine, chromosome=chromosome, fTag1=fTag1, fTag2=fTag2) # Treesim arguments: # Calculate likelihood or not? if(calcLikelihood) { # These arguments go slowly to create the best genealogies the program can (for good likelihoods) cheapCoal = "false" if(is.null(numRuns)) numRuns = "10000" numRhoVals = 10 greedyMLE = "false" useLogs = "false" # There's no need with small data, and it's faster. writeLikelihoods = "true" } else { # These arguments go quickly to copy with large data cheapCoal = "false" # though it has been set to true, I now forget the performance. firstNonrec = firstNonrec if(is.null(numRuns)) numRuns = 1 numRhoVals = 0 if(numRuns==1) greedyMLE = "true" else greedyMLE = "false" useLogs = "true" writeLikelihoods = "false" } hapFirst = "true"; DEBUG = 1.5 #1.5 if(numRuns==1) { writeTrees = "true" collectTimes = "true" } else { writeTrees = "false" # Might want to write trees in this case, but need to consider that. collectTimes = "false" } if(model=="SMC") SMC="true" else SMC="false" runOutputTag = paste("timingLargeTrees_chr21_lower1e7_seed" , seed , "_upperPos", upperPos , "_firstNonrec_", firstNonrec, sep="") if(length(overRides)>0 ) { if(verbose>1.5 | DEBUG>1.5) cat(" Warning, about to directly override argument control: \n") for(arg in 1:length(overRides)) { #com = paste(names(overRides)[arg], " = ", [[arg]] eval(parse(text=overRides[arg])) } } # Creating the treesim line treesimLine = paste( files$treesimLoc , "-haplotypes " , files$hapFname , "-legend " , files$legFname , "-genMap " , files$genMapFname, # (!!) Need to check that the genetic map is the right build! Where can I get the 1000g one? "-lower " , lowerPos , "-upper " , upperPos , "-buffer " , buffer , # (!) What's going to happen as this buffer goes outside the range of the data? Should be fine? "-debugMaxNumHaps " , maxNumHaps , "-theta " , 1e-9 , #Not used as I'm setting program to use the Watterson estimator "-using_wattersons " , "true" , "-rho " , rho , "-SMC " , SMC , "-no_runs " , numRuns , "-greedy_mle " , greedyMLE , "-choose_hap_first " , hapFirst , "-coal_fast " , coalFast , "-choose_nonrec " , chooseNonRec , "-first_nonrec " , firstNonrec , "-cheap_coal_upwt " , cheapCoal , "-change_rho " , numRhoVals , "-display_events " , displayEvents , "-use_logs " , useLogs , "-outputDir " , files$treeDir , "-file_tag " , runOutputTag , "-write_trees " , writeTrees , "-write_likelihoods " , writeLikelihoods , "-collect_times " , collectTimes , "-set_seed " , seed , "-DEBUG " , DEBUG , "-verbose " , treesimVerbose , sep=" " # sep=" \\ \n" - if I want the command broken over multiple lines? Or do that explicitly? ) # In general, what happens if you ask for SNPs that aren't there? It's fine, no warning, but no answers either? } # source("randomTools.r"); preLoad(); source(paste(getScriptFnames(machine=machine)$.scriptsRoot, "runTreesimForPaper.r",sep="")) # source("runTreesimForPaper.r") reload=function() { if(!exists("convertGenMapToHapMap")) source("doJobsWithR.r") source("randomTools.r") source("mathFns.r") source("runTreesimForPaper.r") preLoad() } ## treesim line to make trees for this data: getFileInfo=function(machine="juggler" , fTag1 = NULL , fTag2 = NULL , chromosome = "chrNotSupplied" ) { if(is.null(fTag1)) fTag1 = "subsetSites" if(is.null(fTag2)) fTag2 = ".impute." # File locations if(machine=="juggler") { baseDir = "/home/niall/Documents/" treesimTag = "programming/eclipse/" # /home/niall/Documents/programming/eclipse imputeF1000gTag = "refData/1000genomes/" # this is the specific sub-directory that the impute format 1000genomes data that I've made is held in. #genMapTag = "refData/hapmap_geneticMap/" # this is the specific sub-directory that holds the genetic map file (1000g genMap has complicated format) thousGMdir = "refData/1000genomesGenMap/" treeOutTag = "myPapers/treesimPaper/treesAndResults/" # this is the specific sub-directory that holds the genetic map file } else if(machine=="lodz") { baseDir = "/data/vault1/cardin/" treesimTag = "runTreesim/" imputeF1000gTag = "1000genomes/processedData/" #genMapTag = "refData/hapmap_geneticMap/" #"1000genomes/geneticMap/" isn't formatted the right way yet #"refData/hapmap_geneticMap/" has expected format thousGMdir = "refData/1000genomesGenMap/" treeOutTag = "myResults/treesim/" } else if(machine=="joelsWork") { baseDir = "/Users/wittelab/cardin/" treesimTag = "runTreesim/" imputeF1000gTag = "refData/1000genomes/" #genMapTag = "refData/hapmap_geneticMap/" #"1000genomes/geneticMap/" isn't formatted the right way yet #"refData/hapmap_geneticMap/" has expected format thousGMdir = "refData/1000genomesGenMap/" treeOutTag = "myResults/treesim/" } else { cat(" Machine, ", machine, ", not recognised.\n", sep="") browser() } treesimLoc = paste(baseDir, treesimTag, "treesim", sep="") datDir = paste(baseDir, imputeF1000gTag , sep="") mapDir = paste(baseDir, thousGMdir , sep="") treeDir = paste(baseDir, treeOutTag , sep="") hapFname = paste(datDir, fTag1, fTag2, "hap" , sep="") legFname = paste(datDir, fTag1, fTag2, "legend", sep="") genMapFname = paste(mapDir, "genetic_map_chr", chromosome, "_b36.txt", sep="") rv = list("baseDir" = baseDir , "treesimLoc" = treesimLoc , "datDir" = datDir , "mapDir" = mapDir , "thousGMdir" = thousGMdir , "treeDir" = treeDir , "hapFname" = hapFname , "legFname" = legFname , "genMapFname" = genMapFname) return(rv) } subsetImputeFiles=function(machine="lodz", DEBUG=1, inds=1:15, sites=1:10) { files = getFileInfo(machine=machine, fTag1="subsetSites", chromosome=1) hapsFname = files$hapFname legFname = files$legFname haps = read.table(hapsFname, as.is=TRUE, header=FALSE) legend = read.table(legFname , as.is=TRUE, header=TRUE ) outHapsFname = paste(files$datDir, "subsetted.impute.haps", sep="") if(!is.null(inds)) { cat(" Subsetting by individuals:\n") subsetHaps = haps[, inds, drop=FALSE] outHapsFname = paste( outHapsFname, "_inds", min(inds), "-", max(inds), sep="") } else { subsetHaps = haps } if(!is.null(sites)) { cat(" Subsetting by sites:\n") subsetHaps = subsetHaps[sites, , drop=FALSE] subsetLeg = legend[ sites, , drop=FALSE] outHapsFname = paste(outHapsFname, "_sites", min(sites), "-", max(sites), ".txt", sep="") outLegFname = paste(files$datDir, "subsetted.impute.leg", "_sites", min(sites), "-", max(sites), ".txt", sep="") write.table(subsetLeg, file=outLegFname, quote=FALSE, row.names=FALSE, col.names=TRUE) } else { outHapsFname = paste(outHapsFname, ".txt", sep="") subsetHaps = subsetHaps } write.table(subsetHaps, file=outHapsFname, quote=FALSE, row.names=FALSE, col.names=FALSE) } # reload(); rv = combineMultipleTreeSimRuns() combineMultipleTreeSimRuns=function(numRuns="100000", model= "SMC", numReps=4, analysisTag="unitGenMapMultiRun", DEBUG=1.9, pause=DEBUG>=2) { treesimOuts = as.list(rep(NA, numReps)) names(treesimOuts) = paste("rep", 1:numReps, sep="") for(rep in 1:numReps) { analysisName = paste(analysisTag, rep, sep="") treesimOuts[[rep]] = parseOutput(numRuns=numRuns, model=model, pause=FALSE, tag = analysisName) } rhos = as.numeric(treesimOuts[[1]]$basicResults[, "Rho" ]) basicResults = sapply(treesimOuts, '[', "basicResults") #, "AverageWeight"]) logLikes = matrix(unlist(sapply(basicResults, '[', "AverageWeight")), ncol=length(rhos), byrow=TRUE) colnames(logLikes) = rhos ESSes = matrix(unlist(sapply(basicResults, '[', "ESS" )), ncol=length(rhos), byrow=TRUE) colnames(ESSes) = rhos combStats = combineAllValuesOfRho( rhos = rhos , numRunsVec = as.numeric(rep(numRuns, numReps)), LikelihoodsMat = logLikes , ESSesMat = ESSes , logLikes = TRUE , realVals = list() ) bridgeRhos = as.numeric(treesimOuts[[1]]$bridgeRhos) bridgeCurves = sapply(treesimOuts, '[', "bridgeCurves") bridgeESSes = sapply(treesimOuts, '[', "bridgeESSs" ) bridgeCombStats = as.list(rep(NA, length(rhos))) for(drivingValue in 1:length(rhos)) { bridgeLogLikes = matrix(unlist(sapply(bridgeCurves, '[', as.character(rhos[drivingValue]))), ncol=length(bridgeRhos), byrow=TRUE) names(bridgeLogLikes) = bridgeRhos bridgeESSesAtDV = matrix(unlist(sapply(bridgeESSes , '[', as.character(rhos[drivingValue]))), ncol=length(bridgeRhos), byrow=TRUE) colnames(bridgeESSesAtDV) = bridgeRhos bridgeCombStats[[drivingValue]] = combineAllValuesOfRho( rhos = bridgeRhos , numRunsVec = as.numeric(rep(numRuns, numReps)), LikelihoodsMat = bridgeLogLikes , ESSesMat = bridgeESSesAtDV , logLikes = TRUE , realVals = list() ) } # Collect info into structure required for z combStats = cbind(as.numeric(rownames(combStats)), combStats) colnames(combStats)[1] = "Rho" bridgeCurves = lapply(bridgeCombStats, function(y) y[,1]) names(bridgeCurves) = rownames(combStats) totNumRuns = as.character(as.numeric(numRuns) * numReps) parsedDat = treesimOuts[[1]] parsedDat[["basicResults"]] = combStats parsedDat[["numRuns" ]] = totNumRuns parsedDat[["tag" ]] = analysisTag parsedDat[["bridgeCurves"]] = bridgeCurves parsedDat[["bridgeESSs" ]] = "notPassed" # Plot the now parsed and collated outputs: displayTreesimLikelihoods(parsedDat, png=TRUE, pause=FALSE, fadeRate=1, plotLogRho=FALSE, removeZero=TRUE, likeColName="lLikelihood") if(pause) browser() rv = list("parsedDat" = parsedDat , "mainResults" = combStats , "bridgeResults" = bridgeCombStats ) return(rv) } # source("runTreesimForPaper.r"); parseAndDisplayOutput(parsedDat=parsedDat, numRuns=500) displayTreesimLikelihoods=function(parsedDat, png=FALSE, pause=TRUE, fadeRate=1, maxOpac=60, lwd=1.75, pngWidth=800+(plotLogRho*500), pngHeight=800, removeZero=FALSE, addYmax=1, likeColName="AverageWeight", plotLogRho=FALSE, cex.main=2.0, cex.lab=cex.main-0.5, cex.axis=cex.main-0.5) { p = parsedDat numRhos = nrow(p$basicResults) if(png) { plotFname = paste(p$resultsDir, "likelihoodCurves", "_numSites=", p$numSites , "_numInds=" , p$numInds , "_numRuns=" , p$numRuns , "_Data_Set=", p$dataSet , "_Analysis=", p$tag , ".png" , sep="" ) png(file=plotFname, width=pngWidth, height=pngHeight) } if(removeZero) plotPts = (1:nrow(p$basicResults))[-1] else plotPts = (1:nrow(p$basicResults)) ylim = c(min(p$basicResults[plotPts, likeColName]), max(p$basicResults[plotPts, likeColName])+addYmax) ylab = "log likelihood" par(mfcol=c(1,1+plotLogRho)) if(plotLogRho) { plot( log(p$basicResults[plotPts,"Rho"]), p$basicResults[plotPts, likeColName], xlab = "log(Rho)", typ="l", col="red", ylab=ylab, cex.main=cex.main, cex.lab=cex.lab, cex.axis=cex.axis, main = "Profile log likelihood curve for log(rho)", lwd=lwd, ylim=ylim) points(log(p$basicResults[plotPts,"Rho"]), p$basicResults[plotPts, likeColName], pch="x") for(rhoIndex in 1:numRhos) { rho = p$basicResults[rhoIndex,"Rho"] charRho = as.character(rho) efficiency = maxOpac * (pmin((p$bridgeRhos/rho) ,(rho/p$bridgeRhos) ))^(fadeRate) tmp = paste("0", ceiling(efficiency), sep="") colStrength = substr(tmp, start=nchar(tmp)-1, stop=nchar(tmp)) cols = paste("#0000FF", colStrength, sep="") #lines(log(p$bridgeRhos), p$bridgeCurves[[charRho]], cols) #lines can't handle multiple colours multiColLines(x=log(p$bridgeRhos), y=p$bridgeCurves[[charRho]], col=cols, chop="first", lwd=lwd, score=efficiency) } } plot((p$basicResults[plotPts,"Rho"]), p$basicResults[plotPts, likeColName], xlab = "Rho", typ="l", col="red", ylab=ylab, cex.main=cex.main, cex.lab=cex.lab, cex.axis=cex.axis, main = "Profile log likelihood curve for rho", lwd=lwd, ylim=ylim) points((p$basicResults[plotPts,"Rho"]), p$basicResults[plotPts, likeColName], pch="x") for(rhoIndex in 1:numRhos) { rho = p$basicResults[rhoIndex,"Rho"] charRho = as.character(rho) efficiency = maxOpac * (pmin((p$bridgeRhos/rho) ,(rho/p$bridgeRhos) ))^(fadeRate) tmp = paste("0", ceiling(efficiency), sep="") colStrength = substr(tmp, start=nchar(tmp)-1, stop=nchar(tmp)) cols = paste("#0000FF", colStrength, sep="") #lines(log(p$bridgeRhos), p$bridgeCurves[[charRho]], cols) #lines can't handle multiple colours multiColLines(x=p$bridgeRhos, y=p$bridgeCurves[[charRho]], col=cols, chop="first", lwd=lwd, score=efficiency) } if(png) dev.off() if(pause) browser() } # reload(); testCombiningCode() testCombiningCode=function(logLikes=TRUE) { # Test my ESS and likelihood code: # This code assumes the same number of runs each time - don't need to be too general though!!! numReps = 3 numRhos = 4 numRuns = 5 rhos = as.character((1:numRhos)+0.12) # Ensure that the raw likelihoods are positive allLikelihoods = exp(array(rnorm(numReps* numRhos *numReps), dim=c(numReps, numRhos, numRuns))) #allLikelihoods = array(1:(numReps* numRhos *numReps), # dim=c(numReps, numRhos, numRuns)) dimnames(allLikelihoods)[[2]] = rhos meanLikes = apply(allLikelihoods , MARGIN=c(1,2), mean) colnames(meanLikes) = rhos # Naive calculation of the combined runs values inOneGoLikes = matrix(NA, nrow=0, ncol=ncol(allLikelihoods)) for(i in 1:dim(allLikelihoods)[3]) inOneGoLikes = rbind(inOneGoLikes, allLikelihoods[,,i,drop=TRUE]) iogMeanLikes = apply(inOneGoLikes , MARGIN=2, mean) iogSumSqLikes = apply(inOneGoLikes^2, MARGIN=2, sum ) iogMeanSqLikes = apply(inOneGoLikes^2, MARGIN=2, mean) iogVarLikes = iogMeanSqLikes - iogMeanLikes^2 iogCoefOfVarSq = iogVarLikes/iogMeanLikes^2 iogESSes = nrow(inOneGoLikes)/(1+iogCoefOfVarSq) names(iogESSes) = rhos names(iogMeanLikes) = rhos #vec=c(1,7); N=length(vec); sumSq=((N-1)/N)*var(vec) + mean(vec)^2 # Creating the half-way values that treesim would output, to check my functions meanSqLikes = apply(allLikelihoods^2, MARGIN=c(1,2), mean) varLikes = meanSqLikes - meanLikes^2 coefOfVarSq = varLikes/meanLikes^2 ESSes = numRuns/(1+coefOfVarSq) colnames(ESSes) = rhos if(logLikes) { inLikes = log(meanLikes) } else { inLikes = meanLikes } combStats = combineAllValuesOfRho( rhos = rhos , numRunsVec = rep(numRuns, numReps), LikelihoodsMat = inLikes , ESSesMat = ESSes , logLikes = logLikes , realVals = list("iogMeanSqLikes" = iogMeanSqLikes, "iogESSes" = iogESSes , "iogMeanLikes" = iogMeanLikes , "iogSumSqLikes" = iogSumSqLikes , "meanLikes" = meanLikes , "meanSqLikes" = meanSqLikes , "varLikes" = varLikes ) ) cat(" Stats from combining code:\n") print(combStats) cat(" Stats from doing it in one go (iog):\n") print(cbind(iogMeanLikes, iogESSes)) browser() } # To combine multiple runs of the program want to combine the ESS to get estimates of overall ESS # ESS = N/(1+V_c^2), where V_c=coefficient of variation. essTosumSq=function(N, mean, ess, logLikes=TRUE, returnAll=TRUE, DEBUG=1+returnAll, tol=1e-10, extra=list()) { #sumSq = (N*mean)*(N/ess - 2) # <---- utter nonsense #sumSq = mean * (N/ess - 1 + mean/N) # <---- more nonsense #sumLSq = (mean*N)^2 if(is.na(sum(c(N, mean, ess)))) { cat(" Error, na input in essTosumSq.\n") browser() } if(logLikes) { # Note that in this case mean=log(mean(likelihood estimates)) simpleLsumSq = log((N*exp(mean))^2/ess) lSumSq = 2*(mean + log(N)) - log(ess) checkCalc = abs(simpleLsumSq-lSumSq)> sumSq calc may be wrong.\n") browser() } return(lSumSq) } else { sumSq = (N*mean)^2/ess if((sumSq/N) < mean^2) { cat(" Error, cannot have mean(squares of real numbers) less than mean^2.\n") browser() } if(returnAll | (DEBUG>=2)) { VcSq = (N/ess-1) #This has a minimum at zero when all measurements are the same (hence N==E) var = VcSq*(mean^2) checkSumSq = N*(var + mean^2) if(!abs(checkSumSq-sumSq)0 & DEBUG>2) { print(extra) #cat("\n") #for(i in 1:length(extra)) { # cat((names(extra)[i]), ": \n") # print(extra[[i]]) #} } if(DEBUG>1) browser() if(returnAll) return(list("sumSq"=sumSq, "VcSq" =VcSq , "var" =var , "checkSumSq"=checkSumSq)) } } return(sumSq) } cat(" Uh oh... unexpected situation.\n") browser() return(NA) } calcEss=function(N, mean, sumSq, biasCorrect=FALSE, logLikes=TRUE) { if(logLikes) { cat(" ESS from log-likelihoods not yet implemented!\n") browser() } else { #sd = sqrt(sumSq-mean^2) var = (sumSq/N)-mean^2 if(biasCorrect) var = (N)/(N-1) * var ess = N/(1+(var/mean^2)) } cat(" Function has not been tested, check answers!\n") browser() return(ess) } # This function combines the stats for a single value of rho combineStats=function(sSizes, Likelihoods, ESSes, logLikes=TRUE, realVals=list(), rhoIndex=NULL, rhos=NULL, tol=1e-10) { if(logLikes) { #Likelihoods are actually logs of (sums of) likelihoods in this case lLikelihood = addLog2(Likelihoods+log(sSizes)) - log(sum(sSizes)) likelihood = exp(lLikelihood) # Calculate the ESS combined accross each execution of treesim lSumSqs = mapply(FUN=essTosumSq, N=sSizes, mean=Likelihoods, ess=ESSes, MoreArgs=list("extra"=realVals, "logLikes"=logLikes)) lTotSumSq = addLog2(lSumSqs) tmp = addLog2(lSumSqs)-log(sum(sSizes)) lTotVar = subtractLog(tmp, 2*lLikelihood) #((1/sum(sSizes))*totSumSq) - likelihood^2 CofVarSq = exp(lTotVar-(2*lLikelihood)) totSumSq = exp(lTotSumSq) totVar = exp(lTotVar) } else { # Calculate the likelihood combined accross each execution of treesim likelihood = (1/sum(sSizes)) * sum(Likelihoods*sSizes) lLikelihood = log(likelihood) # Calculate the ESS combined accross each execution of treesim sumSqs = mapply(FUN=essTosumSq, N=sSizes, mean=Likelihoods, ess=ESSes, MoreArgs=list("extra"=realVals, "logLikes"=logLikes)) totSumSq = sum(sumSqs) totVar = ((1/sum(sSizes))*totSumSq) - likelihood^2 CofVarSq = totVar/likelihood^2 } ess = sum(sSizes)/(1+CofVarSq) if(length(realVals)>0) { if(abs(totSumSq-realVals[["iogSumSqLikes"]][rhoIndex])>tol) { cat(" Why don't the values agree?\n") cat(" Combined sumSq = ", totSumSq, ", from \'in one go\':", realVals[["iogSumSqLikes"]][rhoIndex], "\n") browser() } } rv = list("likelihood" = likelihood , "lLikelihood" = lLikelihood, "ess" = ess , "totSumSq" = totSumSq , "totVar" = totVar , "CofVarSq" = CofVarSq ) return(rv) } combineAllValuesOfRho=function(LikelihoodsMat, ESSesMat, numRunsVec, logLikes=TRUE, rhos, realVals=list()) { checkDims = identical(rhos, colnames(LikelihoodsMat)) combResults = matrix(NA, ncol=2, nrow=length(rhos)) dimnames(combResults) = list(rhos, c("lLikelihood", "ess")) for(rhoIndex in 1:length(rhos)) { tmp = unlist(combineStats( sSizes = numRunsVec , Likelihoods = LikelihoodsMat[, rhoIndex], ESSes = ESSesMat[, rhoIndex] , realVals = realVals , rhoIndex = rhoIndex , logLikes = logLikes , rhos = rhos )) #combResults[rhoIndex, ] = tmp[c(" likelihood", "ess")] combResults[rhoIndex, ] = tmp[c("lLikelihood", "ess")] #browser() } return(combResults) } # source("runTreesimForPaper.r"); parseOutput(numRuns=90001, png=TRUE, model="Coal", pause=TRUE) parseOutput=function(machine="juggler", examineLikelihoods=TRUE, numInds=15, numRuns=50000, tag="test", dataSet=1, model="SMC", numSites=10, pause=FALSE) { files = getFileInfo(machine=machine) resultsDir = files$treeDir #"/home/niall/Desktop/Documents/myPapers/treesimPaper/treesAndResults/", "/home/niall/Documents/Rprograms/" inFname = paste(resultsDir , "Likelihoods_", model , "_numSites=", numSites , "_numInds=" , numInds , "_numRuns=" , numRuns , "_Data_Set=", dataSet , "_Analysis=", tag , ".txt" , sep="" ) lines = readLines(inFname) posnsLine = which(lines=="The site positions were at: ")+1 posns = as.numeric(strsplit(lines[posnsLine], split=" ")[[1]]) cat(" Reading in main results...") resultsStart = which(sapply(lines, substr, start=1, stop=10)==substr("Rho\t\tAverage Weight", start=1, stop=10)) resultsEnd = which(lines=="")[which(lines=="")>resultsStart][1]-1 numRhos = resultsEnd-resultsStart basicResults = read.table(inFname, as.is=TRUE, header=TRUE, skip=resultsStart-1, nrow=numRhos, sep="") cat(" Done\n") cat(" Reading in bridge sampling results...") bridgeSstart = which(lines=="Bridge Sampling likelihoods: ")+1 bridgeRhos = as.numeric(unlist(read.table(inFname, as.is=TRUE, header=FALSE, skip=bridgeSstart, nrow=1, sep="")[-1])) names(bridgeRhos) = NULL bridgeCurves = as.list(rep(NA, numRhos)) names(bridgeCurves) = basicResults[,"Rho"] bridgeESSs = bridgeCurves currentLine = bridgeSstart for(rhoIndex in 1:numRhos) { rho = as.character(basicResults[rhoIndex,"Rho"]) tmp = sapply(lines[currentLine:(currentLine+10)], strsplit, split=":") firstWords = sapply(tmp, '[', 1) names(firstWords)=NULL likelihoodLine = currentLine + min(which(firstWords=="Likelihood")-1) ESSline = currentLine + min(which(firstWords=="ESS")-1) bridgeCurves[[rho]] = as.numeric(unlist(read.table(inFname, as.is=TRUE, header=FALSE, skip=likelihoodLine-1, nrow=1, sep="")[-1])) bridgeESSs[[ rho]] = as.numeric(unlist(read.table(inFname, as.is=TRUE, header=FALSE, skip=ESSline -1, nrow=1, sep="")[-1])) names(bridgeCurves[[rho]]) = bridgeRhos names(bridgeESSs[[ rho]]) = bridgeRhos if(rhoIndex < numRhos) currentLine = currentLine + min(which(substr(firstWords[-1], 1, 6)=="Values")) } cat(" Done\n") rv = list("resultsDir" = resultsDir , "tag" = tag , "numSites" = numSites , "numInds" = numInds , "numRuns" = numRuns , "dataSet" = dataSet , "basicResults" = basicResults, "bridgeRhos" = bridgeRhos, "bridgeCurves" = bridgeCurves, "bridgeESSs" = bridgeESSs ) if(pause) browser() return(rv) } if(0) { # Old Eclipse Line # -haplotypes /home/niall/Documents/programming/eclipse/testFiles/testHaps1.txt -legend /home/niall/Documents/programming/eclipse/testFiles/testLegend1.txt -genMap /home/niall/Documents/programming/eclipse/testFiles/testMap1.txt -upper 10005000 -lower 10000000 -buffer 1 -Ne 1e4 -debugMaxNumHaps 10 -display_haps false -display_events false -writeType "genecluster" -outputDir /home/niall/Documents/programming/eclipse/test_output/ # -haplotypes /home/niall/Documents/refData/1000genomes/test.impute.hap -legend /home/niall/Documents/refData/1000genomes/test.impute.legend -genMap /home/niall/Documents/refData/hapmap_geneticMap/genetic_map_chr1_b36.txt -lower 533 -upper 746577 -buffer 50 -outputDir /home/niall/Documents/myPapers/treesimPaper/treesAndResults/ -DEBUG 1 # -haplotypes /home/niall/Documents/refData/1000genomes/subsetted.impute.haps_inds1-15.txt -legend /home/niall/Documents/refData/1000genomes/subsetSites.impute.legend -genMap /home/niall/Documents/refData/hapmap_geneticMap/genetic_map_chr1_b36.txt -lower 1000000 -upper 1010000 -buffer 50 -outputDir /home/niall/Documents/myPapers/treesimPaper/treesAndResults/ -DEBUG 1.9999 -rho 0.001 -choose_hap_first TRUE -coal_fast TRUE -greedy_mle TRUE -cheap_coal_upwt FALSE -choose_nonrec TRUE -no_runs 1 -change_rho 0 -DEBUG 1.5 -use_logs TRUE cat(" -legend /home/niall/Documents/programming/eclipse/testFiles/testLegend1.txt \\\n", " -haplotypes /home/niall/Documents/programming/eclipse/testFiles/testHaps1.txt \\\n", " -genMap /home/niall/Documents/programming/eclipse/testFiles/testMap1.txt \\\n", " -upper 10005000 -lower 10000000 -buffer 1 -Ne 1e4 \\\n", " -debugMaxNumHaps 10 -display_haps false -display_events false \\\n", " -writeType \"genecluster\" \\\n", " -outputDir /home/niall/Documents/programming/eclipse/test_output/\n" ) } interest=function(annualRate=5/100, depositAmount=10000, time=0.5) { (1 + annualRate)^time * depositAmount } # Main data from # ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/pilot_data/paper_data_sets/a_map_of_human_variation/low_coverage/snps/ # vcftools from: # http://vcftools.sourceforge.net/options.html # Genetic map (not the right format for treesim) from # ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/pilot_data/paper_data_sets/a_map_of_human_variation/supporting_data/recombination_hotspots/ # Just some individuals using vcftools perl scripts (which I haven't worked out how to download) # # vcf-subset -c NA0001,NA0002 file.vcf.gz | bgzip -c > out.vcf.gz # vcf-subset -c NA06985 NA06986 NA06994 NA07000 NA07037 NA07051 NA07346 NA07347 NA07357 NA10847 NA10851 NA11829 NA11830 NA11831 NA11832 CEU.low_coverage.2010_09.genotypes.vcf.gz | bgzip -c > first15Inds.vcf.gz # Get a range of sites: # * --from-bp # * --to-bp # Specific example: # zcat CEU.low_coverage.2010_09.genotypes.vcf.gz | head -5018 > first5018lines_CEU.low_coverage.2010_09.genotypes.vcf # vcftools --vcf first5018lines_CEU.low_coverage.2010_09.genotypes.vcf --out subsetSites --chr 1 --from-bp 1000000 --to-bp 1010000 --IMPUTE # Bigger example: if(0) { x = read.table("CEUlowCoverGens_snpInfo_noGumpf.txt", as.is=TRUE, header=TRUE) chr21=which(x[,1]==21) cat(" Need to use: head -n ", max(chr21)+19, " | tail -n ", length(chr21)) # (forget why I need 19 and not 18) #7623285-7514143(+19) } # 7623285 + 19 = 7623304: # nice zcat CEU.low_coverage.2010_09.genotypes.vcf.gz | head -n 7623304 | tail -n 109143 > allChrom21VCF.txt # nice cat mainVCFcolnames.txt allChrom21VCF.txt > allChrom21VCF_withHeader.txt # nice cat VCFheaderCEU.txt allChrom21VCF.txt > allChrom21VCF_withGumpf.txt # nice mv allChrom21VCF.txt probablyPointless # vcftools \ # --vcf ../allChrom21VCF_withGumpf.txt \ # --out allChrom21CEU \ # --chr 21 \ # --from-bp 0 --to-bp 50000000 \ # --IMPUTE # # Already done: # nice zcat CEU.low_coverage.2010_09.genotypes.vcf.gz | head -n 18 | tail -n 1 > mainVCFcolnames.txt # nice zcat CEU.low_coverage.2010_09.genotypes.vcf.gz | head -n 18 > VCFheaderCEU.txt # Plot tree code #_______________________________________________________________________________________- #The code below is a very slight edit of Zhan Su's code at: #The tree plotting code is based on code by Gil McVean ## Zhan Su 08/2008 ## This is a program that plots the GENECLUSTER signals across a region ## ## The main function in this program is make.plot ## make.plot calls 4 functions: ## 1. plot.signal, which plots the BF across the region; ## 2. plot.tree.file, which reads the tree file at the focal position ## (the location with the largest focal.test() BF, formats the data in ## that file and pass it onto to plot.tree.horizontal, which plots the ## tree and marks the best fitting mutations; ## 3. plot.panel, which plots the haplotype panel and colours them ## according to the alleles at the SNPs of the rs ids (if supplied) ## 4. plot.counts, which plots the contingency table of expected ## haplotype counts under the best fitting mutations ## ## make.plot also calls run.analyse.wtccc, which runs ANALYZE_MATRIX ## with the -detail flag, to find the best fitting mutations at the ## focal position. The map_files required for input and the output ## files are stored in a temporary directory, which is deleted once ## all the plots have been made. ## ## Before running make.plot, make sure that ## - analyse.executable is pointing to the right GC3 ## executable file ## - list.gc.tests contains all the tests that you want to plot ## - focal.test is the test that you want to use to determine the ## focal position ## - you run the make.plot function in a directory that you have write ## access to (so that a temporary folder can be created to run GC3) ## - you have the GC2 output files in the matrix directories ## specified ## executable to run ANALYZE_MATRIX analyse.executable <- function(){ "./GC3" } ## the colour of the mutations that will be marked onto the plotted ## tree, so that mutation.colours()[[i]][j] is the colour of the jth mutation ## in the best fitting i-mutation model mutation.colours <- function() { return(list(c("blue"), c("red", "green"), c("magenta", "cyan", "yellow"))) } mutation.symbols <- function() { c("large dot", "small dot", "triangle") } ## label of the test (column name in the ANALYZE_MATRIX result file) ## that is used to determines the focal position in a plot focal.test <- function(){ "mut2" } ## the first row are the labels of tests (column names in the ## GC3 result file) in that will be plotted ## the second row are the corresponding labels of those tests ## that will be displayed in the legend of the signal plot list.gc.tests <- function(plot.3.mut = F) { if(plot.3.mut){ return(rbind(c("mut1", "mut2", "mut3"), c("1-mutation", "2-mutation", "3-mutation"))) } else{ return(rbind(c("mut1", "mut2"), c("1-mutation", "2-mutation"))) } } ## main function for plotting GENECLUSTER signal plots ## - gc.file is the GC3 file that contains the BFs across ## the region, which will be plotted in the upper left panel and will ## be used to determine the focal position ## - control.gc2.files are the control gc2 output files ## - case.gc2.files are the case gc2 output files ## - delete.temp.files determines whether the temporary files produced by ## this script should be deleted ## - extra cluster is a set of haplotypes (labelled from 1 to n, where n is ## the number of panel haplotypes, according to the order they come in ## the panel.file) that will be given a separate colour, irrespective of ## their alleles at the SNPs with the specified rsids (if any are ## specified) ## - focal.posn is the position where a tree will be plotted and the ## contingency tables made, if null the position with the largest ## 2-mutation model Bayes factor will be choosen ## - impute.data are the imputed BFs to be plotted in comparison with the ## GENECLUSTER Bayes factors, it must be a matrix with 2 coloumns, the ## first are physical locations and the second are the imputed Bayes ## factors (not logged) ## - legend.file is the legend file for the panel haplotypes ## - min.location is the lower boundary of the region that will be plotted ## - max.location is the upper boundary of the region that will be plotted ## - panel.file is the reference panel that will be plotted in the ## bottom left panel and also used to find the best fitting ## mutations ## - plot.title is the title for the plot ## - rates.file is plotted in the lower half of the upper left panel ## and will be used to find the best fitting mutations ## - tree.dir is the tree directory that contains the set of trees ## across the region, the tree file names for each location is ## listed in tree_map.txt. The tree at the focal position is plotted ## in the lower right panel, and is also used to find the best ## fitting mutations. FILES SHOULD NOT BE ZIPPED ## - run.3.mut specifies whether the best fitting 3-mutations will be ## displayed on the tree in the lower right hand panel, if TRUE then ## the plotting process will take a little longer since implementing ## the 3-mutation model is computationally intensive ## - rp is the penetrance prior that will be used to find the best ## fitting mutations ## - rs.ids, if not NULL, then will colour the haplotypes, displayed ## in the bottom left panel, according to alleles at SNPs with those ## rs ids, these rs.ids must be in the legend file make.plot <- function(gc.file = NULL, control.gc2.files = NULL, case.gc2.files = NULL, delete.temp.files = T, extra.cluster = NULL, focal.posn = NULL, impute.data = NULL, label.snps = rs.ids, legend.file = NULL, log.file = NULL, max.location = NULL, min.location = NULL, panel.file = NULL, plot.3.mut = T, plot.title = "", colour.snps = NULL, rates.file = NULL, rp = NULL, rs.ids = NULL, run.gc3 = T, run.3.mut = FALSE, snptest.data = NULL, temp.dir = NULL, tree.file = NULL) { ## set a temporary working directory to place tempoarary files cat("\nWelcome to make.plot (part of the GENECLUSTER software)\n\n") if(is.null(gc.file) | is.null(control.gc2.files) | is.null(case.gc2.files) | is.null(panel.file) | is.null(legend.file) | is.null(rates.file) | is.null(tree.file)){ cat("Error: must specify gc.file, control.gc2.files, case.gc2.files, panel.fle, legend.file, tree.file, rates.file and rp") } if(!is.null(log.file)){ cat("Deleting log file", log.file, "\n") file.remove(log.file) cat("Results will be written to log file", log.file, "\n") log.file.con = file(log.file, open="a") writeLines(paste("gc results file is", gc.file), con = log.file.con) writeLines(paste("focal region is", min.location, max.location), con = log.file.con, sep="\n\n") } if(is.null(temp.dir)){ temp.dir = file.path(getwd(), paste("temp", runif(1), sep="")) while(file.exists(temp.dir)){ temp.dir = file.path(getwd(), paste("temp", runif(1), sep="")) } } if(!file.exists(temp.dir)){ cat("Creating a temporary directory ... ") dir.create(temp.dir, recursive = T) cat("done\n") } cat("temporary directory set to", temp.dir, "\n") ## read results file gc.results = read.table(file = gc.file, header = T) ## find region of interest in.region = gc.results[,"location"] >= min.location & gc.results[,"location"] <= max.location gc.results = gc.results[in.region,] ## find focal region (location with largest 2-mutation BF) if(is.null(focal.posn)){ focal.posn = gc.results[which.max(gc.results[,focal.test()]), "location"] } if(isOpen(log.file.con)){ writeLines(paste("focal position is", focal.posn), con = log.file.con, sep="\n\n") close(log.file.con) } ## layout for signal plot layout(array(c(1,1,1,2,4,4,1,1,1,2,4,4,5,5,6,6,3,3), c(6,3))) ## plot BF across region and recombination rates plot.signal(gc.results, rates.file = rates.file, focal.posn = focal.posn, plot.title = plot.title, plot.3.mut = plot.3.mut, impute.data = impute.data, snptest.data = snptest.data, xlim = c(min.location, max.location)) cat("Running GC3 to find best fitting mutations\n") detail.file.prefix = run.analyse.wtccc(control.gc2.files= control.gc2.files, case.gc2.files = case.gc2.files, focal.posn = focal.posn, temp.dir = temp.dir, rp = rp, run.gc3 = run.gc3, run.3.mut = run.3.mut, tree.file = tree.file) cat("\nThe result at the focal position is:\n") gc.results.focal.posn = read.table(detail.file.prefix, header = T) print(gc.results.focal.posn) write.table(gc.results.focal.posn, append = T, log.file, quote = F, row.names = F) ## read details of the best fitting mutations details = list() detail.file = paste(detail.file.prefix, "1_mut.best_mut", sep=".") details[[1]] = read.table(detail.file, as.is = T, header = T)[1,] detail.file = paste(detail.file.prefix, "2_mut.best_mut", sep=".") details[[2]] = read.table(detail.file, as.is = T, header = T)[1,] if(run.3.mut){ detail.file = paste(detail.file.prefix, "3_mut.best_mut", sep=".") details[[3]] = read.table(detail.file, as.is = T, header = T)[1,] } ## best fitting mutations mutations = list() for(i in 1:length(details)){ mutations[[i]] = details[[i]][1, 3:(2+i)] } ## expected haplotype counts counts = list() for(i in 1:2){ counts[[i]] = details[[i]][1, (i+3):(i+4+2*(i+1))] } cat("\nPlotting tree ... ") ## plot tree with best fitting mutations plot.order = plot.tree.file(tree.file = tree.file, focal.posn = focal.posn, mutations) cat("done\n") cat("Plotting panel\n") ## plot panel haplotypes panel = read.table(panel.file, as.is = T) legend = read.table(legend.file, as.is = T, header = T) ## R stupidly reads "T" as "TRUE" so just switching it back legend[,3] = as.character(legend[,3]) legend[legend[,3] == "TRUE",3] = "T" legend[,4] = as.character(legend[,4]) legend[legend[,4] == "TRUE",4] = "T" plot.panel(panel = panel, legend = legend, low = min.location, high = max.location, label.snps = label.snps, log.file = log.file, plot.order = plot.order, colour.snps = colour.snps, focal.posn = focal.posn, rs.ids = rs.ids, extra.cluster = extra.cluster) cat("\nPlotting contingency table ... ") ## plot contingency table with the expected haplotype counts plot.counts(counts = counts, bf1 = gc.results[which(gc.results[,"location"] == focal.posn), "mut1"], bf2 = gc.results[which(gc.results[,"location"] == focal.posn), "mut2"]) cat("done\n") ## find the SNP that best corresponds to the best fitting mutations best.part.files = paste(detail.file.prefix, c("1_mut.best_part", "2_mut.best_part"), sep=".") if(run.3.mut){ best.part.files = c(best.part.files, paste(detail.file.prefix, "3_mut.best_part", sep=".")) } find.best.fitting.snps(panel = panel, legend = legend, best.part.files = best.part.files, log.file = log.file, low = min.location, high = max.location, r2.thresh = 0.8, rs.ids = rs.ids) if(delete.temp.files){ cat("\nRemoving temporary files ... ") unlink(temp.dir, recursive = T) cat("done\n") } else{ cat("you chosen to keep the temporary files kept in", temp.dir, "\n") } cat("\n\nGENECLUSTER make.plot is complete!!\n") cat("Have a nice day!\n") } ############################################################################################################## ## Functions for plotting the GENECLUSTER BF and the corresponding ## recombination rates across a region plot.signal <- function(gc.data = NULL, snptest.data = NULL, impute.data = NULL, focal.posn = NULL, logBF.thresh=4, rates.file = NULL, ylim = c(-5, 22), plot.title = "", plot.3.mut = FALSE, xlim) { if(is.null(rates.file) | is.null(focal.posn)){ stop("error in plot.signal: focal position or rates file not supplied") } ## read recombination rates genetic.map <- read.hapmap.genetic.map(rates.file) ## find positions psns <- gc.data[,"location"] ## find upper and lower boundaries low <- min(gc.data[,"location"]) high <- max(gc.data[,"location"]) yl <- "log10 Bayes Factor" ## plot the signal as a function of chromosomal position; op <- par(mar=c(1,4,4,4)) ; on.exit(par(op)) #print("plotting signal") plot.default(x=gc.data[,"location"]/10^6, bty="n", xlab="", xaxt="n", ylab=yl, ylim=ylim, main=plot.title, type="n", xlim = xlim/10^6) abline(h=4, col="grey", lty=1, lwd = 1) tests = list.gc.tests(plot.3.mut = plot.3.mut)[1,] test.labels = list.gc.tests(plot.3.mut = plot.3.mut)[2,] colours = array(rgb(t(col2rgb(2:(length(tests)+1))), maxColorValue = 255)) rownames(colours) = tests cat("\nplotting signal ... ") if(!is.null(snptest.data)){ test.labels = c(test.labels, "typed") colours = c(colours, "black") points(x=snptest.data[,1]/10^6, y=snptest.data[,2], col= rgb(t(col2rgb("black")), maxColorValue = 255), pch=19) } if(!is.null(impute.data)){ test.labels = c(test.labels, "imputed") colours = c(colours, "grey") points(x=impute.data[,1]/10^6, y=impute.data[,2], col= rgb(t(col2rgb("grey")), maxColorValue = 255), pch=19) } for(test in tests){ points(x=gc.data[,"location"]/10^6, y=log10(gc.data[,test]), col= colours[test], pch=19) } legend(low/10^6, 16, legend=test.labels,col=colours, pch = 19, xjust=0, yjust=0) cat("done\n") cat("reading genetic map ... ") par(mar=c(4,4,0,4)) in.window <- genetic.map$position > low & genetic.map$position < high genetic.map <- genetic.map[in.window,] #print("done") cat("done\n") ## calculate cumulative genetic map cat("calculating cum map ... ") focal.idx <- which.min(abs(genetic.map$position - focal.posn)) cum.map <- abs(genetic.map$genetic.map - genetic.map$genetic.map[focal.idx]) cat("done\n") ## calculate genetic map #print("calculating genetic map") genetic.map$genetic.map[-1] <- diff(genetic.map$genetic.map) / diff(genetic.map$position) genetic.map$genetic.map[1] <- NA #print("done") ## formatting settings genmap.lwd <- 1 cex.mtext <- 0.75 axis.padj <- 0.5 axis.lab.line <- 3 cat("plotting recombination rates ... ") plot(x=genetic.map$position / 10^6, y=genetic.map$genetic.map * 10^6, ylim=c(0,90), type="s", bty="u", xlab="", xaxt="n", ylab="", yaxt="n", col="red", lwd=genmap.lwd, xlim = xlim/10^6) axis(side=2, at=c(0,40,80), labels=c("0","40","80"), padj=axis.padj, cex.axis=1) mtext(text="cM/Mb", side=2, line=axis.lab.line, cex=cex.mtext) #print("done") #print("plotting cum rate") par(new=T) plot(x=genetic.map$position / 10^6, y=cum.map, ylim=c(0,2), type="l", lty=1, bty="n", col="purple", xlab="", xaxt="n", yaxt="n", ylab="", lwd=genmap.lwd, xlim = xlim/10^6) axis(side=4, padj=-1*axis.padj) mtext("cM from hit SNP", side=4, line=axis.lab.line, cex=cex.mtext) axis(side=1) mtext(text="Chromosomal position (Mb)", side=1, line=axis.lab.line, adj=0.5, cex=cex.mtext) cat("done\n\n") } read.hapmap.genetic.map <- function(file) { ## Rate columns are in cM / Mb, genetic_map column is in cM what <- c( list(numeric()), list(NULL), list(numeric()) ) df <- as.data.frame(scan(file, what=what, skip=1, quiet=T, nlines=wc(file, lines.only=TRUE) - 1)[c(1,3)]) colnames(df) <- c("position","genetic.map") df } # word count (like the unix command "wc") wc <- function(f, lines.only=FALSE) { if (lines.only) structure(as.integer(strsplit(system(paste("wc -l", f), intern=TRUE), split="\\s+")[[1]][1]), names="lines") else structure(as.integer(strsplit(system(paste("wc", f), intern=TRUE), split="\\s+")[[1]][c(2,3,4)]), names=c("lines","words","bytes")) } ############################################################################################################## ## Functions for plotting a tree across a region and marking on the ## best fitting mutations plot.tree.file<-function(tree.file = NULL, focal.posn = NULL, mutations = NULL) { if(is.null(tree.file) | is.null(focal.posn)){ stop("error in plot.tree.file: focal position or tree directory not supplied") } ## find the line with the tree data for the focal position cat(" processing tree file.") tree.file.split = unlist(strsplit(tree.file, ".", fixed = T)) if(tree.file.split[length(tree.file.split)] == "gz"){ tree.con = gzfile(tree.file, open = "r") } else{ tree.con = file(tree.file, open = "r") } cat("1") tree.lines = readLines(tree.con) close(tree.con) cat("3") tree.positions = as.numeric(apply(matrix(tree.lines, ncol = 1), 1, function(x){return(unlist(strsplit(x," "))[1])})) cat("4") line.with.focal.tree = which(tree.positions == focal.posn) cat("5") cat(", finding tree at focal pos...") if(length(line.with.focal.tree)==0){ cat("ERROR: tree at focal position not found\n") stop(0) } tree.lines = tree.lines[line.with.focal.tree] ## plot first tree in the file cat("parsing tree...") tree = strsplit(tree.lines, "; ") ## n.nodes is the number of leaves n.nodes = as.numeric(tree[[1]][2]) ## strip away any ; and , tree = strsplit(tree[[1]][3], ";") tree = strsplit(tree[[1]][1], ", ") ## format the tree data into a table with colomns ## node time child1 child2 mutation tree.data = array(0, c(2*n.nodes-1,5)) colnames(tree.data) = c("node", "time", "child1", "child2", "mutation") for(i in 1:n.nodes){ tree.data[i,"node"] = i } cat("adjusting times...") ## adjust the branch lengths (time colomn) so that they can be seen ## clearly in plot time = 0 for(nodes.left in n.nodes:2){ time = time + 2.0/(nodes.left) tree.data[2*n.nodes-nodes.left+1,"time"] = time tree.data[2*n.nodes-nodes.left+1, "node"] = 2*n.nodes-nodes.left+1 children = strsplit(tree[[1]][n.nodes-nodes.left+1], " ") tree.data[2*n.nodes-nodes.left+1,"child1"] = as.numeric(children[[1]][1]) tree.data[2*n.nodes-nodes.left+1,"child2"] = as.numeric(children[[1]][2]) } cat("getting mutations...") ## assign a number, mut>0, in the mutation column if there is a ## mutation at a given node ## mutations[[mut]][1, ] specifies which mutation model the mutation is a best fit for ## mutations[[mut]][2, ] specifies the order of each mutation (used for colour coding of the contingency table in top right panel) counter = 0 mutations.data = list() if(length(mutations)>=1) { #Niall edit, to cope with zero mutations for(i in 1:length(mutations)){ for(j in 1:length(mutations[[i]])){ mutant.node = as.numeric(mutations[[i]][j])+1 mut = tree.data[mutant.node, "mutation"] if(mut == 0){ counter = counter + 1 tree.data[mutant.node, "mutation"] = counter mutations.data[[counter]] = array(0, c(2,0)) mutations.data[[counter]] = cbind(mutations.data[[counter]], c(i,j)) } else{ mutations.data[[mut]] = cbind(mutations.data[[mut]], c(i,j)) } } } } ## plot tree plot.order = plot.tree.horizontal(data = tree.data, mutations = mutations.data, names.plot=F) return(plot.order) } ######################################################### # Define order of haplotypes underneath the plotted tree (adapted from # code by Gil McVean) ######################################################### order.seqs<-function(data, node=nrow(data), ord=c()) { if (data[node,"child1"]==0) { ord[length(ord)+1]<-node; } else { ord<-order.seqs(data=data, node=data[node, "child1"], ord); ord<-order.seqs(data=data, node=data[node, "child2"], ord); } return(ord); } ######################################################### # To plot a tree horizontally (adapted from code by Gil McVean) # Data is # Col1 = Node # Col2 = Time # COl3 = D1 # COl4 = D2 # Col5 = Mutation status ######################################################### plot.tree.horizontal <- function(data, maxdepth=-1, mutations=NULL, x.low=0, names.plot=TRUE, lwd=1) { symbols = c(19, 19, 24) symbol.sizes = c(2.0, 1.0, 1.0) nseq<-(nrow(data)+1)/2; mrca = nrow(data); plot.order<-order.seqs(data); if(maxdepth<0){ maxdepth=max(data[,"time"]) } par(mar=c(4,0,0,4)) plot(0,0, type="n", yaxt="n", ylab="", xaxt="n", xlab="", bty="n", xlim=c(x.low,maxdepth), ylim=c(0,nseq+1)); branch.pos<-order(plot.order); for(node in (nseq+1):mrca) { branch.pos[node]<-(branch.pos[data[node,"child1"]]+branch.pos[data[node,"child2"]])/2; #DO vertical line segments(y0=branch.pos[data[node,"child1"]], x0=data[node,"time"], y1=branch.pos[data[node,"child2"]], x1=data[node, "time"], lwd=lwd); #DO LH horizontal segments(y0=branch.pos[data[node,"child1"]], x0=data[data[node,"child1"],"time"], y1=branch.pos[data[node,"child1"]], x1=data[node, "time"], lwd=lwd); if (data[data[node,"child1"],"mutation"]>0) { time.span<-c(data[data[node,"child1"],"time"], data[node,"time"]); m = data[data[node,"child1"],"mutation"] for(n.mut in 1:3){ if(n.mut %in% mutations[[m]][1,]){ c.index = mutations[[m]][2, which(mutations[[m]][1,] == n.mut)] y.vals<-time.span[1]+(n.mut*2/7)*(time.span[2]-time.span[1]) points(y.vals, branch.pos[data[node,"child1"]], pch=symbols[n.mut], col=mutation.colours()[[n.mut]][c.index], cex=symbol.sizes[n.mut], bg = mutation.colours()[[n.mut]][c.index]) } } } #DO RH horizontal segments(y0=branch.pos[data[node,"child2"]], x0=data[data[node,"child2"],"time"], y1=branch.pos[data[node,"child2"]], x1=data[node, "time"], lwd=lwd); if (data[data[node,"child2"],"mutation"]>0) { time.span<-c(data[data[node,"child2"],"time"], data[node,"time"]); m = data[data[node,"child2"],"mutation"] for(n.mut in 1:3){ if(n.mut %in% mutations[[m]][1,]){ c.index = mutations[[m]][2, which(mutations[[m]][1,] == n.mut)] y.vals<-time.span[1]+(n.mut*2/7)*(time.span[2]-time.span[1]) points(y.vals, branch.pos[data[node,"child2"]], pch=symbols[n.mut], col=mutation.colours()[[n.mut]][c.index], cex=symbol.sizes[n.mut], bg = mutation.colours()[[n.mut]][c.index]) } } } } if (names.plot==TRUE) { text(y=c(1:nseq), x=rep(0, nseq), labels=data[plot.order,"node"], srt=0, cex=0.5, pos=2) } return(plot.order) } ########################################################################################### ## functions for plotting a panel in the bottom left plot.panel <- function(panel = NULL, legend = NULL, low = NULL, high = NULL, label.snps = NULL, log.file = NULL, plot.order = NULL, colour.snps = NULL, rs.ids = NULL, extra.cluster = NULL, focal.posn = NULL) { if(is.null(panel) | is.null(legend) | is.null(low) | is.null(high) | is.null(plot.order) | is.null(focal.posn)){ stop("error in plot.panel: panel file or legend file or boundaries of region of interest not supplied") } if(!is.null(log.file)){ log.file.con = file(log.file, open = "a") } ## plot reference panel f = legend[, 2] >= low & legend[, 2] <= high haps.local = panel[f, ] legend.local = legend[f, ] snps.pos = legend[f, 2] haps.local = haps.local[, plot.order] par(mar=c(4.2,5,0.2,5)) ## colouring the haplotypes if rs.ids are supplied rs.index = c() if(!is.null(rs.ids)){ ## find the rs.ids that are in the legend files for(rs.id in rs.ids){ if(rs.id %in% legend[, "rs"]){ rs.index = c(rs.index, which(legend[, "rs"] == rs.id)) } } cat("The rsids that determine the colours of each haplotype are:\n") if(isOpen(log.file.con)){ writeLines("\nThe rsids that determine the colours of each haplotype are:", con = log.file.con, sep="\n") } for(i in rs.index){ print(legend[i,]) if(isOpen(log.file.con)){ write.table(legend[i,], file = log.file.con, append = TRUE, quote = FALSE) } } } if(!is.null(extra.cluster)){ cat("You have used the extra.cluster function, so these haplotypes will be colour differently:\n") for(i in 1:length(extra.cluster)){ cat("group", i, "\n") cat(extra.cluster[[i]], "\n") } if(isOpen(log.file.con)){ writeLines("\nYou have used the extra.cluster function, so these haplotypes will be colour differently:", con = log.file.con, sep="\n") for(i in 1:length(extra.cluster)){ writeLines(paste("group", i), con = log.file.con, sep="\n") writeLines(paste(extra.cluster[[i]], collapse = ", "), con = log.file.con, sep="\n") } } } ## assign a colour for each unique combination of alleles at the ## SNPs with the rs ids if(!is.null(extra.cluster) | !is.null(rs.ids)){ cat("The colour codes are:\n") if(isOpen(log.file.con)){ writeLines("\nThe colour codes are:", con = log.file.con, sep="\n") } for(i in dim(panel)[2]:1){ rsid.alleles = "" code = length(extra.cluster) + 1 if(!is.null(rs.ids)){ rsid.alleles = "(" for(j in 1:length(rs.index)){ if(panel[rs.index[j], plot.order[i]] == 1){ rsid.alleles = paste(rsid.alleles, legend[rs.index[j],4], sep="") code = code + 2^(j-1) } else{ rsid.alleles = paste(rsid.alleles, legend[rs.index[j],3], sep="") } } rsid.alleles = paste(rsid.alleles, ")", sep="") } if(!is.null(extra.cluster)){ for(j in 1:length(extra.cluster)){ if((plot.order[i] %in% extra.cluster[[j]])){ code = j } } } print(paste("panel haplotype", plot.order[i], rsid.alleles, "has colour code", code)) if(isOpen(log.file.con)){ writeLines(paste("panel haplotype", plot.order[i], rsid.alleles, "has colour code", code), con = log.file.con, sep="\n") } haps.local[, i] = code*haps.local[, i] } } ncol = 2^length(rs.index)+length(extra.cluster) if(ncol == 1){ col = c("white", "red") } else{ col = c("white", rainbow(ncol)) } if(!is.null(colour.snps)){ col = c(col, "grey") haps.local[which(!(legend.local[, 1] %in% colour.snps)), ] = (length(col)-1)*panel[f, plot.order] } ## plot the panel image(x = snps.pos/10^6, y = 1:dim(haps.local)[2], z = as.matrix(haps.local), bty="n", yaxt="n", ylab = "", xlab = "", ylim=c(-5,dim(haps.local)[2]+6), col = col, xlim = c(low, high)/10^6) ## plot the locations of the rs ids if(!is.null(label.snps)){ for(snp in label.snps){ abline(v=legend[which(legend[,1] == snp), 2]/10^6,col="brown", lty=2, lwd=2) } } ## plot the location of the focal position abline(v=focal.posn/10^6,col="blue", lty=2, lwd=2) if(isOpen(log.file.con)){ close(log.file.con) } } ######################################################################################## ## run ANALYZE_MATRIX with -detail option to find best fitting mutations run.analyse.wtccc <- function(focal.posn, tree.file = NULL, control.gc2.files = NULL, case.gc2.files = NULL, run.gc3 = T, temp.dir = NULL, rp = NULL, run.3.mut = FALSE) { res.file = file.path(temp.dir, "result.gs") if(run.gc3){ command = paste(analyse.executable(), "-controls", paste(control.gc2.files, collapse = " "), "-cases", paste(case.gc2.files, collapse = " "), "-tree_file", tree.file, "-o", res.file, "-int", focal.posn, focal.posn, "-detail" ) if(!is.null(rp)){ command = paste(command, "-rprior", rp[1], rp[2]) } if(run.3.mut){ command = paste(command, "-mutation_models 1.0 1.0 1.0") } else{ command = paste(command, "-mutation_models 1.0 1.0 0.0") } system(command) } return(res.file) } ################################################################################### ## plot the contingency table for the expected haplotype counts, for ## the best fitting 1-mutation and 2-mutation models, in the upper ## right panel plot.counts <- function(counts = NULL, bf1 = NULL, bf2 = NULL) { ## for 1-mutation model par(mar=c(0,2,4,2)) plot(x = 0, xlim = c(0,1), ylim = c(0,1), xlab = "", ylab = "", main = "1-mutation model", xaxt = "n", yaxt = "n", type = "n", bty = "n") controls.count = c(as.numeric(counts[[1]][3]), as.numeric(counts[[1]][5])) cases.count = c(as.numeric(counts[[1]][4]), as.numeric(counts[[1]][6])) rr = c(controls.count[2]*cases.count[1]/(cases.count[2]*controls.count[1])) text(x = 0.3, y = c(0.8, 0.65, 0.5), labels = c("controls", "cases", "relative risk")) text(x = 0.5, y = c(0.95, 0.8, 0.65, 0.5), col = mutation.colours()[[1]][1], labels = c("group 1", format(c(2*controls.count[1], 2*cases.count[1], rr[1]), digits=2, nsmall=2))) text(x = 0.7, y = c(0.95, 0.8, 0.65, 0.5), labels = c("group 2",format(c(2*controls.count[2], 2*cases.count[2]), digits=2, nsmall=2), "-")) text(x = c(0.3, 0.7), y = 0.2, labels = c("log10 BF at focal position", format(log10(bf1), digits=2, nsmall=2)), font = 2) ## for 2-mutation model par(mar=c(2,2,2,2)) plot(x = 0, xlim = c(0,1), ylim = c(0,1), xlab = "", ylab = "", main = "2-mutation model", xaxt = "n", yaxt = "n", type = "n", bty = "n") controls.count = c(as.numeric(counts[[2]][3]), as.numeric(counts[[2]][5]), as.numeric(counts[[2]][7])) cases.count = c(as.numeric(counts[[2]][4]), as.numeric(counts[[2]][6]), as.numeric(counts[[2]][8])) rr = c(controls.count[3]*cases.count[1]/(cases.count[3]*controls.count[1]), controls.count[3]*cases.count[2]/(cases.count[3]*controls.count[2])) text(x = 0.2, y = c(0.8, 0.65, 0.5 ), labels = c("controls", "cases", "relative risk")) text(x = 0.4, y = c(0.95, 0.8, 0.65, 0.5), col = mutation.colours()[[2]][1], labels = c("group 1", format(c(2*controls.count[1], 2*cases.count[1], rr[1]), digits=2, nsmall=2))) text(x = 0.6, y = c(0.95, 0.8, 0.65, 0.5), col = mutation.colours()[[2]][2], labels = c("group 2", format(c(2*controls.count[2], 2*cases.count[2], rr[2]), digits=2, nsmall=2))) text(x = 0.8, y = c(0.95, 0.8, 0.65, 0.5), labels = c("group 3", format(c(2*controls.count[3], 2*cases.count[3]), digits=2, nsmall=2), "-")) text(x = c(0.3, 0.7), y = 0.2, labels = c("log10 BF at focal position", format(log10(bf2), digits=2, nsmall=2)), font = 2) } ################################################################################################################################## find.best.fitting.snps <- function(panel, legend, best.part.files, log.file = NULL, low, high, r2.thresh = 0.8, rs.ids = NULL){ if(!is.null(log.file)){ log.file.con = file(log.file, open = "at") } f = legend[, 2] >= low & legend[, 2] <= high haps.local = panel[f, ] legend.local = legend[f, ] for(i in 1:length(best.part.files)){ f.con = file(best.part.files[i]) best.part = readLines(f.con) close(f.con) for(j in 1:(i+1)){ snp = rep(0, dim(panel)[2]) snp[as.numeric(unlist(strsplit(best.part[1+2*j], " ")))+1] = 1 r = cor(snp, t(haps.local)) reported.snps = which(r^2>r2.thresh | legend.local[,1] %in% rs.ids) if(j>i){ msg = paste("\nThe haplotypes in the reference panel file (indexed from 1) carrying the best mutation ", j, " under the ", i, "-mutation model (represented by a lack of a ", mutation.symbols()[i], ") are:\n", sep="") msg = paste(msg, paste(as.numeric(unlist(strsplit(best.part[1+2*j], " ")))+1, collapse = " "), "\n") msg = paste(msg, "\nThe SNPs highly correlated with best mutation ", j, " under the ", i, "-mutation model (represented by a lack of a ", mutation.symbols()[i], ") are:\n", sep="") } else{ msg = paste("\nThe haplotypes in the reference panel file (indexed from 1) carrying the best mutation ", j, " under the ", i, "-mutation model (represented by a ", mutation.colours()[[i]][j], " ", mutation.symbols()[i], ") are:\n", sep="") msg = paste(msg, paste(as.numeric(unlist(strsplit(best.part[1+2*j], " ")))+1, collapse = " "), "\n") msg = paste(msg, "\nThe SNPs highly correlated with best mutation ", j, " under the ", i, "-mutation model (represented by a ", mutation.colours()[[i]][j], " ", mutation.symbols()[i], ") are:\n", sep="") } cat(msg) if(isOpen(log.file.con)){ writeLines(msg, con = log.file.con, sep="\n") flush(log.file.con) } best.snps = cbind(legend.local[reported.snps,], r[reported.snps]) colnames(best.snps) = c("rsid", "position", "allele0", "allele1", "r") if(nrow(best.snps) > 0){ print(best.snps) } else { cat("no SNPs found\n") } if(!is.null(log.file)){ if(nrow(best.snps) > 0){ write.table(best.snps, file = log.file, append = T, quote = F) } else{ writeLines("no SNPs found", con = log.file.con, sep="\n") flush(log.file.con) } } } } if(isOpen(log.file.con)){ close(log.file.con) } }