This document is the reproducible code for PMDDA paper with extra comparison with other workflow. To access standalone PMDDA workflow, please install rmwf package from GitHub and use rmarkdown::draft("test.Rmd", template = "PMDDA", package = "rmwf") to create a reproducible workflow document for your own studies. Such workflow is also available through xcmsrocker project.

The core of PMDDA workflow include the following steps:

MS1 data collection

The MS1 full scan data used in this study could be accessed from figshare and the details are given as comments in this chunk. Those data include five NIST1950 samples and five matrix samples from positive and negative mode, respectively. Those data are also available on GitHub.

options(timeout=600)
# Download data using rfigshare package
library(rfigshare)
# create data folder for MS1 positive data
dir.create(file.path('data'), showWarnings = FALSE)
setwd(file.path('data'))

# download NIST1950 MS1 positive data into nist1950 folder
dir.create(file.path('nist1950'), showWarnings = FALSE)
setwd(file.path('nist1950'))
article_id <- 13252196
details <- fs_details(article_id, mine = FALSE, session = NULL)
name <- sapply(details$files,function(x) x$name)
url <- sapply(details$files,function(x) x$download_url)
download.file(url, name, method = 'libcurl')

# download matrix MS1 positive data into matrix folder
setwd('..')
dir.create(file.path('matrix'), showWarnings = FALSE)
setwd(file.path('matrix'))
article_id <- 13252244
details <- fs_details(article_id, mine = FALSE, session = NULL)
name <- sapply(details$files,function(x) x$name)
url <- sapply(details$files,function(x) x$download_url)
download.file(url, name, method = 'libcurl')
setwd('..')
path <- getwd()
# get the files name of positive MS1 data
files <- list.files(path,pattern = '*.mzML',recursive = T,full.names = T)
setwd('..')

# create datan folder for MS1 negative data
dir.create(file.path('datan'), showWarnings = FALSE)
setwd(file.path('datan'))

# download NIST1950 MS1 negative data into nist1950n folder
dir.create(file.path('nist1950n'), showWarnings = FALSE)
setwd(file.path('nist1950n'))
article_id <- 13252274
details <- fs_details(article_id, mine = FALSE, session = NULL)
name <- sapply(details$files,function(x) x$name)
url <- sapply(details$files,function(x) x$download_url)
download.file(url, name, method = 'libcurl')
setwd('..')

# download matrix MS1 negative data into matrixn folder
dir.create(file.path('matrixn'), showWarnings = FALSE)
setwd(file.path('matrixn'))
article_id <- 13252355
details <- fs_details(article_id, mine = FALSE, session = NULL)
name <- sapply(details$files,function(x) x$name)
url <- sapply(details$files,function(x) x$download_url)
download.file(url, name, method = 'libcurl')
setwd('..')
path <- getwd()
filesn <- list.files(path,pattern = '*.mzML',recursive = T,full.names = T)
setwd('..')

Optimization of XCMS

This chunk should run first to generate XCMS parameters. We suggest a computer with large memory to run this chunk for IPO package.

IPO

This chunk will generate ‘para.RData’(positive) and ‘paran.RData’ (negative) to save the optimized parameters for xcms peak picking. For the usage of IPO package, check here for technique details.

path <- 'data/nist1950'
files <- list.files(path,pattern = '*.mzML',recursive = T,full.names = T)
library(IPO)
library(xcms)
peakpickingParameters <- getDefaultXcmsSetStartingParams('centWave')
# Uncomment this line to use your own data(suggested 3-5 pooled QC samples)
# path <- 'path/to/your/files'
# change to 5 for obitrap
peakpickingParameters$ppm <- 10
resultPeakpicking <- 
  optimizeXcmsSet(files = files[c(1,3,5)], 
                  params = peakpickingParameters,
                  plot = F,
                  subdir = NULL)

optimizedXcmsSetObject <- resultPeakpicking$best_settings$xset
retcorGroupParameters <- getDefaultRetGroupStartingParams()
retcorGroupParameters$minfrac <- 1
resultRetcorGroup <-
  optimizeRetGroup(xset = optimizedXcmsSetObject, 
                   params = retcorGroupParameters, 
                   plot = F,
                   subdir = NULL)
para <- c(resultPeakpicking$best_settings$parameters, 
          resultRetcorGroup$best_settings)
save(para,file = 'para.RData')

path <- 'datan/nist1950n'
files <- list.files(path,pattern = '*.mzML',recursive = T,full.names = T)
library(IPO)
library(xcms)
peakpickingParameters <- getDefaultXcmsSetStartingParams('centWave')
# Uncomment this line to use your own data(suggested 3-5 pooled QC samples)
# path <- 'path/to/your/files'
# change to 5 for obitrap
peakpickingParameters$ppm <- 10
resultPeakpicking <- 
  optimizeXcmsSet(files = files[c(1,3,5)], 
                  params = peakpickingParameters,
                  plot = F,
                  subdir = NULL)

optimizedXcmsSetObject <- resultPeakpicking$best_settings$xset
retcorGroupParameters <- getDefaultRetGroupStartingParams()
retcorGroupParameters$minfrac <- 1
resultRetcorGroup <-
  optimizeRetGroup(xset = optimizedXcmsSetObject, 
                   params = retcorGroupParameters, 
                   plot = F,
                   subdir = NULL)
para <- c(resultPeakpicking$best_settings$parameters, 
          resultRetcorGroup$best_settings)
save(para,file = 'paran.RData')

Wrap function for peak picking

This chunk could be run after you have optimized IPO parameters from last chunk to load the parameters as function. Here we use xcms 3 for peak picking and check here for the technique details of their parameters.

library(xcms)
# here we use pre-optimized IPO parameters
# Or you could load your own set from last chunk
# Positive mode
para <- readRDS('para.RDS')
getrtmz <- function(path,index = NULL){
  files <- list.files(path,pattern = ".CDF|.mzXML|.mzML",full.names = T,recursive = T)
  if(!is.null(index)){
    files<- files[index]
    }
  group <- xcms:::phenoDataFromPaths(files)
  if(NCOL(group)==1){
      sample_group <- group$class
  }else{
      cols <- colnames(group)
      sample_group <-  do.call(paste,c(group[cols],sep='_'))
  }
  sample_name=sub(basename(files),pattern = ".CDF|.mzXML|.mzML",replacement = '')
  pd <- cbind.data.frame(sample_name, sample_group)
  
  raw_data <- readMSData(files = files, pdata = new("NAnnotatedDataFrame", pd),
                       mode = "onDisk")
  # remove co-elution and column wash phase
  # filter_data <- filterRt(raw_data,rt = c(100,900))
  cwp <- CentWaveParam(peakwidth = c(para$min_peakwidth,para$max_peakwidth), 
                       ppm             = para$ppm,
                       noise           = para$noise,
                       snthresh        = para$snthresh,
                       mzdiff          = para$mzdiff,
                       prefilter       = c(para$prefilter,para$value_of_prefilter),
                       mzCenterFun     = para$mzCenterFun,
                       integrate       = para$integrate,
                       fitgauss        = para$fitgauss,
                       verboseColumns  = para$verbose.columns)
  owp <- ObiwarpParam(binSize        = para$profStep,
                      centerSample   = 6,
                      response       = para$response,
                      distFun        = para$distFunc,
                      gapInit        = para$gapInit,
                      gapExtend      = para$gapExtend,
                      factorDiag     = para$factorDiag,
                      factorGap      = para$factorGap,
                      localAlignment = ifelse(para$localAlignment==0,F,T))
  pdp <- PeakDensityParam(sampleGroups = pd$sample_group,
                        bw      = para$bw,
                        minFraction = para$minfrac,
                        minSamples = para$minsamp,
                        maxFeatures = para$max,
                        binSize = para$mzwid)
  MSnbase::setMSnbaseFastLoad(FALSE)
  xdata <- findChromPeaks(raw_data, param = cwp)
  # xdata <- findChromPeaks(filter_data, param = cwp)
  xdata <- adjustRtime(xdata, param = owp)
  xdata <- groupChromPeaks(xdata, param = pdp)
  xdata <- fillChromPeaks(xdata)
  return(xdata)
}
# Negative mode
paran <- readRDS('paran.RDS')
getrtmzn <- function(path,index = NULL){
  files <- list.files(path,pattern = ".CDF|.mzXML|.mzML",full.names = T,recursive = T)
  if(!is.null(index)){
    files<- files[index]
    }
  group <- xcms:::phenoDataFromPaths(files)
  if(NCOL(group)==1){
      sample_group <- group$class
  }else{
      cols <- colnames(group)
      sample_group <-  do.call(paste,c(group[cols],sep='_'))
  }
  sample_name=sub(basename(files),pattern = ".CDF|.mzXML|.mzML",replacement = '')
  pd <- cbind.data.frame(sample_name, sample_group)
  
  raw_data <- readMSData(files = files, pdata = new("NAnnotatedDataFrame", pd),
                       mode = "onDisk")
  # remove co-elution and column wash phase
  # filter_data <- filterRt(raw_data,rt = c(100,900))
  cwp <- CentWaveParam(peakwidth = c(para$min_peakwidth,para$max_peakwidth), 
                       ppm             = paran$ppm,
                       noise           = paran$noise,
                       snthresh        = paran$snthresh,
                       mzdiff          = paran$mzdiff,
                       prefilter       = c(para$prefilter,para$value_of_prefilter),
                       mzCenterFun     = paran$mzCenterFun,
                       integrate       = paran$integrate,
                       fitgauss        = paran$fitgauss,
                       verboseColumns  = paran$verbose.columns)
  owp <- ObiwarpParam(binSize        = paran$profStep,
                      centerSample   = 6,
                      response       = paran$response,
                      distFun        = paran$distFunc,
                      gapInit        = paran$gapInit,
                      gapExtend      = paran$gapExtend,
                      factorDiag     = paran$factorDiag,
                      factorGap      = paran$factorGap,
                      localAlignment = ifelse(paran$localAlignment==0,F,T))
  pdp <- PeakDensityParam(sampleGroups = pd$sample_group,
                        bw      = paran$bw,
                        minFraction = paran$minfrac,
                        minSamples = paran$minsamp,
                        maxFeatures = paran$max,
                        binSize = paran$mzwid)
  MSnbase::setMSnbaseFastLoad(FALSE)
  xdata <- findChromPeaks(raw_data, param = cwp)
  # xdata <- findChromPeaks(filter_data, param = cwp)
  xdata <- adjustRtime(xdata, param = owp)
  xdata <- groupChromPeaks(xdata, param = pdp)
  xdata <- fillChromPeaks(xdata)
  return(xdata)
}

Peaks list

This section is used to generate peaks list and related csv files for further analysis.

Peak picking

This chunk will use the optimized xcms parameters for peak picking. This chunk will generate srmmzrt.csv, srmnmzrt.csv to save the peaks list from positive and negative mode MS1 full scan. The code will also save corresponding xcmsSet object for RAMClustR and CAMERA package.

# use your own data by changing the data path
# Positive mode
path <- 'data/'
srm <- getrtmz(path,index = c(1:10))
# back up peak list as csv file and xcmsSet object
mzrt <- enviGCMS::getmzrt(srm, name = 'srm')
# Negative mode
path <- 'datan/'
srmn <- getrtmzn(path,index = c(1:10))
# back up peak list as csv file and xcmsSet object
mzrtn <- enviGCMS::getmzrt(srmn, name = 'srmn')

Peak filtering

This chunk will use peak list csv file as input to perform peak filtering based on matrix sample and RSD% cutoff.

This chunk will save the filtered peaks list as rppmzrt.csv and rpnmzrt.csv for positive and negative mode.

# Positive data
mzrt <- enviGCMS::getmzrtcsv('srmmzrt.csv')
# calculate group mean and rsd%
mzrtm <- enviGCMS::getdoe(mzrt)
gm <- mzrtm$groupmean
gr <- mzrtm$grouprsd
# find the blank group and pool QC group, demo data only have matrix blank
srm <- grepl('nist',colnames(gm))
blk <- grepl('matrix',colnames(gm))
# pqc <- grepl('pool',colnames(gm))
# filter by pool QC and blank's group mean intensity(pool QC should larger than three times of blank), return numbers and index
# in demo data, use sample average intensity for each peak
sum(indexmean <- apply(gm,1,function(x) all(x[srm]> 3*x[blk])))
# filter by pool qc rsd%, return numbers and index
# select rsd% cutoff 30
rsdcf <- 30
sum(indexrsd <- apply(gr,1,function(x) ifelse(is.na(x[srm]),T,x[srm]<rsdcf)))
# overlap with rsd% and mean filter
sum(index <- indexmean&indexrsd)
# 4711
qcindex <- grepl('matrix',mzrt$group$sample_group)
mzrtfilter <- enviGCMS::getfilter(mzrt,rowindex = index)
# save the filtered peak list
enviGCMS::getcsv(mzrtfilter,'rpp')
# Negative data
mzrt <- enviGCMS::getmzrtcsv('srmnmzrt.csv')
# calculate group mean and rsd%
mzrtm <- enviGCMS::getdoe(mzrt)
gm <- mzrtm$groupmean
gr <- mzrtm$grouprsd
# find the blank group and pool QC group, demo data only have matrix blank
srm <- grepl('nist',colnames(gm))
blk <- grepl('matrix',colnames(gm))
# pqc <- grepl('pool',colnames(gm))
# filter by pool QC and blank's group mean intensity(pool QC should larger than three times of blank), return numbers and index
# in demo data, use sample average intensity for each peak
sum(indexmean <- apply(gm,1,function(x) all(x[srm]> 3*x[blk])))
# filter by pool qc rsd%, return numbers and index
# select rsd% cutoff 30
rsdcf <- 30
sum(indexrsd <- apply(gr,1,function(x) ifelse(is.na(x[2]),T,x[2]<rsdcf)))
# overlap with rsd% and mean filter
sum(indexn <- indexmean&indexrsd)
# 3608
# new list, update group and remove pool qc/blk and save the new csv file
qcindex <- grepl('matrix',mzrt$group$sample_group)
mzrtfiltern <- enviGCMS::getfilter(mzrt,rowindex = indexn)
sum(index <- indexmean&indexrsd)
enviGCMS::getcsv(mzrtfiltern,'rpn')

Precursor selection

This section will select the precursor ions from MS1 full scan data for MS2 spectra data collection.

CAMERA

This chunk is the code to extract precursor ions list by CAMERA package. Noticing CAMERA is not designed for this purpose. This chunk will generate cameratar.csv and camerantar.csv to save the selected precursors from CAMERA package.

# load object for CAMERA
srmxset <- readRDS("srmxset.rds")
srmnxset <- readRDS("srmnxset.rds")
# CAMERA
# Positive mode
xsa <- CAMERA::annotate(srmxset, perfwhm=0.7, cor_eic_th=0.75,
ppm=10, polarity="positive")
peaklist <- CAMERA::getPeaklist(xsa)
peaklist$meanpeak <- apply(peaklist[,c(15:19)],1,mean)
length(unique(peaklist$pcgroup))
grouprt <- aggregate(peaklist$rt,by=list(peaklist$pcgroup), median)
grouprt$psgrp <- as.numeric(grouprt$Group.1)
grouprt2 <- merge(data.frame(xsa@annoGrp),grouprt,by='psgrp')
cre <- cbind.data.frame(mz=grouprt2$mass+1.0073,rt=grouprt2$x)
write.csv(cre,'cameratar.csv')
cre <- read.csv('cameratar.csv')
# 862

# Negative mode
xsa <- CAMERA::annotate(srmnxset, perfwhm=0.7, cor_eic_th=0.75,
ppm=10, polarity="negative")
peaklistn <- CAMERA::getPeaklist(xsa)
peaklistn$meanpeak <- apply(peaklistn[,c(15:19)],1,mean)
length(unique(peaklistn$pcgroup))
grouprt <- aggregate(peaklistn$rt,by=list(peaklistn$pcgroup), median)
grouprt$psgrp <- as.numeric(grouprt$Group.1)
grouprt2 <- merge(data.frame(xsa@annoGrp),grouprt,by='psgrp')
cren <- cbind.data.frame(mz=grouprt2$mass-1.0073,rt=grouprt2$x)
write.csv(cren,'camerantar.csv')
cren <- read.csv('camerantar.csv')
# 710

RAMClustR

This chunk is the code to extract precursor ions list by RAMClustR package. Noticing RAMClustR is not designed for this purpose. This chunk will generate ramclusttar.csv and ramclustntar.csv to save the selected precursors from RAMClustR package.

srmxset <- readRDS("srmxset.rds")
srmnxset <- readRDS("srmnxset.rds")
# Positive mode
rcp <- RAMClustR::ramclustR(srmxset)
RC <- RAMClustR::do.findmain(rcp, mode = "positive", mzabs.error = 0.02, ppm.error = 10)
meanpeak <- apply(t(rcp$MSdata),1,mean)
df <-  cbind.data.frame(mz = rcp$fmz, rt = rcp$frt, cluster = rcp$featclus,meanpeak)
mol <- data.frame(mz = RC$M+1.0073,rt=RC$clrt)
write.csv(mol,'ramclusttar.csv')
rre <- read.csv('ramclusttar.csv')
# 542

# Negative mode
rcp <- RAMClustR::ramclustR(srmnxset)
RC <- RAMClustR::do.findmain(rcp, mode = "negative", mzabs.error = 0.02, ppm.error = 10)
meanpeak <- apply(t(rcp$MSdata),1,mean)
dfn <-  cbind.data.frame(mz = rcp$fmz, rt = rcp$frt, cluster = rcp$featclus,meanpeak)
moln <- data.frame(mz = RC$M-1.0073,rt=RC$clrt)
write.csv(moln,'ramclustntar.csv')
rren <- read.csv('ramclustntar.csv')
# 770

pmd

This chunk is the code to extract precursor ions list by pmd package and GlobalStd algorithm. This chunk will generate pretar.csv and prentar.csv to save the selected precursors from pmd package.

# PMD
pos <- enviGCMS::getmzrtcsv('rppmzrt.csv')
neg <- enviGCMS::getmzrtcsv('rpnmzrt.csv')
# perform GlobalStd algorithm
pospmd <- pmd::globalstd(pos,sda=F,ng = NULL)
# 849
pospmd2 <- pmd::getcluster(pospmd,corcutoff = 0.9)
sum(pospmd2$stdmassindex2)
# 780
# generate the targeted ions list
pre <- cbind.data.frame(mz=pospmd2$mz[pospmd2$stdmassindex2],rt=pospmd2$rt[pospmd2$stdmassindex2])
write.csv(pre,'pretar.csv')
pre <- read.csv('pretar.csv')

negpmd <- pmd::globalstd(neg,sda=F,ng = NULL)
# 761
negpmd2 <- pmd::getcluster(negpmd,corcutoff = 0.9)
sum(negpmd2$stdmassindex2)
# 723
pren <- cbind.data.frame(mz=negpmd2$mz[negpmd2$stdmassindex2],rt=negpmd2$rt[negpmd2$stdmassindex2])
write.csv(pren,'prentar.csv')
pren <- read.csv('prentar.csv')
# 723

GlobalStd algorithm will not use predefined redundant peaks information and the unknown adducts can be found based on frequency in the data. This chunk will generate figure S4 and S5 for positive and negative mode data.

# PMD
pos <- enviGCMS::getmzrtcsv('rppmzrt.csv')
neg <- enviGCMS::getmzrtcsv('rpnmzrt.csv')

pospmd <- pmd::globalstd(pos,sda=F,ng = NULL)
## 98 retention time cluster found.
## 1987 paired masses found
## 37 unique within RT clusters high frequency PMD(s) used for further investigation.
## The unique within RT clusters high frequency PMD(s) is(are)  21.98 36 15.97 28 26.02 48 4.98 8.99 42.01 44.03 33.98 13.98 18.01 12 39.99 28.03 2.02 15.01 2.01 24 8.03 19.99 31.94 7.99 5.99 4.96 22.99 9.01 60.06 10.99 72.96 41.03 0.5 8.49 117.08 15.98 50.96.
## 2775 isotopologue(s) related paired mass found.
## 1769 multi-charger(s) related paired mass found.
## 0 retention group(s) have single peaks.
## 1 group(s) with multiple peaks while no isotope/paired relationship 95
## 7 group(s) with multiple peaks with isotope without paired relationship 7 18 21 40 42 46 98
## 2 group(s) with paired relationship without isotope 35 97
## 88 group(s) with paired relationship and isotope 1 2 3 4 5 6 8 9 10 11 12 13 14 15 16 17 19 20 22 23 24 25 26 27 28 29 30 31 32 33 34 36 37 38 39 41 43 44 45 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 96
## 849 std mass found.
# png('pospaired.png',width = 1500, height = 1500, res=300)
pmd::plotpaired(pospmd)

# dev.off()
negpmd <- pmd::globalstd(neg,sda=F,ng = NULL)
## 86 retention time cluster found.
## 1339 paired masses found
## 39 unique within RT clusters high frequency PMD(s) used for further investigation.
## The unique within RT clusters high frequency PMD(s) is(are)  12 26.02 43.99 49.98 67.99 94.01 24 48 104.04 18.01 42.01 30.01 54.01 10.03 27.02 58.03 78.01 16.99 145.93 2.02 61.97 77.94 15.97 6.02 31.94 41.97 21.98 129.96 99.92 87.97 146 46.01 135.97 203.96 78.02 139.99 147.93 22 36.05.
## 2288 isotopologue(s) related paired mass found.
## 485 multi-charger(s) related paired mass found.
## 0 retention group(s) have single peaks.
## 2 group(s) with multiple peaks while no isotope/paired relationship 10 11
## 8 group(s) with multiple peaks with isotope without paired relationship 5 7 16 25 34 38 65 68
## 0 group(s) with paired relationship without isotope
## 76 group(s) with paired relationship and isotope 1 2 3 4 6 8 9 12 13 14 15 17 18 19 20 21 22 23 24 26 27 28 29 30 31 32 33 35 36 37 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 66 67 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86
## 761 std mass found.
# png('negpaired.png',width = 1500, height = 1500, res=300)
pmd::plotpaired(negpmd)