• 首页 首页 icon
  • 工具库 工具库 icon
    • IP查询 IP查询 icon
  • 内容库 内容库 icon
    • 快讯库 快讯库 icon
    • 精品库 精品库 icon
    • 问答库 问答库 icon
  • 更多 更多 icon
    • 服务条款 服务条款 icon

代谢组学 PCA PLS-DA OPLS-DA 在R语言的实现

武飞扬头像
Young.Dr
帮助1

主成分分析(Principal Component Analysis,PCA)是一种无监督降维方法,能够有效对高维数据进行处理。但PCA对相关性较小的变量不敏感,而PLS-DA(Partial Least Squares-Discriminant Analysis,偏最小二乘判别分析)能够有效解决这个问题。而OPLS-DA(正交偏最小二乘判别分析)结合了正交信号和PLS-DA来筛选差异变量。

图片来自:Orthogonal Partial Least Squares (OPLS) in R | R-bloggers

干货 | 1分钟看懂OPLS-DA原理及图表如题。敬请翻阅。学新通https://mp.weixin.qq.com/s?__biz=MzU2MzMzOTk1Mg==&mid=2247484881&idx=1&sn=a8b079763b17d620d11a96a62fa06c9d&chksm=fc5af20ecb2d7b18905a560e6f138902a7b1d0ae996d8444d34506e3af9267393fc89fed32e1&scene=178&cur_album_id=1762718371294330897#rd

安装和加载包

  1.  
    # install ropls
  2.  
    if (F) {
  3.  
    if (!requireNamespace("BiocManager", quietly = TRUE))
  4.  
    install.packages("BiocManager")
  5.  
     
  6.  
    BiocManager::install("ropls")
  7.  
    }
  8.  
     
  9.  
    # load packages
  10.  
    library(ropls)
  11.  
    library(ggplot2)
  12.  
    library(ggsci)
  13.  
    library(Cairo)
  14.  
    library(tidyverse)
  15.  
    library(extrafont)
  16.  
    loadfonts()

示例数据

复制成功

r

  1.  
    # load data
  2.  
    data(sacurine)
  3.  
    names(sacurine)
  4.  
     
  5.  
    # view data information
  6.  
    attach(sacurine)
  7.  
    strF(dataMatrix)
  8.  
    strF(sampleMetadata)
  9.  
    strF(variableMetadata)

这个数据集是不同年龄、性别和BMI的183个人的尿液中109种代谢物的浓度差异$^{[1]}$。下面的分析主要以性别为变量来研究不同性别人群尿液种代谢物的差异。下面的分析主要包含PCA、PLS-DA和OPLS-DA。

PCA分析

复制成功

r

  1.  
    # PCA analysis
  2.  
    pca = opls(dataMatrix)
  3.  
    genderFc = sampleMetadata[, "gender"]
  4.  
     
  5.  
    pdf(file = 'figures/PCA.pdf', width = 5, height = 5)
  6.  
    plot(pca, typeVc = "x-score",
  7.  
    parAsColFcVn = genderFc, parEllipsesL = TRUE)
  8.  
    dev.off()

可以看到的是如果用PCA的话,不同性别的人群是混在一起的。

PLS-DA

复制成功

r

  1.  
    # PLSDA analysis
  2.  
    plsda = opls(dataMatrix,genderFc)
  3.  
     
  4.  
    # sample scores plot
  5.  
    sample.score = plsda@scoreMN %>%
  6.  
    as.data.frame() %>%
  7.  
    mutate(gender = sacurine[["sampleMetadata"]][["gender"]])
  8.  
     
  9.  
    p1 = ggplot(sample.score, aes(p1, p2, color = gender))
  10.  
    geom_hline(yintercept = 0, linetype = 'dashed', size = 0.5)
  11.  
    geom_vline(xintercept = 0, linetype = 'dashed', size = 0.5)
  12.  
    geom_point()
  13.  
    geom_point(aes(-10,-10), color = 'white')
  14.  
    labs(x = 'P1(10.0%)',y = 'P2(9%)')
  15.  
    stat_ellipse(level = 0.95, linetype = 'solid',
  16.  
    size = 1, show.legend = FALSE)
  17.  
    scale_color_manual(values = c('#008000','#FFA74F'))
  18.  
    theme_bw()
  19.  
    theme(legend.position = c(0.9,0.8),
  20.  
    legend.text = element_text(color = 'black',size = 12, family = 'Arial', face = 'plain'),
  21.  
    panel.background = element_blank(),
  22.  
    panel.grid = element_blank(),
  23.  
    axis.text = element_text(color = 'black',size = 15, family = 'Arial', face = 'plain'),
  24.  
    axis.title = element_text(color = 'black',size = 15, family = 'Arial', face = 'plain'),
  25.  
    axis.ticks = element_line(color = 'black'))
  26.  
    ggsave(p1, filename = 'figures/pls.pdf',
  27.  
    width = 5, height = 5, device = cairo_pdf)

和PCA相比,PLS-DA的效果相对较好。PLS-DA分析的目的是找到差异变量(本例中的109种代谢物的某几种)。因此,需要找到VIP值大于1的变量(代谢物):

复制成功

r

  1.  
    # VIP scores plot
  2.  
    vip.score = as.data.frame(plsda@vipVn)
  3.  
    colnames(vip.score) = 'vip'
  4.  
    vip.score$metabolites = rownames(vip.score)
  5.  
    vip.score = vip.score[order(-vip.score$vip),]
  6.  
    vip.score$metabolites = factor(vip.score$metabolites,
  7.  
    levels = vip.score$metabolites)
  8.  
     
  9.  
    loading.score = plsda@loadingMN %>% as.data.frame()
  10.  
    loading.score$metabolites = rownames(loading.score)
  11.  
     
  12.  
    all.score = merge(vip.score, loading.score, by = 'metabolites')
  13.  
     
  14.  
    all.score$cat = paste('A',1:nrow(all.score), sep = '')
  15.  
     
  16.  
    p2 = ggplot(all.score[all.score$vip >= 1,], aes(cat, vip))
  17.  
    geom_segment(aes(x = cat, xend = cat,
  18.  
    y = 0, yend = vip))
  19.  
    geom_point(shape = 21, size = 5, color = '#008000' ,fill = '#008000')
  20.  
    geom_point(aes(1,2.5), color = 'white')
  21.  
    geom_hline(yintercept = 1, linetype = 'dashed')
  22.  
    scale_y_continuous(expand = c(0,0))
  23.  
    labs(x = '', y = 'VIP value')
  24.  
    theme_bw()
  25.  
    theme(legend.position = 'none',
  26.  
    legend.text = element_text(color = 'black',size = 12, family = 'Arial', face = 'plain'),
  27.  
    panel.background = element_blank(),
  28.  
    panel.grid = element_blank(),
  29.  
    axis.text = element_text(color = 'black',size = 15, family = 'Arial', face = 'plain'),
  30.  
    axis.text.x = element_text(angle = 90),
  31.  
    axis.title = element_text(color = 'black',size = 15, family = 'Arial', face = 'plain'),
  32.  
    axis.ticks = element_line(color = 'black'),
  33.  
    axis.ticks.x = element_blank())
  34.  
    ggsave(p2, filename = 'figures/pls_VIP.pdf',
  35.  
    width = 8, height = 5, device = cairo_pdf)

下面这些物质就是差异代谢物(VIP值大于等于1):

r

  1.  
    [1] (gamma)Glu-Leu/Ile
  2.  
    [2] 2-acetamido-4-methylphenyl acetate
  3.  
    [3] 2-Methylhippuric acid
  4.  
    [4] 3-Methylcrotonylglycine
  5.  
    [5] 3,4-Dihydroxybenzeneacetic acid
  6.  
    [6] 3,5-dihydroxybenzoic acid/3,4-dihydroxybenzoic acid
  7.  
    [7] 4-Acetamidobutanoic acid isomer 3
  8.  
    [8] 6-(carboxymethoxy)-hexanoic acid
  9.  
    [9] Acetaminophen glucuronide
  10.  
    [10] Acetylphenylalanine
  11.  
    [11] alpha-N-Phenylacetyl-glutamine
  12.  
    [12] Asp-Leu/Ile isomer 1
  13.  
    [13] Citric acid
  14.  
    [14] Dehydroepiandrosterone 3-glucuronide
  15.  
    [15] Dehydroepiandrosterone sulfate
  16.  
    [16] Gluconic acid and/or isomers
  17.  
    [17] Glucuronic acid and/or isomers
  18.  
    [18] Glyceric acid
  19.  
    [19] Hippuric acid
  20.  
    [20] Hydroxybenzyl alcohol isomer
  21.  
    [21] Malic acid
  22.  
    [22] Methyl (hydroxymethyl)pyrrolidine-carboxylate/Methyl (hydroxy)piperidine-carboxylate
  23.  
    [23] Monoethyl phthalate
  24.  
    [24] N-Acetyl-aspartic acid
  25.  
    [25] Oxoglutaric acid
  26.  
    [26] p-Anisic acid
  27.  
    [27] p-Hydroxyhippuric acid
  28.  
    [28] Pantothenic acid
  29.  
    [29] Pentose
  30.  
    [30] Phe-Tyr-Asp (and isomers)
  31.  
    [31] Pyruvic acid
  32.  
    [32] Testosterone glucuronide
  33.  
    [33] Threonic acid/Erythronic acid
  34.  
    [34] Valerylglycine isomer 1
  35.  
    [35] Valerylglycine isomer 2

将它们的VIP值得进行可视化(某些代谢物名称太长,进行转换表示):

OPLS-DA分析

复制成功

r

  1.  
    # OPLS-DA analysis
  2.  
    oplsda = opls(dataMatrix, genderFc, predI = 1, orthoI = NA)
  3.  
     
  4.  
    # sample scores plot
  5.  
    sample.score = oplsda@scoreMN %>%
  6.  
    as.data.frame() %>%
  7.  
    mutate(gender = sacurine[["sampleMetadata"]][["gender"]],
  8.  
    o1 = oplsda@orthoScoreMN[,1])
  9.  
     
  10.  
    p3 = ggplot(sample.score, aes(p1, o1, color = gender))
  11.  
    geom_hline(yintercept = 0, linetype = 'dashed', size = 0.5)
  12.  
    geom_vline(xintercept = 0, linetype = 'dashed', size = 0.5)
  13.  
    geom_point()
  14.  
    #geom_point(aes(-10,-10), color = 'white')
  15.  
    labs(x = 'P1(5.0%)',y = 'to1')
  16.  
    stat_ellipse(level = 0.95, linetype = 'solid',
  17.  
    size = 1, show.legend = FALSE)
  18.  
    scale_color_manual(values = c('#008000','#FFA74F'))
  19.  
    theme_bw()
  20.  
    theme(legend.position = c(0.1,0.85),
  21.  
    legend.title = element_blank(),
  22.  
    legend.text = element_text(color = 'black',size = 12, family = 'Arial', face = 'plain'),
  23.  
    panel.background = element_blank(),
  24.  
    panel.grid = element_blank(),
  25.  
    axis.text = element_text(color = 'black',size = 15, family = 'Arial', face = 'plain'),
  26.  
    axis.title = element_text(color = 'black',size = 15, family = 'Arial', face = 'plain'),
  27.  
    axis.ticks = element_line(color = 'black'))
  28.  
    ggsave(p3, filename = 'figures/opls.pdf',
  29.  
    width = 5, height = 5, device = cairo_pdf)

可以看到的是OPLS-DA的效果比PLS-DA更好一些。

同样进行差异代谢物筛选:

复制成功

r

  1.  
    # VIP scores plot
  2.  
    vip.score = as.data.frame(oplsda@vipVn)
  3.  
    colnames(vip.score) = 'vip'
  4.  
    vip.score$metabolites = rownames(vip.score)
  5.  
    vip.score = vip.score[order(-vip.score$vip),]
  6.  
    vip.score$metabolites = factor(vip.score$metabolites,
  7.  
    levels = vip.score$metabolites)
  8.  
     
  9.  
    loading.score = oplsda@loadingMN %>% as.data.frame()
  10.  
    loading.score$metabolites = rownames(loading.score)
  11.  
     
  12.  
    all.score = merge(vip.score, loading.score, by = 'metabolites')
  13.  
     
  14.  
    all.score$cat = paste('A',1:nrow(all.score), sep = '')
  15.  
     
  16.  
    p4 = ggplot(all.score[all.score$vip >= 1,], aes(cat, vip))
  17.  
    geom_segment(aes(x = cat, xend = cat,
  18.  
    y = 0, yend = vip))
  19.  
    geom_point(shape = 21, size = 5, color = '#008000' ,fill = '#008000')
  20.  
    geom_point(aes(1,2.5), color = 'white')
  21.  
    geom_hline(yintercept = 1, linetype = 'dashed')
  22.  
    scale_y_continuous(expand = c(0,0))
  23.  
    labs(x = '', y = 'VIP value')
  24.  
    theme_bw()
  25.  
    theme(legend.position = 'none',
  26.  
    legend.text = element_text(color = 'black',size = 12, family = 'Arial', face = 'plain'),
  27.  
    panel.background = element_blank(),
  28.  
    panel.grid = element_blank(),
  29.  
    axis.text = element_text(color = 'black',size = 15, family = 'Arial', face = 'plain'),
  30.  
    axis.text.x = element_text(angle = 90),
  31.  
    axis.title = element_text(color = 'black',size = 15, family = 'Arial', face = 'plain'),
  32.  
    axis.ticks = element_line(color = 'black'),
  33.  
    axis.ticks.x = element_blank())
  34.  
    p4
  35.  
    ggsave(p4, filename = 'figures/opls_VIP.pdf',
  36.  
    width = 8, height = 5, device = cairo_pdf)

下面的的是差异代谢物:

r

  1.  
    [1] (gamma)Glu-Leu/Ile
  2.  
    [2] 2-acetamido-4-methylphenyl acetate
  3.  
    [3] 2-Methylhippuric acid
  4.  
    [4] 3-Methylcrotonylglycine
  5.  
    [5] 3,4-Dihydroxybenzeneacetic acid
  6.  
    [6] 3,5-dihydroxybenzoic acid/3,4-dihydroxybenzoic acid
  7.  
    [7] 4-Acetamidobutanoic acid isomer 3
  8.  
    [8] 6-(carboxymethoxy)-hexanoic acid
  9.  
    [9] Acetaminophen glucuronide
  10.  
    [10] Acetylphenylalanine
  11.  
    [11] alpha-N-Phenylacetyl-glutamine
  12.  
    [12] Asp-Leu/Ile isomer 1
  13.  
    [13] Citric acid
  14.  
    [14] Dehydroepiandrosterone 3-glucuronide
  15.  
    [15] Dehydroepiandrosterone sulfate
  16.  
    [16] Gluconic acid and/or isomers
  17.  
    [17] Glucuronic acid and/or isomers
  18.  
    [18] Glyceric acid
  19.  
    [19] Hippuric acid
  20.  
    [20] Hydroxybenzyl alcohol isomer
  21.  
    [21] Malic acid
  22.  
    [22] Methyl (hydroxymethyl)pyrrolidine-carboxylate/Methyl (hydroxy)piperidine-carboxylate
  23.  
    [23] Monoethyl phthalate
  24.  
    [24] N-Acetyl-aspartic acid
  25.  
    [25] Oxoglutaric acid
  26.  
    [26] p-Anisic acid
  27.  
    [27] p-Hydroxyhippuric acid
  28.  
    [28] Pantothenic acid
  29.  
    [29] Pentose
  30.  
    [30] Phe-Tyr-Asp (and isomers)
  31.  
    [31] Pyruvic acid
  32.  
    [32] Testosterone glucuronide
  33.  
    [33] Threonic acid/Erythronic acid
  34.  
    [34] Valerylglycine isomer 1
  35.  
    [35] Valerylglycine isomer 2

模型训练与预测

这些降维的方法都属于机器学习算法,那就可以对模型进行训练,并用这个模型去预测未知的数据。

模型训练很简单:

复制成功

r

  1.  
    # model training
  2.  
    oplsda.2 = opls(dataMatrix, genderFc, predI = 1, orthoI = NA,subset = "odd")

r

  1.  
    OPLS-DA
  2.  
    92 samples x 109 variables and 1 response
  3.  
    standard scaling of predictors and response(s)
  4.  
    R2X(cum) R2Y(cum) Q2(cum)
  5.  
    Total 0.26 0.825 0.608
  6.  
    RMSEE RMSEP pre ort
  7.  
    Total 0.213 0.341 1 2
  8.  
    Warning message:
  9.  
    'permI' set to 0 because train/test partition is selected.

可以看到使用了92个样本进行训练。

先看看模型在训练集上的准确率:

复制成功

r

  1.  
    # 模型在训练集上的准确率
  2.  
    trainVi = getSubsetVi(oplsda.2)
  3.  
    tab = table(genderFc[trainVi], fitted(oplsda.2))
  4.  
    print(paste('模型准确率:',round(sum(diag(tab))/sum(tab)*100, 2),'%', sep = ''))

r

  1.  
    M F
  2.  
    M 50 0
  3.  
    F 0 42
  4.  
    [1] "模型准确率:100%"

看模型在训练集上的准确率是没有什么意义的,要是在训练集的表现都不好,那模型一定不好。那看看模型在未知数据上的预测准确率吧:

复制成功

r

  1.  
    # model on test data
  2.  
    tab2 = table(genderFc[-trainVi],predict(oplsda.2, dataMatrix[-trainVi, ]))
  3.  
    print(paste('模型准确率:',round(sum(diag(tab2))/sum(tab2)*100, 2),'%', sep = ''))

r

  1.  
    M F
  2.  
    M 43 7
  3.  
    F 7 34
  4.  
    [1] "模型准确率:84.62%"

这个准确率已经很棒了。

差异代谢物其他筛选方法

除了用(O)PLS-DA中的VIP值对代谢物进行筛选,还可以用别的方法进行筛选,如log2FC等。通常是绘制火山图。

复制成功

r

  1.  
    # volcano plot
  2.  
    df = dataMatrix %>% as.data.frame()
  3.  
    df$gender = sacurine[["sampleMetadata"]][["gender"]]
  4.  
    df = df[order(df$gender),]
  5.  
    df = df[,-110]
  6.  
     
  7.  
    M.mean = apply(df[1:100,],2,FUN = mean)
  8.  
    F.mean = apply(df[101:183,],2,FUN = mean)
  9.  
     
  10.  
    FC = M.mean / F.mean
  11.  
    log2FC = log(FC,2)
  12.  
     
  13.  
    pvalue = apply(df, 2, function(x)
  14.  
    {t.test(x[1:100],x[101:183])$p.value})
  15.  
     
  16.  
    p.adj = p.adjust(pvalue, method = 'BH')
  17.  
    p.adj.log = -log10(p.adj)
  18.  
     
  19.  
    colcano.df = data.frame(log2FC,p.adj, p.adj.log)
  20.  
    colcano.df$cat = ifelse(colcano.df$log2FC >= 1 & colcano.df$p.adj < 0.05,'Up',
  21.  
    ifelse(colcano.df$log2FC <= -1 & colcano.df$p.adj < 0.05,'Down','NS'))
  22.  
     
  23.  
    p5 = ggplot(colcano.df, aes(log2FC, p.adj.log))
  24.  
    geom_point()
  25.  
    labs(y = '-log10(p-value.adj)')
  26.  
    theme_bw()
  27.  
    theme(legend.position = 'none',
  28.  
    legend.text = element_text(color = 'black',size = 12, family = 'Arial', face = 'plain'),
  29.  
    panel.background = element_blank(),
  30.  
    panel.grid = element_blank(),
  31.  
    axis.text = element_text(color = 'black',size = 15, family = 'Arial', face = 'plain'),
  32.  
    axis.text.x = element_text(angle = 90),
  33.  
    axis.title = element_text(color = 'black',size = 15, family = 'Arial', face = 'plain'),
  34.  
    axis.ticks = element_line(color = 'black'),
  35.  
    axis.ticks.x = element_blank())
  36.  
    p5
  37.  
    ggsave(p5, filename = '20201214PLSDA分析/figures/volcano.pdf',
  38.  
    width = 5, height = 5, device = cairo_pdf)

可能是这个例子中的代谢物太少了,导致算完以后都没有差异代谢物了。

参考文献

[1] Thévenot E A, Roux A, Xu Y, et al. Analysis of the human adult urinary metabolome variations with age, body mass index, and gender by implementing a comprehensive workflow for univariate and OPLS statistical analyses[J]. Journal of proteome research, 2015, 14(8): 3322-3335.

这篇好文章是转载于:学新通技术网

  • 版权申明: 本站部分内容来自互联网,仅供学习及演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,请提供相关证据及您的身份证明,我们将在收到邮件后48小时内删除。
  • 本站站名: 学新通技术网
  • 本文地址: /boutique/detail/tanhihcfgi
系列文章
更多 icon
同类精品
更多 icon
继续加载