install.packages("BiocManager") BiocManager::install(c("AnnotationDbi", "impute","GO.db", "preprocessCore"))install.packages(c("matrixStats", "Hmisc","foreach", "doParallel", "fastcluster", "dynamicTreeCut", "survival"))install.packages(c("WGCNA", "stringr", "reshape2"))
如果R语言3.5以下版本,安装命令如下:source("https://bioconductor.org/biocLite.R")biocLite(c("AnnotationDbi", "impute","GO.db", "preprocessCore"))install.packages(c("matrixStats", "Hmisc","foreach", "doParallel", "fastcluster", "dynamicTreeCut", "survival"))install.packages(c("WGCNA", "stringr", "reshape2"))
专有名词共表达网络:定义为加权基因网络点代表基因,边代表基因表达相关性;Module(模块):高度內连的基因集在无向网络中,模块内是高度相关的基因在有向网络中,模块内是高度正相关的基因;邻接矩阵(Adjacency matrix):基因和基因之间的加权相关性值构成的矩阵;软阈值:相关性值进行幂次运算幂次的值也就是软阈值;连接度(Connectivity):类似于网络中 "度"degree)的概念每个基因的连接度是与其相连的基因的边属性之和;# 加载库library(WGCNA);
Warning message:"package 'WGCNA' was built under R version 3.6.1"Loading required package: dynamicTreeCutLoading required package: fastclusterAttaching package: 'fastcluster'The following object is masked from 'package:stats': hclustAttaching package: 'WGCNA'The following object is masked from 'package:stats': cor
1 数据输入和清洗主要步骤如下:加载基因表达数据数据清洗加载临床特征数据数据下载地址:FemaleLiver-Data.zip下载的数据包FemaleLiver-Data.zip解压后包含文件为:ClinicalTraits.csvGeneAnnotation.csvLiverFemale3600.csv1.1 加载基因表达数据# 读取文件# The following setting is important, do not omit.# 如果没有显式地指定“stringsAsFactors=FALSE”,默认会将所有的字符串转换为因子,导致数据处理速度较慢options(stringsAsFactors = FALSE)# Read in the female liver data set 读取135个雌性小鼠的数据femData = read.csv("./data/LiverFemale3600.csv")# Take a quick look at what is in the data set:# 查看数据的维度dim(femData)# 预览数据head(femData)
3600143A data.frame: 6 × 143# 删除冗余数据-c(1:8)删除前8列数据,t()转置数据datExpr0 = as.data.frame(t(femData[, -c(1:8)]));head(datExpr0)
A data.frame: 6 × 3600# 将原数据的行列名复制过来names(datExpr0) = femData$substanceBXH;rownames(datExpr0) = names(femData)[-c(1:8)];head(datExpr0)
A data.frame: 6 × 3600gsg = goodSamplesGenes(datExpr0, verbose = 3)gsg$allOK;
Flagging genes and samples with too many missing values... ..step 1
TRUE如果样本检查语句返回TRUE,那么没有缺失值如果不是的话,我们会移除那些有问题的基因和样本通过以下代码来移除缺失值if (!gsg$allOK){ # Optionally, print the gene and sample names that were removed: # 打印删除的基因和样本名称 if (sum(!gsg$goodGenes)>0) printFlush(paste("Removing genes:", paste(names(datExpr0)[!gsg$goodGenes], collapse = ", "))); if (sum(!gsg$goodSamples)>0) printFlush(paste("Removing samples:", paste(rownames(datExpr0)[!gsg$goodSamples], collapse = ", "))); # Remove the offending genes and samples from the data: # 从数据中删除有问题的基因和样本 datExpr0 = datExpr0[gsg$goodSamples, gsg$goodGenes]}
接下来我们对样本进行聚类(与随后的基因聚类相比),看看是否有明显的异常值# hclusts聚类算法, dist计算基因之间的距离sampleTree = hclust(dist(datExpr0), method = "average");# Plot the sample tree: Open a graphic output window of size 12 by 9 inches# The user should change the dimensions if the window is too large or too small.# 绘制聚类树,sizeGrWindow设置绘图窗口大小# sizeGrWindow(16,9)pdf(file = "./plot/sampleClustering.pdf", width = 12, height = 9);# 设置文字大小par(cex = 0.5);# 设置图像边距c(bottom, left, top, right) # par(mar = c(0,4,2,0))# 画图 main标题,sub子标题,xlab x轴标题,cex.lab标题字体大小,cex.axis坐标轴刻度大小,cex.main主标题字体plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5, cex.axis = 1.5, cex.main = 2)dev.off()
png: 2由上图图能够看出,F2_221 这个样本和其他样本差距非常大,需要将该样本过滤掉过滤代码如下# Plot a line to show the cut# 设置文字大小par(cex = 0.5);plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5, cex.axis = 1.5, cex.main = 2)# 在上图上画红线abline(h = 15, col = "red");# Determine cluster under the line# 剪枝算法,cutHeight 修剪树枝的高度 minSize集群最小数clust = cutreeStatic(sampleTree, cutHeight = 15, minSize = 10)# 剪枝结果table(clust)# clust 1 contains the samples we want to keepkeepSamples = (clust==1)# 符合要求的数据datExpr = datExpr0[keepSamples, ]# 提取行nSamples = nrow(datExpr)# 提取列nGenes = ncol(datExpr)
clust 0 1 1 134
1.3 加载临床特征数据我们现在读取性状数据,并将测量它们的样本与表达样本相匹配traitData = read.csv("./data/ClinicalTraits.csv");dim(traitData)#names(traitData)# remove columns that hold information we do not need.# 删除不需要的列allTraits = traitData[, -c(31, 16)];allTraits = allTraits[, c(2, 11:36) ];dim(allTraits)head(allTraits)# names(allTraits)
3613836127A data.frame: 6 × 27# 形成一个类似于表达数据的数据框架,以保存临床特征# 提取行名femaleSamples = rownames(datExpr)# 数据匹配 返回匹配行traitRows = match(femaleSamples, allTraits$Mice);# 提取指定要求行datTraits = allTraits[traitRows, -1];# 提取行名rownames(datTraits) = allTraits[traitRows, 1];# 垃圾回收collectGarbage();
现在我们有了变量datExpr中的表达式数据,以及变量dattitries中相应的临床特征在我们继续进行网络构建和模块检测之前,我们将可视化临床特征与样本树状图的关系# Re-cluster samples# 画聚类图sampleTree2 = hclust(dist(datExpr), method = "average")# Convert traits to a color representation: white means low, red means high, grey means missing entry# 画表型的热图# 将特征转换为颜色表示:白色表示低,红色表示高,灰色表示缺少条目# 如果signed为true 以绿色开头代表最大负值,以白色开头代表零附近的值,然后变为红色代表正值traitColors = numbers2colors(datTraits, signed =FALSE);# Plot the sample dendrogram and the colors underneath.# 绘制出树状图和下面的颜色 plotDendroAndColors(sampleTree2, traitColors,groupLabels = names(datTraits),main = "Sample dendrogram and trait heatmap")
2 建设表达网络与模块检测此步骤是使用WGCNA方法进行所有网络分析的基础我们提出三种不同的方法构建网络并识别模块:使用方便的一步网络结构和模块检测功能,适合希望以最小努力达到结果的用户;为希望使用定制/替代方法进行实验的用户逐步构建网络和模块检测;一种自动分块网络结构和模块检测方法,适用于希望分析太大而无法同时分析的数据集的用户主要步骤如下:自动一步构建网络与模块检测其他检测算法2.1 自动一步构建网络与模块检测在本教程中,我们将演示一步式自动网络构建和模块检测,主要步骤有:软阈值的选择:网络拓扑分析一步构建网络与模块检测2.1.1 软阈值的选择:网络拓扑分析构建一个加权基因网络需要选择软阈值幂β来计算邻接矩阵权重参数,即将基因间的相关系数进行乘方运算来表征其相关性,首先需要确定乘方的值# Choose a set of soft-thresholding powers# 给出候选的β值,c(1:10)表示1到10;seq(from = 12, to=20, by=2)表示从12开始间隔两个数到20powers = c(c(1:10), seq(from = 12, to=20, by=2))powers# Call the network topology analysis function 调用网络拓扑分析函数# verbose表示输出结果详细程度sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 0);
123456789101214161820Warning message:"executing %dopar% sequentially: no parallel backend registered" Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.1 1 0.0278 0.345 0.456 747.00 762.0000 1210.02 2 0.1260 -0.597 0.843 254.00 251.0000 574.03 3 0.3400 -1.030 0.972 111.00 102.0000 324.04 4 0.5060 -1.420 0.973 56.50 47.2000 202.05 5 0.6810 -1.720 0.940 32.20 25.1000 134.06 6 0.9020 -1.500 0.962 19.90 14.5000 94.87 7 0.9210 -1.670 0.917 13.20 8.6800 84.18 8 0.9040 -1.720 0.876 9.25 5.3900 76.39 9 0.8590 -1.700 0.836 6.80 3.5600 70.510 10 0.8330 -1.660 0.831 5.19 2.3800 65.811 12 0.8530 -1.480 0.911 3.33 1.1500 58.112 14 0.8760 -1.380 0.949 2.35 0.5740 51.913 16 0.9070 -1.300 0.970 1.77 0.3090 46.814 18 0.9120 -1.240 0.973 1.39 0.1670 42.515 20 0.9310 -1.210 0.977 1.14 0.0951 38.7
# sft这中保存了每个powers值计算出来的网络特征,其中powerEstimate就是最佳power值,fitIndices保存了每个power对应的网络的特征str(sft)
List of 2 $ powerEstimate: num 6 $ fitIndices :'data.frame': 15 obs. of 7 variables: ..$ Power : num [1:15] 1 2 3 4 5 6 7 8 9 10 ... ..$ SFT.R.sq : num [1:15] 0.0278 0.1264 0.3404 0.5062 0.6807 ... ..$ slope : num [1:15] 0.345 -0.597 -1.03 -1.422 -1.716 ... ..$ truncated.R.sq: num [1:15] 0.456 0.843 0.972 0.973 0.94 ... ..$ mean.k. : num [1:15] 747 254.5 111 56.5 32.2 ... ..$ median.k. : num [1:15] 761.7 250.8 101.7 47.2 25.1 ... ..$ max.k. : num [1:15] 1206 574 324 202 134 ...
# Plot the results 结果绘图# 设置窗格大小#sizeGrWindow(9, 5)# 设置图的显示一行两列# par(mfrow = c(1,2));cex1 = 0.9;# Scale-free topology fit index as a function of the soft-thresholding power# 生成阈值和网络的特征之间的关系函数plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])sft$fitIndices[,2],xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",main = paste("Scale independence"))text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])sft$fitIndices[,2],labels=powers,cex=cex1,col="red");# this line corresponds to using an R^2 cut-off of habline(h=0.90,col="red")# sft$fitIndices 保存了每个power构建的相关性网络中的连接度的统计值,k就是连接度值,每个power值提供了max, median, max3种连接度的统计量# 对连接度的均值进行可视化# Mean connectivity as a function of the soft-thresholding powerplot(sft$fitIndices[,1], sft$fitIndices[,5],xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",main = paste("Mean connectivity"))text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
2.1.2 一步构建网络与模块检测确定好power值即可构建基因网络,构建基因网络和识别模块现在是一个简单的函数调用:# datExpr表达数据,TOMType拓扑重叠矩阵计算方式,minModuleSize用于模块检测的最小模块尺寸,# reassignThreshold 是否在模块之间重新分配基因的p值比率阈值,mergeCutHeight 树状图切割高度# numericLabels 返回的模块应该用颜色(FALSE)还是数字(TRUE)标记,pamRespectsDendro树状图相关参数# saveTOMs 字符串的向量,saveTOMFileBase 包含包含共识拓扑重叠文件的文件名库的字符串net = blockwiseModules(datExpr, power = sft$powerEstimate,TOMType = "unsigned", minModuleSize = 30,reassignThreshold = 0, mergeCutHeight = 0.25,numericLabels = TRUE, pamRespectsDendro = FALSE,saveTOMs = TRUE, saveTOMFileBase = "femaleMouseTOM",verbose = 3)
Calculating module eigengenes block-wise from all genes Flagging genes and samples with too many missing values... ..step 1Cluster size 3600 broken into 2108 1492 Cluster size 2108 broken into 1126 982 Done cluster 1126 Done cluster 982 Done cluster 2108 Done cluster 1492 ..Working on block 1 . TOM calculation: adjacency.. ..will not use multithreading. Fraction of slow calculations: 0.396405 ..connectivity.. ..matrix multiplication (system BLAS).. ..normalization.. ..done. ..saving TOM for block 1 into file femaleMouseTOM-block.1.RData ....clustering.. ....detecting modules.. ....calculating module eigengenes.. ....checking kME in modules.. ..removing 1 genes from module 1 because their KME is too low. ..removing 1 genes from module 7 because their KME is too low. ..removing 1 genes from module 8 because their KME is too low. ..removing 1 genes from module 21 because their KME is too low. ..merging modules that are too close.. mergeCloseModules: Merging modules whose distance is less than 0.25 Calculating new MEs...
我们现在回到网络分析要查看标识了多少个模块以及模块大小table(net$colors)
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 99 609 460 409 316 312 221 211 157 123 106 100 94 91 77 76 58 47 34
指示有18个模块,按大小降序标记为1至18,大小范围为609至34个基因 标签0保留用于所有模块外部的基因 用于模块识别的分层聚类树状图(树)以net $ dendrograms [[1]]返回; 可以使用以下代码将树状图与颜色分配一起显示:# open a graphics window# sizeGrWindow(12, 9)# Convert labels to colors for plotting# 将标签转化为绘图颜色mergedColors = labels2colors(net$colors)# Plot the dendrogram and the module colors underneath# 绘制树状图和下面的模块颜色# dendroLabels树状图标签设置为FALSE完全禁用树状图标签;设置为NULL使用的行标签datExpr# addGuide是否应在树状图中添加垂直的“指导线”?线条使识别单个样本的颜色代码更加容易plotDendroAndColors(net$dendrograms[[1]], mergedColors[net$blockGenes[[1]]],"Module colors", dendroLabels = FALSE, hang = 0.03,addGuide = TRUE, guideHang = 0.05)
结果图如上图所示我们注意到,如果用户想更改条件,则该软件包提供了recutBlockwiseTrees函数,该函数可以应用修改后的条件而不必重新计算网络和聚类树状图 这样可以节省大量时间 现在,我们保存后续分析所需的模块分配和模块本征信息moduleLabels = net$colorsmoduleColors = labels2colors(net$colors)MEs = net$MEs;geneTree = net$dendrograms[[1]];save(MEs, moduleLabels, moduleColors, geneTree,file = "FemaleLiver-02-networkConstruction-auto.RData")
2.2 其他检测算法其他算法包括分步网络构建和模块检测、处理大型数据集:逐块网络构建和模块检测就不再表述具体见下列链接:https://horvath.genetics.ucla.edu/html/CoexpressionNetwork/Rpackages/WGCNA/Tutorials/FemaleLiver-02-networkConstr-man.pdf https://horvath.genetics.ucla.edu/html/CoexpressionNetwork/Rpackages/WGCNA/Tutorials/FemaleLiver-02-networkConstr-blockwise.pdf3 筛选与表型相关的模块主要步骤如下:量化模块-特质关联基因与性状和重要模块的关系:基因重要性和模块成员模块内分析:鉴定具有高GS和MM的基因网络分析结果总结3.1 量化模块-特质关联在此分析中,我们想确定与所测量的临床特征显着相关的模块 由于我们已经为每个模块建立了一个概要文件(特征基因),因此我们只需将特征基因与外部特征相关联,然后寻找最重要的关联,实际上是计算模块的ME值与表型的相关系数:# Define numbers of genes and samples# 获得基因数和样本数nGenes = ncol(datExpr);nSamples = nrow(datExpr);# Recalculate MEs with color labels# 用彩色标签重新计算MEs# 在给定的单个数据集中计算模块的模块本征基因MEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenes# 对给定的(特征)向量进行重新排序,以使相似的向量(通过相关性度量)彼此相邻MEs = orderMEs(MEs0)# 计算module的ME值与表型的相关系数moduleTraitCor = cor(MEs, datTraits, use = "p");moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples);
由于我们的模块和特征数量适中,因此合适的图形表示形式将有助于阅读表格 我们通过相关值对每个关联进行颜色编码:names(MEs)
'MEmagenta''MEblack''MEturquoise''MEgreen''MElightcyan''MEblue''MEbrown''MEred''MEsalmon''MEyellow''MElightgreen''MEgreenyellow''MEgrey60''MEpink''MEpurple''MEtan''MEcyan''MEmidnightblue''MEgrey'# sizeGrWindow(10,6)# 显示相关性及其p值textMatrix = paste(signif(moduleTraitCor, 2), "\n(",signif(moduleTraitPvalue, 1), ")", sep = "");dim(textMatrix) = dim(moduleTraitCor)par(mar = c(6, 8.5, 3, 3));# Display the correlation values within a heatmap plot\# ySymbols 当ylabels使用时所使用的其他标签; colorLabels 应该使用颜色标签吗# colors 颜色; textMatrix 单元格名字labeledHeatmap(Matrix = moduleTraitCor,xLabels = names(datTraits),yLabels = names(MEs),ySymbols = names(MEs), colorLabels = FALSE,colors = greenWhiteRed(50),textMatrix = textMatrix,setStdMargins = FALSE, cex.text = 0.4,zlim = c(-1,1),main = paste("Module-trait relationships"))
Warning message in greenWhiteRed(50):"WGCNA::greenWhiteRed: this palette is not suitable for peoplewith green-red color blindness (the most common kind of color blindness).Consider using the function blueWhiteRed instead."
sizeGrWindow(10,6)# Will display correlations and their p-valuesdim(textMatrix) = dim(moduleTraitCor)par(mar = c(6, 8.5, 3, 3));# Display the correlation values within a heatmap plotlabeledHeatmap(Matrix = moduleTraitCor,xLabels = names(datTraits),yLabels = names(MEs),ySymbols = names(MEs),colorLabels = FALSE,colors = greenWhiteRed(50),textMatrix = textMatrix,setStdMargins = FALSE,cex.text = 0.5,zlim = c(-1,1),main = paste("Module-trait relationships"))
Warning message in greenWhiteRed(50):"WGCNA::greenWhiteRed: this palette is not suitable for peoplewith green-red color blindness (the most common kind of color blindness).Consider using the function blueWhiteRed instead."
3.2 基因与性状和重要模块的关系:基因重要性和模块成员我们量化阵列上所有基因与每个模块的相似性寻找重要模块# Define variable weight containing the weight column of datTrait# 定义包含数据特征权重列的变量权重weight = as.data.frame(datTraits$weight_g);names(weight) = "weight"geneModuleMembership = as.data.frame(cor(datExpr, MEs, use = "p"));# 模块的名称(颜色) substring提取文本从第3个字母开始modNames = substring(names(MEs), 3)# 基因和模块的相关系数geneModuleMembership = as.data.frame(cor(datExpr, MEs, use = "p"));MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples));names(geneModuleMembership) = paste("MM", modNames, sep="");names(MMPvalue) = paste("p.MM", modNames, sep="");#gene和性状的关系geneTraitSignificance = as.data.frame(cor(datExpr, weight, use = "p"));GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples));names(geneTraitSignificance) = paste("GS.", names(weight), sep="");names(GSPvalue) = paste("p.GS.", names(weight), sep="");
3.3 模内分析:鉴定具有高GS和MM的基因使用GS和MM度量,我们可以鉴定出对体重以及在感兴趣的模块中具有较高模块成员性具有重要意义的基因 例如,我们看一下与重量关联最高的棕色模块 我们在棕色模块中绘制了基因重要性与模块成员关系的散点图 在此模块中,GS和MM之间存在高度显着的相关性# 模型颜色module = "brown"# 匹配列column = match(module, modNames);moduleGenes = moduleColors==module;#sizeGrWindow(7, 7);par(mfrow = c(1,1));# 画散点图verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]), abs(geneTraitSignificance[moduleGenes, 1]), xlab = paste("Module Membership in", module, "module"), ylab = "Gene significance for body weight", main = paste("Module membership vs. gene significance\n"), cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)
3.4 网络分析结果总结我们发现模块与我们的兴趣特征高度相关,并通过模块成员度量标准确定了其主要参与者 现在,我们将此统计信息与基因注释合并,并写出一个文件,该文件总结了最重要的结果,并且可以在标准电子表格软件(例如MS Excel或Open Office Calc)中进行检查 我们的表达式数据仅由探针ID名称注释:# 提取表带数据样本名称# names(datExpr);# 指定颜色数据名称# names(datExpr)[moduleColors=="brown"]
# 基因注释数据annot = read.csv(file = "./data/GeneAnnotation.csv");dim(annot)names(annot)probes = names(datExpr)probes2annot = match(probes, annot$substanceBXH)# The following is the number or probes without annotation:sum(is.na(probes2annot))
2338834'X''ID''arrayname''substanceBXH''gene_symbol''LocusLinkID''OfficialGeneSymbol''OfficialGeneName''LocusLinkSymbol''LocusLinkName''ProteomeShortDescription''UnigeneCluster''LocusLinkCode''ProteomeID''ProteomeCode''SwissprotID''OMIMCode''DirectedTilingPriority''AlternateSymbols''AlternateNames''SpeciesID''cytogeneticLoc''Organism''clustername''reporterid''probeid''sequenceid''clusterid''chromosome''startcoordinate''endcoordinate''strand''sequence_3_to_5_prime''sequence_5_to_3_prime'0现在,我们创建一个数据框,其中包含所有探针的以下信息:探针ID,基因符号,基因座ID(Entrez码),模块颜色,重量的基因重要性以及模块中所有模块的成员和p值 这些模块将按照其重量的重要性进行排序,而最重要的模块则位于左侧# Create the starting data framegeneInfo0 = data.frame(substanceBXH = probes,geneSymbol = annot$gene_symbol[probes2annot],LocusLinkID = annot$LocusLinkID[probes2annot],moduleColor = moduleColors,geneTraitSignificance,GSPvalue)# Order modules by their significance for weightmodOrder = order(-abs(cor(MEs, weight, use = "p")));# Add module membership information in the chosen orderfor (mod in 1:ncol(geneModuleMembership)){ oldNames = names(geneInfo0) geneInfo0 = data.frame(geneInfo0, geneModuleMembership[, modOrder[mod]], MMPvalue[, modOrder[mod]]); names(geneInfo0) = c(oldNames, paste("MM.", modNames[modOrder[mod]], sep=""), paste("p.MM.", modNames[modOrder[mod]], sep=""))}# Order the genes in the geneInfo variable first by module color, then by geneTraitSignificancegeneOrder = order(geneInfo0$moduleColor, -abs(geneInfo0$GS.weight));geneInfo = geneInfo0[geneOrder, ]write.csv(geneInfo, file = "geneInfo.csv")
4 使用WGCNA进行网络可视化主要步骤如下:显示基因网络可视化特征基因网络4.1 显示基因网络可视化加权网络的一种方法是绘制其热图,热图的每一行和每一列都对应一个基因 热图可以描述邻接或拓扑重叠,浅色表示低邻接(重叠),而深色表示更高的邻接(重叠) 另外,沿着热图的顶部和左侧绘制了基因树状图和模块颜色# Calculate topological overlap anew: this could be done more efficiently by saving the TOM# calculated during module detection, but let us do it again here.# 重新计算拓扑重叠:通过保存TOM可以更有效地完成此操作# 是在模块检测期间计算的,但让我们在这里再次进行dissTOM = 1-TOMsimilarityFromExpr(datExpr, power = 6);# Transform dissTOM with a power to make moderately strong connections more visible in the heatmap# 变换dissTOMplotTOM = dissTOM^7;# Set diagonal to NA for a nicer plotdiag(plotTOM) = NA;# Call the plot function# sizeGrWindow(9,9)# 基因的聚类树聚类时的距离为1-TOM值结合基因间的距离,即1-TOM值,用热图展示# TOMplot(plotTOM, geneTree, moduleColors, main = "Network heatmap plot, all genes")
TOM calculation: adjacency....will not use multithreading. Fraction of slow calculations: 0.396405..connectivity....matrix multiplication (system BLAS)....normalization....done.
请注意,生成热图图可能要花费大量时间 可以限制基因数量以加快绘图速度 但是,一个基因子集的基因树状图通常看起来与所有基因的基因树状图不同 在以下示例中,我们将绘制的基因数限制为400:nSelect = 400# For reproducibility, we set the random seedset.seed(10);select = sample(nGenes, size = nSelect);selectTOM = dissTOM[select, select];# There’s no simple way of restricting a clustering tree to a subset of genes, so we must re-cluster.# 重新画聚类图selectTree = hclust(as.dist(selectTOM), method = "average")selectColors = moduleColors[select];# Open a graphical window# sizeGrWindow(9,9)# Taking the dissimilarity to a power, say 10, makes the plot more informative by effectively changing# the color palette; setting the diagonal to NA also improves the clarity of the plotplotDiss = selectTOM^7;diag(plotDiss) = NA;TOMplot(plotDiss, selectTree, selectColors, main = "Network heatmap plot, selected genes")
4.2 可视化特征基因网络研究找到的模块之间的关系通常很有趣 可以使用特征基因作为代表特征,并通过特征基因相关性来量化模块相似性 该软件包包含一个方便的函数plotEigengeneNetworks,该函数生成特征基因网络的摘要图 通常,向特征基因添加临床特征(或多个特征)以了解特征如何适合特征基因网络是有益的# Recalculate module eigengenes# 重新计算基因特征值MEs = moduleEigengenes(datExpr, moduleColors)$eigengenes# Isolate weight from the clinical traitsweight = as.data.frame(datTraits$weight_g);names(weight) = "weight"# Add the weight to existing module eigengenesMET = orderMEs(cbind(MEs, weight))# Plot the relationships among the eigengenes and the trait#sizeGrWindow(5,7.5);par(cex = 0.9)# 画树形图# marDendro给出树状图的边距设置,marHeatmap热图边距设置plotEigengeneNetworks(MET, "", marDendro = c(0,4,1,2), marHeatmap = c(3,4,1,2), cex.lab = 0.8, xLabelsAngle= 90)
该函数生成特征基因和特征的树状图,以及它们之间关系的热图 要拆分树状图和热图图,我们可以使用以下代码# Plot the dendrogram# sizeGrWindow(6,6);par(cex = 1.0)plotEigengeneNetworks(MET, "Eigengene dendrogram", marDendro = c(0,4,2,0),plotHeatmaps = FALSE)# Plot the heatmap matrix (note: this plot will overwrite the dendrogram plot)par(cex = 1.0)plotEigengeneNetworks(MET, "Eigengene adjacency heatmap", marHeatmap = c(3,4,2,2),plotDendrograms = FALSE, xLabelsAngle = 90)
上图为上述代码的输出本征基因树状图和热图识别了被称为元模块的相关本征基因群例如,树状图表明,红、棕、蓝三个模块具有高度的相关性,它们之间的相互关系强于它们与体重的相关性5 参考https://horvath.genetics.ucla.edu/html/CoexpressionNetwork/Rpackages/WGCNA/Tutorials/ https://www.jianshu.com/p/e9cc3f43441dhttps://www.jianshu.com/p/25905a905086(图片来源网络,侵删)
0 评论