Data needed for difference analysis: Expression Matrix and Grouping Information
TCGA data only needs to express the matrix, because the sample ID of TCGA is quite special. Whether the 14th and 15th digits of the sample ID are >=10
or <10
, it represents whether the sample is a normal sample or a tumor sample.
The starting point of the three R packages for difference analysis is the count matrix (reads count matrix). The count matrix cannot be directly used for difference analysis, so the three R packages have their own processing methods for the count matrix.
If you can't get the count matrix
Ready to work
if(!require(stringr))install.packages('stringr')
if(!require(ggplotify))install.packages("ggplotify")
if(!require(patchwork))install.packages("patchwork")
if(!require(cowplot))install.packages("cowplot")
if(!require(DESeq2))BiocManager::install('DESeq2')
if(!require(edgeR))BiocManager::install('edgeR')
if(!require(limma))BiocManager::install('limma')
rm(list = ls())
load("TCGA-CHOL_gdc.Rdata")
table(Group)
Deseq2 analysis requires two steps:
DESeqDataSetFromMatrix()
function is to generate the data type required by Deseq2 from the matrix;DESeq()
for difference analysislibrary(DESeq2)
#Generate a data frame whose row name is the sample id and the grouping information is listed
colData <- data.frame(row.names =colnames(exp),
condition=Group)
dds <- DESeqDataSetFromMatrix(
countData = exp,
colData = colData,
design = ~ condition)
dds <- DESeq(dds)
save(dds,file = paste0(proj,"_dd.Rdata"))
countData = exp
: Expression matrixcolData = colData
: Express the correspondence between matrix column names and groupingsdesign = ~ condition)
: Experimental design, condition corresponds to the condition in colData, which is the grouping informationdds <- DESeq(dds)
: Difference analysis, although the generated dds is still a Deseq2 object, it can be easily converted into a data frameExtract the difference expression matrix from the difference analysis result
load(file = paste0(proj,"_dd.Rdata"))
res <- results(dds, contrast = c("condition",rev(levels(Group))))
resOrdered <- res[order(res$pvalue),]
DEG <- as.data.frame(resOrdered)
DEG = na.omit(DEG)
View(DEG)
results()
: function: Extract the differential expression results from dds, and the generated object is Deseq2, which can be converted into a data framecontrast
parameter must be written in the following three-element vector format, and the order cannot be reversed (refer to the help document).resOrdered <- res[order(res$pvalue),]
: Sort by P value (only Deseq2 needs, limma and EdgeR will be sorted automatically)DEG <- as.data.frame(resOrdered)
: Convert to data frameDEG = na.omit(DEG)
: If this step is not present, NA will appear after calculation of some genes with very low expression levels, which will cause trouble for subsequent analysis and drawingAdd change column to mark genes up-regulated and down-regulated
#Set the threshold to mean+2sd
logFC_cutoff <- with(DEG,mean(abs(log2FoldChange)) + 2*sd(abs(log2FoldChange)) )
logFC_cutoff
# [1] 3.909132
k1 = (DEG$pvalue < 0.05)&(DEG$log2FoldChange < -logFC_cutoff)
k2 = (DEG$pvalue < 0.05)&(DEG$log2FoldChange > logFC_cutoff)
DEG$change = ifelse(k1,"DOWN",ifelse(k2,"UP","NOT"))
table(DEG$change)
# DOWN NOT UP
# 783 28724 841
head(DEG)
DESeq2_DEG <- DEG #Backup Results
logFC_cutoff
:The logFC_cutoff calculated by different R packages is differentPerform differential expression analysis and extract differential expression matrix
library(edgeR)
#Enter expression matrix and grouping information data
dge <- DGEList(counts=exp,group=Group)
dge$samples$lib.size <- colSums(dge$counts)
dge <- calcNormFactors(dge)
#Writing or not writing 0+ is the same
design <- model.matrix(~0+Group)
rownames(design)<-colnames(dge)
colnames(design)<-levels(Group)
dge <- estimateGLMCommonDisp(dge, design)
dge <- estimateGLMTrendedDisp(dge, design)
dge <- estimateGLMTagwiseDisp(dge, design)
fit <- glmFit(dge, design)
fit2 <- glmLRT(fit, contrast=c(-1,1))
DEG=topTags(fit2, n=nrow(exp))
DEG=as.data.frame(DEG)
View(DEG)
fit2 <- glmLRT(fit, contrast=c(-1,1))
: There is a difference between contrast
and DESeq2
here, here you only need to enter c(-1,1)
, -1
corresponds to normal
, which is the control group, and 1
corresponds to tumor
, which is the experimental group.Add change column to mark genes up-regulated and down-regulated
logFC_cutoff <- with(DEG,mean(abs(logFC)) + 2*sd(abs(logFC)) )
logFC_cutoff
# [1] 4.290788
k1 = (DEG$PValue < 0.05)&(DEG$logFC < -logFC_cutoff)
k2 = (DEG$PValue < 0.05)&(DEG$logFC > logFC_cutoff)
DEG$change = ifelse(k1,"DOWN",ifelse(k2,"UP","NOT"))
head(DEG)
table(DEG$change)
# DOWN NOT UP
# 533 28643 1172
edgeR_DEG <- DEG
library(limma)
design <- model.matrix(~0+Group) # get data: Group
colnames(design)=levels(Group)
rownames(design)=colnames(exp)
dge <- DGEList(counts=exp) #get data: exp
dge <- calcNormFactors(dge)
v <- voom(dge,design, normalize="quantile")
fit <- lmFit(v, design)
# Same as the above two packages, you need to explain who is better than whom
constrasts = paste(rev(levels(Group)),collapse = "-")
constrasts
# [1] "tumor-normal"
cont.matrix <- makeContrasts(contrasts=constrasts,levels = design)
fit2=contrasts.fit(fit,cont.matrix)
fit2=eBayes(fit2)
DEG = topTable(fit2, coef=constrasts, n=Inf)
DEG = na.omit(DEG)
View(DEG)
logFC_cutoff <- with(DEG,mean(abs(logFC)) + 2*sd(abs(logFC)) )
k1 = (DEG$P.Value < 0.05)&(DEG$logFC < -logFC_cutoff)
k2 = (DEG$P.Value < 0.05)&(DEG$logFC > logFC_cutoff)
DEG$change = ifelse(k1,"DOWN",ifelse(k2,"UP","NOT"))
table(DEG$change)
head(DEG)
# DOWN NOT UP
# 1060 28915 373
limma_voom_DEG <- DEG
tj = data.frame(deseq2 = as.integer(table(DESeq2_DEG$change)),
edgeR = as.integer(table(edgeR_DEG$change)),
limma_voom = as.integer(table(limma_voom_DEG$change)),
row.names = c("down","not","up")
);tj
save(DESeq2_DEG,edgeR_DEG,limma_voom_DEG,Group,tj,file = paste0(proj,"_DEG.Rdata"))
Check how many up-regulated and down-regulated genes are obtained in each of the 3 R packages
tj = data.frame(deseq2 = as.integer(table(DESeq2_DEG$change)),
edgeR = as.integer(table(edgeR_DEG$change)),
limma_voom = as.integer(table(limma_voom_DEG$change)),
row.names = c("down","not","up")
tj
# deseq2 edgeR limma_voom
# down 783 533 1060
# not 28724 28643 28915
# up 841 1172 373
The results are still quite different, but there is no right or wrong, but the algorithm is different.
rm(list = ls())
load("TCGA-CHOL_gdc.Rdata")
load("TCGA-CHOL_DEG.Rdata")
# install tinyarray
if(!require(tinyarray))devtools::install_local("tinyarray-master.zip",upgrade = F)
library(ggplot2)
library(tinyarray)
exp[1:4,1:4]
The matrix here is still the count number, and the count matrix is only used for difference analysis
# cpm Remove the influence of library size
dat = log2(cpm(exp)+1) # Get the matrix used for drawing
pca.plot = draw_pca(dat,Group);pca.plot
save(pca.plot,file = paste0(proj,"_pcaplot.Rdata"))
Select the difference genes made by three R packages separately and draw a heat map
cg1 = rownames(DESeq2_DEG)[DESeq2_DEG$change !="NOT"]
cg2 = rownames(edgeR_DEG)[edgeR_DEG$change !="NOT"]
cg3 = rownames(limma_voom_DEG)[limma_voom_DEG$change !="NOT"]
h1 = draw_heatmap(dat[cg1,],Group,n_cutoff = 2)
h2 = draw_heatmap(dat[cg2,],Group,n_cutoff = 2)
h3 = draw_heatmap(dat[cg3,],Group,n_cutoff = 2)
n_cutoff=2
, the color distribution is -2 to 2, and the color beyond this range is the same as -2 or 2 to eliminate the influence of extreme valuesCalculate the mean+2sd
threshold (the first three R packages calculated are all called logFC\_cutoff
, and they are not saved, recalculate here)
m2d = function(x){
mean(abs(x))+2*sd(abs(x))
}
v1 = draw_volcano(DESeq2_DEG,pkg = 1,logFC_cutoff = m2d(DESeq2_DEG$log2FoldChange))
v2 = draw_volcano(edgeR_DEG,pkg = 2,logFC_cutoff = m2d(edgeR_DEG$logFC))
v3 = draw_volcano(limma_voom_DEG,pkg = 3,logFC_cutoff = m2d(limma_voom_DEG$logFC))
pkg
parameter:a integer ,means which Differential analysis packages you used,we support three packages by now, 1,2,3,4 respectively means DESeq2
,edgeR
,limma(voom)
,limma
.Image stitching
library(patchwork)
(h1 + h2 + h3) / (v1 + v2 + v3) +plot_layout(guides = 'collect') &theme(legend.position = "none")
ggsave(paste0(proj,"_heat_vo.png"),width = 15,height = 10)
rm(list = ls())
load("TCGA-CHOL_gdc.Rdata")
load("TCGA-CHOL_DEG.Rdata")
load("TCGA-CHOL_pcaplot.Rdata")
UP=function(df){
rownames(df)[df$change=="UP"]
} #Select to increase ENSEMBLID
DOWN=function(df){
rownames(df)[df$change=="DOWN"]
} # Select to downgrade ENSEMBLID
# Take the intersection of upregulated genes
up = intersect(intersect(UP(DESeq2_DEG),UP(edgeR_DEG)),UP(limma_voom_DEG))
# Remove the intersection of down-regulated genes
down = intersect(intersect(DOWN(DESeq2_DEG),DOWN(edgeR_DEG)),DOWN(limma_voom_DEG))
Draw a heat map of common difference genes
dat = log2(cpm(exp)+1)
hp = draw_heatmap(dat[c(up,down),],Group,n_cutoff = 2)
Draw Venn diagrams for up-regulated and down-regulated genes
The list accepted by the drawing function here must be a named list, and the name is the name of each category that will appear on the graph
up_genes = list(Deseq2 = UP(DESeq2_DEG),
edgeR = UP(edgeR_DEG),
limma = UP(limma_voom_DEG))
down_genes = list(Deseq2 = DOWN(DESeq2_DEG),
edgeR = DOWN(edgeR_DEG),
limma = DOWN(limma_voom_DEG))
# Quotation mark is the name of the figure
up.plot <- draw_venn(up_genes,"UPgene")
down.plot <- draw_venn(down_genes,"DOWNgene")
Image stitching
library(patchwork)
#up.plot + down.plot
# Image stitching
pca.plot + hp+up.plot +down.plot+ plot_layout(guides = "collect")
ggsave(paste0(proj,"_heat_ve_pca.png"),width = 15,height = 10)