After introducing the query download and data analysis functions of TCGAbiolinks
, we briefly show a few examples to practice hands and deepen the understanding and use of this package
We mainly start from:
The 3 dimensions are illustrated with examples
Copy number variation (CNV
) plays an important role in the occurrence and development of cancer. Due to genome rearrangement (such as chromosome deletion, duplication, insertion and translocation), resulting in the amplification or censorship of chromosome fragments.
CNV
is the amplification and censorship of copy number of genomic regions larger than 1kb
.
TCGA's data can be divided into three levels:
level 1
: original sequencing data (fasta, fastq, etc.)level 2
: compared bam fileslevel 3
: standardized dataWe need to obtain level 3
We analyze the data of the Affymetrix Genome-Wide Human SNP Array 6.0 platform in the legacy database
Obtain CNV data of 20 GBM and LGG samples data for analysis to identify cancer genome mutations
query.gbm.nocnv <- GDCquery(
project = "TCGA-GBM",
data.category = "Copy number variation",
legacy = TRUE,
file.type = "nocnv_hg19.seg",
sample.type = c("Primary Tumor")
)
# Only select 20 samples
query.gbm.nocnv$results[[1]] <- query.gbm.nocnv$results[[1]][1:20,]
GDCdownload(query.gbm.nocnv)
cnvMatrix <- GDCprepare(query.gbm.nocnv)
> head(cnvMatrix)
# A tibble: 6 x 6
Sample Chromosome Start End Num_Probes Segment_Mean
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 TCGA-28-2513-01A-01D-0784-01 1 3218610 74709711 40434 -0.0175
2 TCGA-28-2513-01A-01D-0784-01 1 74709724 74715340 3 -1.21
3 TCGA-28-2513-01A-01D-0784-01 1 74715466 233232858 79323 0.0051
4 TCGA-28-2513-01A-01D-0784-01 1 233233557 233234188 2 -1.56
5 TCGA-28-2513-01A-01D-0784-01 1 233234531 247813706 9387 0.0168
6 TCGA-28-2513-01A-01D-0784-01 2 484222 20397902 12886 0.0123
Cancer-related CNVs will appear repeatedly in many samples. We use the GAIA package to identify this remarkable recurrent CNV
The GAIA
algorithm is based on the statistical test of random arrangement, while considering the significance and homogeneity within the sample, iteratively deletes the peaks in the interval until the remaining peaks are not significant to identify the significant CNV
This package requires not only the observation data of the input sample, but also the metadata of the genomic probe, which we can download from the broadinstitute website
gdac.root <- "ftp://ftp.broadinstitute.org/pub/GISTIC2.0/hg19_support/"
file <- paste0(gdac.root, "genome.info.6.0_hg19.na31_minus_frequent_nan_probes_sorted_2.1.txt")
if(!file.exists(basename(file)))
downloader::download(file, file.path("~/Downloads", basename(file)))
If the download is wrong, you can manually copy the link to the browser to download
markersMatrix <- readr::read_tsv(
file.path("~/Downloads", basename(file)),
col_names = FALSE, col_types = "ccn",
progress = FALSE
)
> head(markersMatrix)
# A tibble: 6 x 3
X1 X2 X3
<chr> <chr> <dbl>
1 CN_473963 1 61735
2 CN_473964 1 61808
3 CN_473965 1 61823
4 CN_477984 1 62152
5 CN_473981 1 62920
6 CN_473982 1 62937
Process CNV data matrix
cnvMatrix %<>%
filter(abs(Segment_Mean) > 0.3) %>%
mutate(label = if_else(Segment_Mean < -0.3, 0, 1)) %>%
dplyr::select(-Segment_Mean) %>%
`names<-`(c("Sample.Name", "Chromosome", "Start", "End", "Num.of.Markers", "Aberration")) %>%
mutate(Chromosome = ifelse(Chromosome == "X", 23, Chromosome),
Chromosome = ifelse(Chromosome == "Y", 24, Chromosome),
Chromosome = as.integer(Chromosome)) %>%
# Be careful to convert to data.frame format
as.data.frame()
Download some common CNV area data, you need to filter out these areas
file <- "ftp://ftp.broadinstitute.org/pub/GISTIC2.0/hg19_support/CNV.hg19.bypos.111213.txt"
if(!file.exists(basename(file)))
downloader::download(file, file.path("~/Downloads", basename(file)))
commonCNV <- readr::read_tsv(
file.path("~/Downloads", basename(file)),
progress = FALSE
) %>%
mutate(markerID = paste(Chromosome, Start, sep = ":"))
Process probe data
markersMatrix_filtered <- markersMatrix %>%
`names<-`(c("Probe.Name", "Chromosome", "Start")) %>%
mutate(Chromosome = ifelse(Chromosome == "X", 23, Chromosome),
Chromosome = ifelse(Chromosome == "Y", 24, Chromosome),
Chromosome = as.integer(Chromosome)) %>%
mutate(markerID = paste(Chromosome, Start, sep = ":")) %>%
filter(!duplicated(markerID)) %>%
# Filter some common CNVs
filter(!markerID %in% commonCNV$markerID)
Run gaia
, identify recurrent CNV
library(gaia)
set.seed(200)
markers_obj <- load_markers(as.data.frame(markersMatrix_filtered))
nbsamples <- length(unique(cnvMatrix$Sample.Name))
cnv_obj <- load_cnv(cnvMatrix, markers_obj, nbsamples)
suppressWarnings({
cancer <- "GBM"
results <- runGAIA(
cnv_obj, markers_obj,
output_file_name = paste0("~/Downloads/GAIA_", cancer, "_flt.txt"),
# Specify the type of variation to be analyzed, -1 to analyze all the variations
aberrations = -1,
# Specify the chromosomes to be analyzed, the default is -1, which means all chromosomes
chromosomes = -1,
# Using approximate methods can speed up the calculation
approximation = TRUE,
# Set the number of iterations
num_iterations = 5000,
threshold = 0.25
)
})
plot result
RecCNV <- as_tibble(results) %>%
mutate_at(vars("q-value"), as.numeric) %>%
mutate_at(vars(-"q-value"), as.integer) %>% {
# If q-value is 0, set its value to the smallest non-zero value in the data
minval = min(.$`q-value`[which(.$`q-value` != 0)])
mutate(., `q-value` = ifelse(`q-value` == 0, minval, `q-value`))
} %>%
mutate(score = -log10(`q-value`)) %>%
as.data.frame()
threshold <- 0.3
gaiaCNVplot(RecCNV,threshold)
save(results, RecCNV, threshold, file = paste0("~/Downloads/", cancer,"_CNV_results.rda"))
After using GAIA
to identify the recurring genomic region variation in cancer, we need to annotate it to the corresponding gene.
We use biomaRt
to obtain the genomic range information of all human genes, and then map the significantly mutated genomic regions to the corresponding genes
library(GenomicRanges)
# Use biomart to get the gene information in the GENCODE database
genes <- TCGAbiolinks:::get.GRCh.bioMart(genome = "hg19") %>%
filter(external_gene_name != "" & chromosome_name %in% c(1:22, "X", "Y")) %>%
mutate(
chromosome_name = ifelse(
chromosome_name == "X", 23, ifelse(chromosome_name == "Y", 24, chromosome_name)
),
chromosome_name = as.integer(chromosome_name)
) %>%
arrange(chromosome_name, start_position) %>%
dplyr::select(c("external_gene_name", "chromosome_name", "start_position","end_position")) %>%
`names<-`(c("GeneSymbol","Chr","Start","End"))
genes_GR <- makeGRangesFromDataFrame(genes,keep.extra.columns = TRUE)
save(genes_GR, genes, file = "~/Downloads/genes_GR.rda", compress = "xz")
Annotate CNV regions to genes
# Select the required column
sCNV <- dplyr::select(RecCNV, c(1:4, 6)) %>%
filter(`q-value` <= threshold) %>%
arrange(1, 3) %>%
`names<-`(c("Chr","Aberration","Start","End","q-value"))
# Convert to GenomicRanges format
sCNV_GR <- makeGRangesFromDataFrame(sCNV, keep.extra.columns = TRUE)
# Find overlapping intervals
hits <- findOverlaps(genes_GR, sCNV_GR, type = "within")
sCNV_ann <- cbind(sCNV[subjectHits(hits),], genes[queryHits(hits),])
library(glue)
# Format output content
AmpDel_genes <- as_tibble(sCNV_ann, .name_repair = "universal") %>%
mutate(
AberrantRegion = glue("{Chr...1}:{Start...3}-{End...4}"),
GeneRegion = glue("{Chr...7}:{Start...8}-{End...9}")
) %>%
dplyr::select(c(6, 2, 5, 10, 11)) %>%
mutate(Aberration = if_else( Aberration == 0, "Del", "Amp"))
save(RecCNV, AmpDel_genes, file = paste0("~/Downloads/", cancer, "_CNV_results.rda"))
The output is as follows
> AmpDel_genes
# A tibble: 317 x 5
GeneSymbol Aberration q.value AberrantRegion GeneRegion
<chr> <chr> <dbl> <glue> <glue>
1 PIK3C2B Amp 0.125 1:204382589-204792596 1:204391756-204463852
2 MDM4 Amp 0.125 1:204382589-204792596 1:204485511-204542871
3 RP11-430C7.2 Amp 0.125 1:204382589-204792596 1:204497973-204498820
4 RNA5SP74 Amp 0.125 1:204382589-204792596 1:204531541-204531649
5 RP11-430C7.4 Amp 0.125 1:204382589-204792596 1:204572163-204585693
6 LRRN2 Amp 0.125 1:204382589-204792596 1:204586298-204654861
7 RP11-430C7.5 Amp 0.125 1:204382589-204792596 1:204595903-204598840
8 AL161793.1 Amp 0.125 1:204382589-204792596 1:204622230-204622314
9 RP11-23I7.1 Amp 0.125 1:204382589-204792596 1:204633000-204633741
10 RNA5SP75 Amp 0.125 1:204382589-204792596 1:204676448-204676558
# … with 307 more rows
Below we show how to plot genome mutation data. First, we use the OncoPrint
function of the complexHeatmap
package introduced earlier.
GDCquery_maf
function is used to download mutation data
LGGmut <- GDCquery_Maf(tumor = "LGG", pipelines = "mutect2")
GBMmut <- GDCquery_Maf(tumor = "GBM", pipelines = "mutect2")
mut <- LGGmut %>% add_row(GBMmut)
save(mut, file ="~/Downloads/mafMutect2LGGGBM.rda")
And extract gene information in glioma-related pathways
# Extract the mutation information of genes in the glioma pathway
Glioma_signaling <- as_tibble(TCGAbiolinks:::listEA_pathways) %>%
filter(grepl("glioma", Pathway, ignore.case = TRUE)) %>%
subset(Pathway == "Glioma Signaling")
# Take only the mutation information of 10 genes
Glioma_signaling_genes <- unlist(str_split(Glioma_signaling$Molecules, ","))[1:10]
mut <- mut[mut$Hugo_Symbol %in% Glioma_signaling_genes,]
# Get the corresponding clinical information
gbm_clin <- GDCquery_clinic(project = "TCGA-GBM", type = "Clinical")
lgg_clin <- GDCquery_clinic(project = "TCGA-LGG", type = "Clinical")
clinical <- gbm_clin %>% add_row(lgg_clin)
plot diagrams
TCGAvisualize_oncoprint(
mut = mut,
gene = Glioma_signaling_genes,
annotation = clinical[, c("bcr_patient_barcode", "disease", "vital_status", "ethnicity")],
annotation.position = "bottom",
color=c("background"="#CCCCCC", "DEL"="#fb8072", "INS"="#80b1d3", "SNP"="#fdb462"),
label.font.size = 16,
rm.empty.columns = FALSE
)
Genome mutation information can also be displayed using circos graphics, we use the circlize package introduced earlier to display CNV and mutation information
The CNV data is the result of GAIA analysis, and the mutation data uses the result of the MAF file of the LGG sample obtained from the GDC database
First, deal with the mutation data
LGGmut <- GDCquery_Maf(tumor = "LGG", pipelines = "mutect2")
# Extract the required mutation type
mut_type <- c(
"Missense_Mutation",
"Nonsense_Mutation",
"Nonstop_Mutation",
"Frame_Shift_Del",
"Frame_Shift_Ins"
)
mut <- filter(LGGmut, Variant_Classification %in% mut_type) %>%
mutate(
mut.id = glue("{Chromosome}:{Start_Position}-{End_Position}|{Reference_Allele}"),
.before = Hugo_Symbol
) %>%
# Only consider genes that are mutated in at least two samples
filter(duplicated(mut.id)) %>%
dplyr::select(c("Chromosome","Start_Position","End_Position","Variant_Classification","Hugo_Symbol")) %>%
distinct_all() %>% {
# Use typeNames as a global variable and use it later
typeNames <<- unique(.$Variant_Classification)
type = structure(1:length(typeNames), names = typeNames)
mutate(., type = type[Variant_Classification])
} %>%
dplyr::select(c(1:3,6,4,5))
CNV data processing
load("~/Downloads/GBM_CNV_results.rda")
cnv <- as_tibble(RecCNV) %>%
# The value considers the area less than the threshold, threshold = 0.3
filter(`q-value` <= threshold) %>%
select(c(1, 3, 4, 2)) %>%
# Convert the chromosome name to a string starting with chr
mutate(
Chromosome = ifelse(
Chromosome == 23, "X", ifelse(Chromosome == 24, "Y", Chromosome)
),
Chromosome = paste0("chr", Chromosome)
) %>%
mutate(CNV = 1) %>%
`names<-`(c("Chromosome","Start_position","End_position","Aberration_Kind","CNV"))
plot circos diagram
library(circlize)
par(mar=c(1,1,1,1), cex=1)
circos.initializeWithIdeogram()
# Add CNV result
colors <- c("forestgreen","firebrick")
circos.genomicTrackPlotRegion(
cnv, ylim = c(0, 1.2),
panel.fun = function(region, value, ...) {
circos.genomicRect(
region, value, ytop.column = 2,
ybottom = 0, col = colors[value[[1]] + 1],
border = "white"
)
cell.xlim = get.cell.meta.data("cell.xlim")
circos.lines(cell.xlim, c(0, 0), lty = 2, col = "#00000040")
}
)
# Add mutation result
colors <- structure(c("blue","green","red", "gold"), names = typeNames[1:4])
circos.genomicTrackPlotRegion(
mut, ylim = c(1.2, 4.2),
panel.fun = function(region, value, ...) {
circos.genomicPoints(
region, value, cex = 0.8, pch = 16,
col = colors[value[[2]]], ...)
}
)
circos.clear()
# Add legend
legend(
-0.2, 0.2, bty = "n", y.intersp = 1,
c("Amp", "Del"), pch = 15,
col = c("firebrick", "forestgreen"),
title = "CNVs", text.font = 1,
cex = 0.8, title.adj = 0
)
legend(
-0.2, 0, bty = "n", y.intersp = 1,
names(colors), pch = 20, col = colors,
title = "Mutations", text.font = 1,
cex = 0.8, title.adj = 0
)
We can limit the scope of the genome to a certain chromosome to better view mutations and copy number variations on that chromosome.
You can use the following graphics to show
par(mar=c(.1, .1, .1, .1),cex=1.5)
circos.par(
"start.degree" = 90, canvas.xlim = c(0, 1),
canvas.ylim = c(0, 1), gap.degree = 270,
cell.padding = c(0, 0, 0, 0),
track.margin = c(0.005, 0.005)
)
circos.initializeWithIdeogram(chromosome.index = "chr17")
circos.par(cell.padding = c(0, 0, 0, 0))
# Add CNV result
colors <- c("forestgreen","firebrick")
names(colors) <- c(0,1)
circos.genomicTrackPlotRegion(
cnv, ylim = c(0, 1.2),
panel.fun = function(region, value, ...) {
circos.genomicRect(
region, value, ytop.column = 2,
ybottom = 0, col = colors[value[[1]]],
border = "white"
)
cell.xlim = get.cell.meta.data("cell.xlim")
circos.lines(cell.xlim, c(0, 0), lty = 2, col = "#00000040")
}
)
# Add single gene mutation results
gene.mut <- mutate(mut, genes.mut = glue("{Hugo_Symbol}-{type}")) %>%
filter(!duplicated(genes.mut)) %>% {
n.mut <- table(.$genes.mut)
mutate(., num = n.mut[.$genes.mut])
} %>%
mutate(
genes.num = as.character(glue("{Hugo_Symbol}({num})")),
type = type / 2
) %>%
dplyr::select(-(6:8))
colors <- c("blue","green","red","gold")
names(colors) <- typeNames[1:4]
circos.genomicTrackPlotRegion(
gene.mut, ylim = c(0.3, 2.2), track.height = 0.05,
panel.fun = function(region, value, ...) {
circos.genomicPoints(
region, value, cex = 0.4, pch = 16,
col = colors[value[[2]]], ...
)
}
)
circos.genomicTrackPlotRegion(
gene.mut, ylim = c(0, 1), track.height = 0.1, bg.border = NA
)
i_track = get.cell.meta.data("track.index")
circos.genomicTrackPlotRegion(
gene.mut, ylim = c(0, 1),
panel.fun = function(region, value, ...) {
circos.genomicText(
region, value, y = 1, labels.column = 3,
col = colors[value[[2]]], facing = "clockwise",
adj = c(1, 0.5), posTransform = posTransform.text,
cex = 0.4, niceFacing = TRUE
)
},
track.height = 0.1,
bg.border = NA
)
circos.genomicPosTransformLines(
gene.mut,
posTransform = function(region, value)
posTransform.text(
region, y = 1, labels = value[[3]],
cex = 0.4, track.index = i_track + 1
),
direction = "inside",
track.index = i_track
)
circos.clear()
legend(
0, 0.38, bty = "n", y.intersp = 1, c("Amp", "Del"),
pch = 15, col = c("firebrick", "forestgreen"),
title = "CNVs", text.font = 1, cex = 0.7, title.adj = 0
)
legend(
0, 0.2, bty = "n", y.intersp = 1, names(colors),
pch = 16, col = colors, title = "Mutations",
text.font = 1, cex = 0.7, title.adj = 0
)