TCGA data analysis combat-CNV and mutation

created at 07-14-2021 views: 10

Preface

After introducing the query download and data analysis functions of TCGAbiolinks, we briefly show a few examples to practice hands and deepen the understanding and use of this package

We mainly start from:

  • Genome
  • Transcriptome
  • Apparent group

The 3 dimensions are illustrated with examples

Genome analysis

Copy number variation (CNV) plays an important role in the occurrence and development of cancer. Due to genome rearrangement (such as chromosome deletion, duplication, insertion and translocation), resulting in the amplification or censorship of chromosome fragments.

CNV is the amplification and censorship of copy number of genomic regions larger than 1kb.

TCGA's data can be divided into three levels:

  • level 1: original sequencing data (fasta, fastq, etc.)
  • level 2: compared bam files
  • level 3: standardized data

We need to obtain level 3

1. Data preprocessing

We analyze the data of the Affymetrix Genome-Wide Human SNP Array 6.0 platform in the legacy database

Obtain CNV data of 20 GBM and LGG samples data for analysis to identify cancer genome mutations

query.gbm.nocnv <- GDCquery(
  project = "TCGA-GBM",
  data.category = "Copy number variation",
  legacy = TRUE,
  file.type = "nocnv_hg19.seg",
  sample.type = c("Primary Tumor")
)
# Only select 20 samples
query.gbm.nocnv$results[[1]] <- query.gbm.nocnv$results[[1]][1:20,]

GDCdownload(query.gbm.nocnv)

cnvMatrix <- GDCprepare(query.gbm.nocnv)
> head(cnvMatrix)
# A tibble: 6 x 6
  Sample                       Chromosome     Start       End Num_Probes Segment_Mean
  <chr>                        <chr>          <dbl>     <dbl>      <dbl>        <dbl>
1 TCGA-28-2513-01A-01D-0784-01 1            3218610  74709711      40434      -0.0175
2 TCGA-28-2513-01A-01D-0784-01 1           74709724  74715340          3      -1.21  
3 TCGA-28-2513-01A-01D-0784-01 1           74715466 233232858      79323       0.0051
4 TCGA-28-2513-01A-01D-0784-01 1          233233557 233234188          2      -1.56  
5 TCGA-28-2513-01A-01D-0784-01 1          233234531 247813706       9387       0.0168
6 TCGA-28-2513-01A-01D-0784-01 2             484222  20397902      12886       0.0123

2. Identify recurrent CNV

Cancer-related CNVs will appear repeatedly in many samples. We use the GAIA package to identify this remarkable recurrent CNV

The GAIA algorithm is based on the statistical test of random arrangement, while considering the significance and homogeneity within the sample, iteratively deletes the peaks in the interval until the remaining peaks are not significant to identify the significant CNV

This package requires not only the observation data of the input sample, but also the metadata of the genomic probe, which we can download from the broadinstitute website

gdac.root <- "ftp://ftp.broadinstitute.org/pub/GISTIC2.0/hg19_support/"
file <- paste0(gdac.root, "genome.info.6.0_hg19.na31_minus_frequent_nan_probes_sorted_2.1.txt")
if(!file.exists(basename(file))) 
  downloader::download(file, file.path("~/Downloads", basename(file)))

If the download is wrong, you can manually copy the link to the browser to download

markersMatrix <-  readr::read_tsv(
  file.path("~/Downloads", basename(file)), 
  col_names = FALSE, col_types = "ccn", 
  progress = FALSE
)
> head(markersMatrix)
# A tibble: 6 x 3
  X1        X2       X3
  <chr>     <chr> <dbl>
1 CN_473963 1     61735
2 CN_473964 1     61808
3 CN_473965 1     61823
4 CN_477984 1     62152
5 CN_473981 1     62920
6 CN_473982 1     62937

Process CNV data matrix

cnvMatrix %<>%
  filter(abs(Segment_Mean) > 0.3) %>%
  mutate(label = if_else(Segment_Mean < -0.3, 0, 1)) %>%
  dplyr::select(-Segment_Mean) %>%
  `names<-`(c("Sample.Name", "Chromosome", "Start", "End", "Num.of.Markers", "Aberration")) %>%
  mutate(Chromosome = ifelse(Chromosome == "X", 23, Chromosome),
         Chromosome = ifelse(Chromosome == "Y", 24, Chromosome),
         Chromosome = as.integer(Chromosome)) %>%
  # Be careful to convert to data.frame format
  as.data.frame()

Download some common CNV area data, you need to filter out these areas

file <- "ftp://ftp.broadinstitute.org/pub/GISTIC2.0/hg19_support/CNV.hg19.bypos.111213.txt"
if(!file.exists(basename(file))) 
  downloader::download(file, file.path("~/Downloads", basename(file)))

commonCNV <- readr::read_tsv(
  file.path("~/Downloads", basename(file)), 
  progress = FALSE
  ) %>%
  mutate(markerID = paste(Chromosome, Start, sep = ":"))

Process probe data

markersMatrix_filtered <- markersMatrix %>%
  `names<-`(c("Probe.Name", "Chromosome", "Start")) %>%
  mutate(Chromosome = ifelse(Chromosome == "X", 23, Chromosome),
         Chromosome = ifelse(Chromosome == "Y", 24, Chromosome),
         Chromosome = as.integer(Chromosome)) %>%
  mutate(markerID = paste(Chromosome, Start, sep = ":")) %>%
  filter(!duplicated(markerID)) %>%
  # Filter some common CNVs
  filter(!markerID %in% commonCNV$markerID)

Run gaia, identify recurrent CNV

library(gaia)

set.seed(200)
markers_obj <- load_markers(as.data.frame(markersMatrix_filtered))
nbsamples <- length(unique(cnvMatrix$Sample.Name))
cnv_obj <- load_cnv(cnvMatrix, markers_obj, nbsamples)
suppressWarnings({
  cancer <- "GBM"
  results <- runGAIA(
    cnv_obj, markers_obj,
    output_file_name = paste0("~/Downloads/GAIA_", cancer, "_flt.txt"),
    # Specify the type of variation to be analyzed, -1 to analyze all the variations
    aberrations = -1,
    # Specify the chromosomes to be analyzed, the default is -1, which means all chromosomes
    chromosomes = -1,
    # Using approximate methods can speed up the calculation
    approximation = TRUE,
    # Set the number of iterations
    num_iterations = 5000,
    threshold = 0.25
  )
})

plot result

RecCNV <- as_tibble(results) %>%
  mutate_at(vars("q-value"), as.numeric) %>%
  mutate_at(vars(-"q-value"), as.integer) %>% {
    # If q-value is 0, set its value to the smallest non-zero value in the data
    minval = min(.$`q-value`[which(.$`q-value` != 0)])
    mutate(., `q-value` = ifelse(`q-value` == 0, minval, `q-value`))
  } %>%
  mutate(score = -log10(`q-value`)) %>%
  as.data.frame()

threshold <- 0.3
gaiaCNVplot(RecCNV,threshold)
save(results, RecCNV, threshold, file = paste0("~/Downloads/", cancer,"_CNV_results.rda"))

result plot

3. Recurrent CNV gene annotation

After using GAIA to identify the recurring genomic region variation in cancer, we need to annotate it to the corresponding gene.

We use biomaRt to obtain the genomic range information of all human genes, and then map the significantly mutated genomic regions to the corresponding genes

library(GenomicRanges)

# Use biomart to get the gene information in the GENCODE database
genes <- TCGAbiolinks:::get.GRCh.bioMart(genome = "hg19") %>%
  filter(external_gene_name != "" & chromosome_name %in% c(1:22, "X", "Y")) %>%
  mutate(
    chromosome_name = ifelse(
      chromosome_name == "X", 23, ifelse(chromosome_name == "Y", 24, chromosome_name)
    ),
    chromosome_name = as.integer(chromosome_name)
  ) %>%
  arrange(chromosome_name, start_position) %>%
  dplyr::select(c("external_gene_name", "chromosome_name", "start_position","end_position")) %>%
  `names<-`(c("GeneSymbol","Chr","Start","End"))

genes_GR <- makeGRangesFromDataFrame(genes,keep.extra.columns = TRUE)
save(genes_GR, genes, file = "~/Downloads/genes_GR.rda", compress = "xz")

Annotate CNV regions to genes

# Select the required column
sCNV <- dplyr::select(RecCNV, c(1:4, 6)) %>%
  filter(`q-value` <= threshold) %>%
  arrange(1, 3) %>%
  `names<-`(c("Chr","Aberration","Start","End","q-value"))
# Convert to GenomicRanges format
sCNV_GR <- makeGRangesFromDataFrame(sCNV, keep.extra.columns = TRUE)
# Find overlapping intervals
hits <- findOverlaps(genes_GR, sCNV_GR, type = "within")
sCNV_ann <- cbind(sCNV[subjectHits(hits),], genes[queryHits(hits),])

library(glue)
# Format output content
AmpDel_genes <- as_tibble(sCNV_ann, .name_repair = "universal") %>%
  mutate(
    AberrantRegion = glue("{Chr...1}:{Start...3}-{End...4}"),
    GeneRegion = glue("{Chr...7}:{Start...8}-{End...9}")
  ) %>%
  dplyr::select(c(6, 2, 5, 10, 11)) %>%
  mutate(Aberration = if_else( Aberration == 0, "Del", "Amp"))

save(RecCNV, AmpDel_genes, file = paste0("~/Downloads/", cancer, "_CNV_results.rda"))

The output is as follows

> AmpDel_genes
# A tibble: 317 x 5
   GeneSymbol   Aberration q.value AberrantRegion        GeneRegion           
   <chr>        <chr>        <dbl> <glue>                <glue>               
 1 PIK3C2B      Amp          0.125 1:204382589-204792596 1:204391756-204463852
 2 MDM4         Amp          0.125 1:204382589-204792596 1:204485511-204542871
 3 RP11-430C7.2 Amp          0.125 1:204382589-204792596 1:204497973-204498820
 4 RNA5SP74     Amp          0.125 1:204382589-204792596 1:204531541-204531649
 5 RP11-430C7.4 Amp          0.125 1:204382589-204792596 1:204572163-204585693
 6 LRRN2        Amp          0.125 1:204382589-204792596 1:204586298-204654861
 7 RP11-430C7.5 Amp          0.125 1:204382589-204792596 1:204595903-204598840
 8 AL161793.1   Amp          0.125 1:204382589-204792596 1:204622230-204622314
 9 RP11-23I7.1  Amp          0.125 1:204382589-204792596 1:204633000-204633741
10 RNA5SP75     Amp          0.125 1:204382589-204792596 1:204676448-204676558
# … with 307 more rows

4. Genomic variation visualization

4.1 OncoPrint

Below we show how to plot genome mutation data. First, we use the OncoPrint function of the complexHeatmap package introduced earlier.

GDCquery_maf function is used to download mutation data

LGGmut <- GDCquery_Maf(tumor = "LGG", pipelines = "mutect2")
GBMmut <- GDCquery_Maf(tumor = "GBM", pipelines = "mutect2")
mut <- LGGmut %>% add_row(GBMmut)
save(mut, file ="~/Downloads/mafMutect2LGGGBM.rda")

And extract gene information in glioma-related pathways

# Extract the mutation information of genes in the glioma pathway
Glioma_signaling <- as_tibble(TCGAbiolinks:::listEA_pathways) %>%
  filter(grepl("glioma", Pathway, ignore.case = TRUE)) %>%
  subset(Pathway == "Glioma Signaling")
# Take only the mutation information of 10 genes
Glioma_signaling_genes <- unlist(str_split(Glioma_signaling$Molecules, ","))[1:10]
mut <- mut[mut$Hugo_Symbol %in% Glioma_signaling_genes,]
# Get the corresponding clinical information
gbm_clin <- GDCquery_clinic(project = "TCGA-GBM", type = "Clinical")
lgg_clin <- GDCquery_clinic(project = "TCGA-LGG", type = "Clinical")
clinical <- gbm_clin %>% add_row(lgg_clin)

plot diagrams

TCGAvisualize_oncoprint(
  mut = mut,
  gene = Glioma_signaling_genes,
  annotation = clinical[, c("bcr_patient_barcode", "disease", "vital_status", "ethnicity")],
  annotation.position = "bottom",
  color=c("background"="#CCCCCC", "DEL"="#fb8072", "INS"="#80b1d3", "SNP"="#fdb462"),
  label.font.size = 16,
  rm.empty.columns = FALSE
)

OncoPrint

4.2 circos plot

Genome mutation information can also be displayed using circos graphics, we use the circlize package introduced earlier to display CNV and mutation information

The CNV data is the result of GAIA analysis, and the mutation data uses the result of the MAF file of the LGG sample obtained from the GDC database

First, deal with the mutation data

LGGmut <- GDCquery_Maf(tumor = "LGG", pipelines = "mutect2")
# Extract the required mutation type
mut_type <- c(
  "Missense_Mutation",
  "Nonsense_Mutation",
  "Nonstop_Mutation",
  "Frame_Shift_Del",
  "Frame_Shift_Ins"
)
mut <- filter(LGGmut, Variant_Classification %in% mut_type) %>%
  mutate(
    mut.id = glue("{Chromosome}:{Start_Position}-{End_Position}|{Reference_Allele}"),
    .before = Hugo_Symbol
  ) %>%
 # Only consider genes that are mutated in at least two samples
  filter(duplicated(mut.id)) %>%
  dplyr::select(c("Chromosome","Start_Position","End_Position","Variant_Classification","Hugo_Symbol")) %>%
  distinct_all() %>% {
   # Use typeNames as a global variable and use it later
    typeNames <<- unique(.$Variant_Classification)
    type = structure(1:length(typeNames), names = typeNames)
    mutate(., type = type[Variant_Classification])
  } %>%
  dplyr::select(c(1:3,6,4,5))

CNV data processing

load("~/Downloads/GBM_CNV_results.rda")

cnv <- as_tibble(RecCNV) %>%
  # The value considers the area less than the threshold, threshold = 0.3
  filter(`q-value` <= threshold) %>%
  select(c(1, 3, 4, 2)) %>%
  # Convert the chromosome name to a string starting with chr
  mutate(
    Chromosome = ifelse(
      Chromosome == 23, "X", ifelse(Chromosome == 24, "Y", Chromosome)
    ),
    Chromosome = paste0("chr", Chromosome)
  ) %>%
  mutate(CNV = 1) %>%
  `names<-`(c("Chromosome","Start_position","End_position","Aberration_Kind","CNV"))

plot circos diagram

library(circlize)

par(mar=c(1,1,1,1), cex=1)
circos.initializeWithIdeogram()
# Add CNV result
colors <- c("forestgreen","firebrick")
circos.genomicTrackPlotRegion(
  cnv, ylim = c(0, 1.2),
  panel.fun = function(region, value, ...) {
    circos.genomicRect(
      region, value, ytop.column = 2,
      ybottom = 0, col = colors[value[[1]] + 1],
      border = "white"
    )
    cell.xlim = get.cell.meta.data("cell.xlim")
    circos.lines(cell.xlim, c(0, 0), lty = 2, col = "#00000040")
  }
)
# Add mutation result
colors <- structure(c("blue","green","red", "gold"), names = typeNames[1:4])
circos.genomicTrackPlotRegion(
  mut, ylim = c(1.2, 4.2),
  panel.fun = function(region, value, ...) {
    circos.genomicPoints(
      region, value, cex = 0.8, pch = 16, 
      col = colors[value[[2]]], ...)
  }
)
circos.clear()

# Add legend
legend(
  -0.2, 0.2, bty = "n", y.intersp = 1,
  c("Amp", "Del"), pch = 15,
  col = c("firebrick", "forestgreen"),
  title = "CNVs", text.font = 1,
  cex = 0.8, title.adj = 0
)
legend(
  -0.2, 0, bty = "n", y.intersp = 1,
  names(colors), pch = 20, col = colors,
  title = "Mutations", text.font = 1,
  cex = 0.8, title.adj = 0
)

circos diagram

4.3 Partial area visualization

We can limit the scope of the genome to a certain chromosome to better view mutations and copy number variations on that chromosome.

You can use the following graphics to show

par(mar=c(.1, .1, .1, .1),cex=1.5)
circos.par(
  "start.degree" = 90, canvas.xlim = c(0, 1),
  canvas.ylim = c(0, 1), gap.degree = 270,
  cell.padding = c(0, 0, 0, 0), 
  track.margin = c(0.005, 0.005)
)
circos.initializeWithIdeogram(chromosome.index = "chr17")
circos.par(cell.padding = c(0, 0, 0, 0))
# Add CNV result
colors <- c("forestgreen","firebrick")
names(colors)  <- c(0,1)
circos.genomicTrackPlotRegion(
  cnv, ylim = c(0, 1.2),
  panel.fun = function(region, value, ...) {
    circos.genomicRect(
      region, value, ytop.column = 2,
      ybottom = 0, col = colors[value[[1]]],
      border = "white"
    )
    cell.xlim = get.cell.meta.data("cell.xlim")
    circos.lines(cell.xlim, c(0, 0), lty = 2, col = "#00000040")
  }
)

# Add single gene mutation results
gene.mut <- mutate(mut, genes.mut = glue("{Hugo_Symbol}-{type}")) %>%
  filter(!duplicated(genes.mut)) %>% {
    n.mut <- table(.$genes.mut)
    mutate(., num = n.mut[.$genes.mut])
  } %>%
  mutate(
    genes.num = as.character(glue("{Hugo_Symbol}({num})")),
    type = type / 2
  ) %>%
  dplyr::select(-(6:8))

colors <- c("blue","green","red","gold")
names(colors)  <- typeNames[1:4]
circos.genomicTrackPlotRegion(
  gene.mut, ylim = c(0.3, 2.2), track.height = 0.05,
  panel.fun = function(region, value, ...) {
    circos.genomicPoints(
      region, value, cex = 0.4, pch = 16,
      col = colors[value[[2]]], ...
    )
  }
)

circos.genomicTrackPlotRegion(
  gene.mut, ylim = c(0, 1), track.height = 0.1, bg.border = NA
)
i_track = get.cell.meta.data("track.index")

circos.genomicTrackPlotRegion(
  gene.mut, ylim = c(0, 1),
  panel.fun = function(region, value, ...) {
    circos.genomicText(
      region, value, y = 1, labels.column = 3,
      col = colors[value[[2]]], facing = "clockwise",
      adj = c(1, 0.5), posTransform = posTransform.text,
      cex = 0.4, niceFacing = TRUE
    )
  },
  track.height = 0.1,
  bg.border = NA
)

circos.genomicPosTransformLines(
  gene.mut,
  posTransform = function(region, value)
    posTransform.text( 
      region,  y = 1, labels = value[[3]],
      cex = 0.4, track.index = i_track + 1
    ),
  direction = "inside",
  track.index = i_track
)

circos.clear()

legend(
  0, 0.38, bty = "n", y.intersp = 1, c("Amp", "Del"),
  pch = 15, col = c("firebrick", "forestgreen"),
  title = "CNVs", text.font = 1, cex = 0.7, title.adj = 0
)
legend(
  0, 0.2, bty = "n", y.intersp = 1, names(colors),
  pch = 16, col = colors, title = "Mutations",
  text.font = 1, cex = 0.7, title.adj = 0
)

final result

created at:07-14-2021
edited at: 07-14-2021: