library(Gviz)
library(GenomicRanges)
library(GenomicFeatures)
library(rtracklayer)
library(dplyr)
library(data.table)
source("~/projects/funcFinemapping/code/run_susie.R")
annoDIR <- "~/cluster/data/features/raw/"
build_grtrack <- function(curr.locus.gr,
genome="hg19",
start.pos = start,
end.pos = end
){
## match gene ids in TxDb with gene symbols if available
## Otherwise, return transcript symbol
ga.track <- GenomeAxisTrack()
if(genome == "hg19"){
#txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene
txdb <- loadDb("~/resources/gencode/gencode_v43lift37.txdb")
}
if(genome == "hg38"){
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene::TxDb.Hsapiens.UCSC.hg38.knownGene
}
# gene track
grtrack <- GeneRegionTrack(range = txdb,
genome = genome,
chromosome = as.character(seqnames(curr.locus.gr)),
name = "Genes",
collapse = TRUE
)
# subsetting the grtrack to keep all transcripts regardless of whether they have gene annotations
grtrack <- subset(grtrack,
from = start.pos,
to = end.pos)
# match gene IDs with gene symbols
symbolList <- mapIds(org.Hs.eg.db::org.Hs.eg.db,
keys=sub("\\.\\d+$", "",
sub("\\_\\d+$", "",
ranges(grtrack)$gene)),
keytype="ENSEMBL", column="SYMBOL")
# return transcript symbol if no gene symbol available
symbol(grtrack) <- unlist(lapply(1:length(symbolList), function(i){
if(is.na(symbolList[[i]])){
return(symbol(grtrack)[i])
}else{
return(symbolList[[i]])
}
}))
return(grtrack)
}
make_track_plots <- function(dsRNA.locus,
extend.size=0,
reverse.strand = FALSE,
to.highligh = TRUE,
collapse.type = "meta"
){
# build GRanges for each track
m6a_peaks<-GRanges(seqnames = unique(dsRNA.locus$chr),
IRanges(start = unique(dsRNA.locus$start),
end = unique(dsRNA.locus$end)),
strand = unique(dsRNA.locus$strand)
)
gene_query <- range(c(
GRanges(dsRNA.locus$DS_query_gene_gr),
GRanges(dsRNA.locus$DS_ref_gene_gr)
), ignore.strand=TRUE)
#print("Here is the targeted gene region:\n")
#print(gene_query)
dsRNA <- unique(GRanges(dsRNA.locus %>%
select(c(DS_chr, DS_start, DS_end))
))
## annotation tracks
m6a_track <- AnnotationTrack(m6a_peaks, genome = "hg19", name = "m6A")
dsRNA_track <- AnnotationTrack(dsRNA, genome = "hg19", name = "dsRNA", stacking = "dense")
ed_track <- AnnotationTrack(
ed_gr[seqnames(ed_gr) == seqnames(m6a_track), ],
genome = "hg19",
name = "Geuvadis LCLs",
stacking = "dense"
)
RADAR_track <- AnnotationTrack(
ed_gr[seqnames(ed_gr) == seqnames(m6a_track) &
ed_gr$RADAR == 1, ],
genome = "hg19",
name = "RADAR",
stacking = "dense"
)
ed_atlas_subset<- ed_atlas_gr[seqnames(ed_atlas_gr) == as.character(seqnames(gene_query)) &
start(ed_atlas_gr) >= start(gene_query) &
start(ed_atlas_gr) < end(gene_query)]
ED_atlas_track <- AnnotationTrack(
ed_atlas_subset,
genome = "hg19",
name = "ED Atlas",
stacking = "dense",
group = ed_atlas_subset$type
)
feature(ED_atlas_track)<-ed_atlas_subset$type
GTEx_Ed_track <- AnnotationTrack(
ed_gtex_gr[seqnames(ed_gtex_gr) == seqnames(m6a_track), ],
genome = "hg19",
name = "GTEx LCLs",
stacking = "dense"
)
## genomic coordinates
gtrack <- GenomeAxisTrack()
## gene tracks
grtrack<-build_grtrack(curr.locus.gr = gene_query,
start = start(gene_query),
end = end(gene_query),
genome = "hg19")
# highlight a particular SNP
trackList <-list(
gtrack,grtrack,
m6a_track,
dsRNA_track,
ed_track,
RADAR_track,
ED_atlas_track,
GTEx_Ed_track
)
if(to.highligh){
ht1 <- HighlightTrack(trackList,
start = unique(dsRNA.locus$physical.pos), width = 1,
chromosome = as.character(seqnames(gene_query)),
col = 'pink')
dist <- unique(dsRNA.locus$physical.pos) - end(gene_query)
if(dist > extend.size){
print(sprintf("Extend the window size to be longer than %d.",
dist))
}
}else{
ht1 <- trackList
}
plotTracks(ht1,
transcriptAnnotation = 'symbol',
from = start(gene_query) - extend.size,
to = end(gene_query) + extend.size,
collapseTranscripts = collapse.type,
just.group = "right",
groupAnnotation="type", ALU="darkgreen", REP="blue", NONREP="darkred",
reverseStrand = reverse.strand
)
}
RNA editing sites
Editing sites from Geuvadis LCLs:
Mai et al.
integrated genome and transcriptome squencing data from Geuvadis LCLs of
447 individuals (derived from 1000 Genomes Project) to detect A-to-I
editing sites. For each A/G SNP site, individuals with the homozygous
genotype AA were considered to determine RNA editing. An A/G SNP site
was further defined as a SNP editing site if it was found to be edited
at a level >5% in at least two LCL samples from individuals with
homozygous genotype AA. This leads to the detection of more than 80K
edited sites.
RADAR database:
Ramaswami
et al. built a rigorous database for RNA A-to-I editing sites
detected in humans. This database not only includes extensive manually
curated annotations for each editing site, but also tissue-specific
editing levels for each editing site.
Editing sites from GTEx LCLs:
Li et
al. detected editing sites using the GTEx gene expression data. The
list of edited sites for LCLs were obtained from GTEx portal (GTEx
Analysis V8 release).
REDIportal V2.0: Picardi et al. produced an RNA editing atlas in humans across 55 human body sites using GTEx RNAseq experiments as well as their own experiments.
## GTEx dataset
# ed_sites <- read.csv(paste0(annoDIR, "RNA_editing/Tan2017/GTEx_avg_RNA_editing_levels_by_tissue.csv"))
#
# ed_sites[, c("chr", "pos")]<-t(sapply(ed_sites$sites,
# function(i){strsplit(i, "_")[[1]]}
# ))
## Mai2019 +400 LCLs
ed_sites <- readxl::read_excel(paste0(annoDIR,
"RNA_editing/Mai2019/RNA_editing_sites_LCLs.xlsx"),
skip = 2)
## Picardi2017: RNA editing atlas
ed_atlas <- fread("../data/TABLE1_hg19.txt")
## Warning in fread("../data/TABLE1_hg19.txt"): Detected 25 column names but the
## data has 22 columns. Filling rows automatically. Set fill=TRUE explicitly to
## avoid this warning.
ed_atlas_gr <- GRanges(seqnames = ed_atlas$Region,
IRanges(start = ed_atlas$Position,
end = ed_atlas$Position),
type = ed_atlas$type)
## Li2022 Edsites for GTEx
ed_gtex <- read.table(paste0(annoDIR,
"RNA_editing/Li2022/Cells-EBV-transformed-lymphocytes.edsite.txt.gz"),
header = T)
ed_gtex_gr_hg38 <- GRanges(seqnames = ed_gtex$chr,
IRanges(start = ed_gtex$variant_pos,
end = ed_gtex$variant_pos),
snpID = ed_gtex$rs_id_dbSNP151_GRCh38p7
)
## Li2022 dsRNAs:
ds_data <- readxl::read_excel("../data/Li2022_dsRNAs.xlsx", skip=10)
ds_rnas <- unlist(lapply(unique(ds_data$`cis-NATs`), function(i){strsplit(i, ",")[[1]]}))
m6a_twas <- read.csv("../data/m6A_TWAS_results.csv")
m6a_immuno_dsRNA <- ds_data[ds_data$`cis-NATs` %in% intersect(m6a_twas$GENE, ds_rnas) &
ds_data$tissue == "Cells-EBV-transformed-lymphocytes", ]
snp_info <- data.frame(t(
sapply(
m6a_immuno_dsRNA$ref_SNP, function(i){
items <- strsplit(i, "_")[[1]]
return(c(paste0("chr", items[1]), items[2]))
}
)
), row.names = NULL)
colnames(snp_info) <- c("chr", "pos")
m6a_immuno_dsRNA <- data.frame(snp_info, m6a_immuno_dsRNA)
targets_gr_hg38 <- GRanges(seqnames = m6a_immuno_dsRNA$chr,
IRanges(start = m6a_immuno_dsRNA$pos),
cisNATs = m6a_immuno_dsRNA$cis.NATs,
traits = m6a_immuno_dsRNA$GWAS_trait,
pp4 = m6a_immuno_dsRNA$clpp_h4,
)
## liftover to hg19
chain.file.path <- "~/resources/genomes/chain_files/hg38ToHg19.over.chain"
c<-import.chain(chain.file.path)
targets_gr <- unlist(liftOver(targets_gr_hg38, c))
The features on the plot from top to bottom are genes, m6a peaks, dsRNA regions, RNA editing sites detected in >400 LCLs and the subset of editing sites that were also found in RADAR database. The red line highlights the position for the top m6A QTL.
rs3204541 - DDX55
DDX55 has the molecular function of RNA Binding (GO:0003723). DDX55 is found in Nuclear Lumen (GO:0031981).
DDX55 is found in Nucleolus (GO:0005730).
DDX55 is upregulated in 721 B lymphoblasts cells.
EIF2B1 is involved in the biological process T Cell Receptor
Signaling Pathway (GO:0050852).
EIF2B1 is involved in the biological process Antigen Receptor-Mediated
Signaling Pathway (GO:0050851).
EIF2B1 has the molecular function of GTPase Regulator Activity (GO:0030695).
EIF2B1 has the molecular function of Guanyl-Nucleotide Exchange Factor
Activity (GO:0005085).
EIF2B1 has the molecular function of Translation Initiation Factor
Activity (GO:0003743).
## 'select()' returned 1:1 mapping between keys and columns
rs1806261 - RABEP1
RABEP1 has the molecular function of Protein Homodimerization
Activity (GO:0042803).
RABEP1 is involved in the biological process Golgi To Plasma Membrane
Transport (GO:0006893).
RABEP1 is involved in the biological process Membrane Fusion (GO:0061025).
RABEP1 is involved in the biological process Membrane Organization (GO:0061024).
RABEP1 is involved in the biological process post-Golgi Vesicle-Mediated
Transport (GO:0006892).
RABEP1 is involved in the biological process Protein Localization To
Cell Periphery (GO:1990778).
## 'select()' returned 1:1 mapping between keys and columns
rs3177647 - MAPKAPK5
MAPKAPK5 has the molecular function of MAP Kinase Kinase Activity (GO:0004708).
MAPKAPK5 has the molecular function of Calcium-Dependent Protein Kinase
Activity (GO:0010857).
MAPKAPK5 has the molecular function of Calcium-Dependent Protein
Serine/Threonine Kinase Activity
## 'select()' returned 1:1 mapping between keys and columns
rs1056879 - PPP2R3C
PPP2R3C is involved in the biological process B Cell Homeostasis (GO:0001782).
PPP2R3C is involved in the biological process T Cell Homeostasis (GO:0043029).
PPP2R3C is involved in the biological process Positive Regulation Of B
Cell Activation (GO:0050871).
PPP2R3C is involved in the biological process Positive Regulation Of B
Cell Differentiation (GO:0045579).
PPP2R3C is involved in the biological process Positive Regulation Of
Lymphocyte Differentiation (GO:0045621).
PPP2R3C is involved in the biological process Regulation Of B Cell
Differentiation (GO:0045577).
PRORP is involved in the biological process Mitochondrial tRNA
Processing (GO:0090646).
PRORP is involved in the biological process tRNA 5’-End Processing (GO:0099116).
PRORP is involved in the biological process tRNA 5’-Leader Removal (GO:0001682).
PRORP is found in Intracellular Organelle Lumen (GO:0070013).
PRORP is found in Mitochondrial Matrix (GO:0005759).
PRORP has the molecular function of tRNA-specific Ribonuclease Activity
(GO:0004549).
## 'select()' returned 1:many mapping between keys and columns