将ENSEMBL ID转换为数据框中的基因ID

问题描述 投票:0回答:1

我有一个大的RNA-seq数据表,由ensembl_gene_id列出,但我想转换为hgnc_symbol,以便于在热图上直观显示。

到目前为止,我有以下代码-但不确定如何继续。从头开始转换名称还是仅对子集数据进行转换会更好?

我也更熟悉python,通常我会使用字典来映射ensembl_gene_id和hgnc_symbol,但是在R中,不确定如何执行此操作。我的直觉说for循环是不可扩展的。

任何建议将不胜感激。

library(biomaRt)
library(RColorBrewer)
#Load ggplot2 for graphing
#library(ggplot2)

# Load the gene expression file: mean TPM for genes across cell types.
GE_file <- read.csv(file = "mean_tpm_merged.csv")

# Get the header names of this file
headers <- names(GE_file)

# Define the biomart object (Ensembl, human gene dataset)
mart <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")

# Define genes of interest
GOI <- c("TFEB", "RAC1", "TFE3", "RAB5A")

# Query biomart for the ENSEMBL ID / HGNC symbol pairs of the genes of interest
IDs <- getBM(attributes = c("ensembl_gene_id", "hgnc_symbol"),
             filters = "hgnc_symbol", values = GOI,
             mart = mart)

# Use the ENSEMBL IDs (not the symbols) as row names, so that
# IDs[<ensembl id>, "hgnc_symbol"] works as a dictionary lookup.
# Row names must be unique, so keep only the first row for each ID.
IDs <- IDs[!duplicated(IDs$ensembl_gene_id), ]
row.names(IDs) <- IDs$ensembl_gene_id

# Keep only the rows of the large dataset that belong to the genes of interest
Data_subset <- subset(GE_file, gene %in% IDs$ensembl_gene_id)

# Make the row names the ENSEMBL IDs
row.names(Data_subset) <- Data_subset[, 1]

# Drop the first column (the IDs), as it is not needed for the numerical
# matrix. Selecting with -1 avoids hard-coding the number of cell-type columns;
# drop = FALSE keeps a data frame even if only one column remains.
Data_subset_matrix <- as.matrix(Data_subset[, -1, drop = FALSE])

# colors should be green/red if possible, or whatever is color blind compatible.
# should go row-wise for the coloring.
# excise colors for B cells/NK cells/CD8 T cells.
my_palette <- colorRampPalette(c("red", "green"))(n = 299)
heatmap(Data_subset_matrix, Colv = NA, Rowv = NA, scale = "row",
        col = my_palette)

一些相关输出:

> dput(head(GE_file))
structure(list(gene = c("ENSG00000223116", "ENSG00000233440", 
"ENSG00000207157", "ENSG00000229483", "ENSG00000252952", "ENSG00000235205"
), T.cell..CD4..naive..activated. = c(0, 0.0034414596504, 0, 
0, 0, 0), NK.cell..CD56dim.CD16. = c(0, 0, 0, 0, 0, 0.0139463278778
), T.cell..CD4..TFH = c(0, 0, 0, 0, 0, 0), T.cell..CD4..memory.TREG = c(0, 
0, 0, 0, 0, 0.000568207845073), T.cell..CD4..TH1.17 = c(0, 0.0196376949773, 
0, 0, 0, 0), B.cell..naive = c(0, 0, 0, 0, 0, 0), T.cell..CD4..TH2 = c(0, 
0, 0, 0, 0, 0), T.cell..CD4..TH1 = c(0, 0, 0, 0, 0, 0.000571213481481
), T.cell..CD4..naive = c(0, 0, 0, 0, 0, 0), T.cell..CD4..TH17 = c(0, 
0.00434618468012, 0, 0, 0, 0), Monocyte..classical = c(0, 0, 
0, 0, 0, 0), Monocyte..non.classical = c(0, 0, 0, 0, 0, 0), T.cell..CD4..naive.TREG = c(0, 
0, 0, 0, 0, 0.000821516453853), T.cell..CD8..naive = c(0, 0, 
0, 0, 0, 0.000508869486411), T.cell..CD8..naive..activated. = c(0, 
0.00348680689669, 0, 0, 0, 0)), row.names = c(NA, 6L), class = "data.frame")

r bioinformatics bioconductor biomart
1个回答
0
投票

一次性获得所有内容:

# Select the human gene dataset of the Ensembl BioMart.
mart <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")

# One query for the whole table: request the Ensembl ID / HGNC symbol pair
# for every ID found in the first column of GE_file.
IDs <- getBM(
  attributes = c("ensembl_gene_id", "hgnc_symbol"),
  filters = "ensembl_gene_id",
  values = GE_file[[1]],
  mart = mart
)

head(IDs)
  ensembl_gene_id hgnc_symbol
1 ENSG00000207157      RNY3P4
2 ENSG00000229483   LINC00362
3 ENSG00000233440     HMGA1P6
4 ENSG00000235205    TATDN2P3
5 ENSG00000252952    RNU6-58P

# Example genes of interest, given as HGNC symbols
GOI <- c("RNY3P4", "TATDN2P3")

简单的方法:先找出目标基因对应的 Ensembl ID,再据此对主表取子集:

# Ensembl IDs whose HGNC symbol is one of the genes of interest
GOI_ens <- with(IDs, ensembl_gene_id[hgnc_symbol %in% GOI])

# Rows of the main table for those IDs, without the first (ID) column
Data_subset <- GE_file[GE_file$gene %in% GOI_ens, -1]

字典方式:也可以像下面这样做,但需要确保没有重复的基因符号:

# Keep only the first row for each HGNC symbol, so that no symbol appears
# twice in the dictionary.
dedup <- !duplicated(IDs$hgnc_symbol)

# Named character vector used as a dictionary: names are Ensembl IDs, values
# are HGNC symbols. A plain named vector always yields one symbol per lookup,
# whereas tapply(..., unique) returns a list as soon as one ID has several
# symbols.
dict <- setNames(IDs$hgnc_symbol[dedup], IDs$ensembl_gene_id[dedup])

# Look the symbols up by name; as.character() guards against `gene` being a
# factor, which would otherwise index the dictionary by integer code.
# IDs missing from the dictionary give NA, which is never %in% GOI.
subset(GE_file, dict[as.character(gene)] %in% GOI)
© www.soinside.com 2019 - 2024. All rights reserved.