# 我有一个 FASTA 文件，我想把它放进一个有 4 列的表中
# 代码：install.packages("seqinr")
library("seqinr")

# Build a four-column table (ID, names, specie, sequences) from a FASTA file.
#
# read.fasta() returns a list with one element per FASTA record, not a data
# frame, so nrow(dnaseq) is NULL and row/column indexing such as
# dnaseq[i, 3] cannot work. The table is therefore assembled from each
# record's header line and its residues.
# NOTE: seqinr lower-cases DNA by default (forceDNAtolower = TRUE); pass
# forceDNAtolower = FALSE to read.fasta() if the original case matters.
dnaseq <- read.fasta(file = "C:/Users/user/Downloads/my.fasta")

# Full header line of every record, with the leading ">" removed.
headers <- sub(
  "^>", "",
  vapply(dnaseq, getAnnot, character(1), USE.NAMES = FALSE)
)

# NOTE(review): assumes each header is laid out as "ID|names|specie" --
# TODO confirm against the actual file and adjust `header_sep` or the field
# positions below if the layout differs.
header_sep <- "|"
header_fields <- strsplit(headers, header_sep, fixed = TRUE)

# Return the i-th header field of every record, or NA when a header has
# fewer than i fields, so short headers do not abort the whole table.
nth_field <- function(i) {
  vapply(
    header_fields,
    function(parts) if (length(parts) >= i) parts[[i]] else NA_character_,
    character(1)
  )
}

dff <- data.frame(
  ID = nth_field(1L),
  names = nth_field(2L),
  specie = nth_field(3L),
  # Each record is a vector of single residues; collapse it to one string.
  sequences = vapply(
    dnaseq, paste, character(1),
    collapse = "", USE.NAMES = FALSE
  ),
  stringsAsFactors = FALSE
)
# 结果：View(dff) 显示表中无可用数据
# 我也试过了 library(phylotools)：fasta.df = read.fasta("C:/Users/user/Downloads/my.fasta",names=['ID','names','specie','sequence'])
# 结果报错：Error: unexpected '[' in "fasta.df = read.fasta("C:/Users/user/Downloads/my.fasta",names=["
我希望你的 FASTA 文件是从 GENCODE 下载的。
FASTA 文件看起来像这样：
>ENST00000456328.2|ENSG00000223972.5|OTTHUMG00000000961|OTTHUMT00000362751.1|DDX11L1-202|DDX11L1|1657|processed_transcript|
GTTAACTTGCCGTCAGCCTTTTCTTTGACCTCTTCTTTCTGTTCATGTGTATTTGCTGTC
>ENST00000450305.2|ENSG00000223972.5|OTTHUMG00000000961|OTTHUMT00000002844.1|DDX11L1-201|DDX11L1|632|transcribed_unprocessed_pseudogene|
GTGTCTGACTTCCAGCAACTGCTGGCCTGTGCCAGGGTGCAAGCTGAGTTGGAGGAAAGA
.
.
# Load the required packages.
#install.packages("seqinr")
library(seqinr)
#install.packages("tidyverse")
library(tidyverse)

# Read the FASTA file into R; the result is a list with one named element
# per record.
gencode_fasta <- read.fasta(file = "Path/to/my/fasta/file.fa")

# Put the record names (the header text) into a one-column data frame called
# target_id. stringsAsFactors = FALSE keeps the column character on R < 4.0.
header_df <- data.frame(
  target_id = names(gencode_fasta),
  stringsAsFactors = FALSE
)

# Split the pipe-delimited header into one column per field. The sample
# header ends with a trailing "|", so the ninth piece ("unknown") is the
# empty string after it. remove = FALSE keeps the full header in target_id.
header_df <- separate(
  header_df,
  col = target_id,
  into = c(
    "transcript", "ens_gene", "OG", "OT", "c5",
    "ext_gene", "num", "type", "unknown"
  ),
  sep = "\\|",
  remove = FALSE
)

# Keep only the columns that are needed.
my_gene_df <- header_df[, c("ext_gene", "ens_gene", "target_id")]