transcript整合为gene表达值，tximport到底干了啥…

Alex • 2024-07-11 21:42 • 杂文

有些问题来来回回，学了忘，忘了学，反反复复很多次，用到时依然是模糊的。之所以被遗忘，也许是问题本身不重要，没有什么存在感；也许是要学的东西很多，脑容量不够。Anyway，既然记不住，那只能动动手记下来了，以便日后疑惑时有迹可循。

RNA-seq数据定量流程使用的软件一直是kallisto，用过的人都知道其定量的结果为转录本表达值，可是通常情况下，大家需要的还是基因层面的表达量，因此，转录本定量整合为基因表达值无可避免。

整合可以借助R包tximport轻松完成，用过的都知道该包可以轻松搞定Salmon、Alevin、kallisto、RSEM、StringTie软件结果的整合，几行代码运行下来便将转录本表达值整合为基因水平。

library(tximport)

files <- list.files('result', '*.tsv', full.names=T, recursive=T)
names(files) <- sapply(strsplit(basename(files), split='//.'), function(x) x[1])
files
                                ctrl1
    "result/ctrl1/kallisto/ctrl1.tsv"                                                                                                            
                                ctrl2
    "result/ctrl2/kallisto/ctrl2.tsv"                                                                                                             
                                ctrl3
    "result/ctrl3/kallisto/ctrl3.tsv"                                                                                                        
                              stroke1
"result/stroke1/kallisto/stroke1.tsv"                                                                                                        
                              stroke2
"result/stroke2/kallisto/stroke2.tsv"                                                                                                        
                              stroke3
"result/stroke3/kallisto/stroke3.tsv"

tx2gene <- read.table('gencode_hg38_tx2g.txt', header=T, sep="/t")
head(tx2gene)
               ENST              ENSG
1 ENST00000456328.2 ENSG00000223972.5
2 ENST00000450305.2 ENSG00000223972.5
3 ENST00000488147.1 ENSG00000227232.5
4 ENST00000619216.1 ENSG00000278267.1
5 ENST00000473358.1 ENSG00000243485.5
6 ENST00000469289.1 ENSG00000243485.5

txi <- tximport(files, type="kallisto", tx2gene=tx2gene, ignoreAfterBar=T)
str(txi)
List of 4
 $ abundance          : num [1:60669, 1:6] 0.145 0 4.455 3.331 1.435 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:60669] "ENSG00000000003.15" "ENSG00000000005.6" "ENSG00000000419.12" "ENSG00000000457.14" ...
  .. ..$ : chr [1:6] "ctrl1" "ctrl2" "ctrl3" "stroke1" ...
 $ counts             : num [1:60669, 1:6] 34.6 0 253 618.7 262.2 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:60669] "ENSG00000000003.15" "ENSG00000000005.6" "ENSG00000000419.12" "ENSG00000000457.14" ...
  .. ..$ : chr [1:6] "ctrl1" "ctrl2" "ctrl3" "stroke1" ...
 $ length             : num [1:60669, 1:6] 3516 642 835 2732 2689 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:60669] "ENSG00000000003.15" "ENSG00000000005.6" "ENSG00000000419.12" "ENSG00000000457.14" ...
  .. ..$ : chr [1:6] "ctrl1" "ctrl2" "ctrl3" "stroke1" ...
 $ countsFromAbundance: chr "no"

整合时需要一个tx2gene表格，内容为转录本id与基因id的对应关系。tximport整合生成的是一个列表，里面包含四个结果，前面三个分别是TPM、原始count、基因长度。countsFromAbundance指定整合原始count的方法，默认为no。

虽然整合的流程做起来很轻松，但背后具体是怎么将转录本表达值合并为基因水平的呢？基因的表达值是不是转录本表达值的简单加和？下面一起来拭目以待。如果countsFromAbundance选定了某个方法，则用矫正后的abundance去估计原始count，然后整合为基因水平。如没有选择矫正方法，则直接整合为基因水平。下面以样本ctrl1为例，演示一下默认情况下整合的核心过程：

ctrl1 <- read.table("result/ctrl1/kallisto/ctrl1.tsv", stringsAsFactors=F, header=T, sep='/t')
ctrl1$target_id <- sapply(strsplit(ctrl1$target_id, split='//|'), function(x) x[2])
head(ctrl1)
          target_id length eff_length est_counts      tpm
1 ENSG00000223972.5   1657   1405.040   166.5320 1.743560
2 ENSG00000223972.5    632    380.273     0.0000 0.000000
3 ENSG00000227232.5   1351   1099.040    45.7171 0.611919
4 ENSG00000278267.1     68     25.760     0.0000 0.000000
5 ENSG00000243485.5    712    460.158     0.0000 0.000000
6 ENSG00000243485.5    535    283.436     0.0000 0.000000

abundanceMat <- rowsum(ctrl1[,'tpm'], ctrl1$target_id)
countsMat <- rowsum(ctrl1[,'est_counts'], ctrl1$target_id)

weightedLength <- rowsum(ctrl1[,'tpm'] * ctrl1[,'eff_length'], ctrl1$target_id)
lengthMat <- weightedLength/abundanceMat

从计算过程可知，默认情况下直接将各转录本加和作为基因的表达值，基因长度则为加权平均值。下面可以对比一下结果：

# abundance
identical(abundanceMat[,1], txi$abundance[,'ctrl1'])
[1] TRUE

head(abundanceMat)
                         [,1]
ENSG00000000003.15   0.144553
ENSG00000000005.6    0.000000
ENSG00000000419.12   4.454973
ENSG00000000457.14   3.331466
ENSG00000000460.17   1.434763
ENSG00000000938.13 141.169300

head(txi$abundance[,'ctrl1', drop=F])
                        ctrl1
ENSG00000000003.15   0.144553
ENSG00000000005.6    0.000000
ENSG00000000419.12   4.454973
ENSG00000000457.14   3.331466
ENSG00000000460.17   1.434763
ENSG00000000938.13 141.169300

# count
identical(countsMat[,1], txi$counts[,'ctrl1'])
[1] TRUE

head(countsMat)
                         [,1]
ENSG00000000003.15    34.5503
ENSG00000000005.6      0.0000
ENSG00000000419.12   253.0003
ENSG00000000457.14   618.7436
ENSG00000000460.17   262.2473
ENSG00000000938.13 17233.0064

head(txi$counts[,'ctrl1', drop=F])
                        ctrl1
ENSG00000000003.15    34.5503
ENSG00000000005.6      0.0000
ENSG00000000419.12   253.0003
ENSG00000000457.14   618.7436
ENSG00000000460.17   262.2473
ENSG00000000938.13 17233.0064

# length
head(lengthMat)
                        [,1]
ENSG00000000003.15 3516.0400
ENSG00000000005.6        NaN
ENSG00000000419.12  835.4205
ENSG00000000457.14 2732.1483
ENSG00000000460.17 2688.8169
ENSG00000000938.13 1795.7674

head(txi$length[,'ctrl1', drop=F])
                       ctrl1
ENSG00000000003.15 3516.0400
ENSG00000000005.6   642.3992
ENSG00000000419.12  835.4205
ENSG00000000457.14 2732.1483
ENSG00000000460.17 2688.8169
ENSG00000000938.13 1795.7674

可以看到abundance和count的结果与软件完全一致，length除了有些缺失值，其余也一致。整合多个样本时，软件对于长度缺失值的处理如下：

aveLengthSamp <- rowMeans(lengthMatTx)
aveLengthSampGene <- tapply(aveLengthSamp, geneId, mean)
lengthMat <- replaceMissingLength(lengthMat, aveLengthSampGene)

先计算样本间每个转录本长度的平均值，然后基于计算每个基因所有转录本的长度的平均值，作为基因的长度，用于填充缺失值。

版权声明：
作者：Alex
链接：https://www.techfm.club/p/137493.html
来源：TechFM
文章版权归作者所有，未经允许请勿转载。

THE END

代码

二维码

软文发稿技巧：如何选择合适的发稿平台

< <上一篇

半决赛硝烟起，阿根廷捍卫荣耀，加拿大黑马奇迹能否续写？

下一篇>>

搜索内容

transcript整合为gene表达值，tximport到底干了啥…

取消回复

共有 0 条评论

Ads