TCGA学习笔记7-绘制热图

上面，我们根据差异表达提取出来的4913个基因在611个样品中的表达数据进行了主成份分析，根据主成份成分的结果，正常与肿瘤样本能很好地分成两类，这对样本的预测是很好的辅助。下面，我们将根据差异表达的结果来绘制热图，看能否找出预示肿瘤组织的标志分子。

> library(gplots) # 在R中，绘制热图的程序有很多，这里我们使用gplots::heatmap.2；
# 首先，我们使用TOP20 Genes来绘制热图
> result_select_FCOrder <- result_select[order(abs(result_select$log2FoldChange),decreasing=T),] # 差异表达基因，按FoldChange的绝对值从大到小排序
> result_select_top20 <- result_select_FCOrder[1:20,] # 选取表达差异最大的20个基因
> head(result_select_top20)[,1:3]
log2 fold change (MLE): group cancer vs normal 
 
DataFrame with 6 rows and 3 columns
                        baseMean    log2FoldChange             lfcSE
                       <numeric>         <numeric>         <numeric>
ENSG00000224490 1714.56503681026   8.4424354795262 0.238664257501869
ENSG00000074803  22193.654237283 -8.27581699230162 0.332982800750979
ENSG00000178776 886.398761602384  7.70696160260639 0.326179843061095
ENSG00000204362 354.175131758015  7.57039832393765 0.294534084998158
ENSG00000104327 2844.95684727121 -7.52918926148832 0.286386451386388
ENSG00000164434 11701.5196273703  7.44063941927556 0.324756004054849
> result_select_top20$log2FoldChange # 看一下变化倍数，18个基因在cancer中高表达，2个基因在normal中高表达
 [1]  8.442435 -8.275817  7.706962  7.570398 -7.529189  7.440639
 [7]  7.330764  7.310661  7.228141  7.097977  7.050444  6.976768
[13]  6.922567  6.909435  6.888393  6.851913  6.846070  6.616500
[19]  6.581890  6.495955
> dataTop20 <- data[rownames(result_select_top20),] # 把reads数加进去
> dim(dataTop20) # 20行,611列
[1]  20 611
> head(dataTop20)[,1:3]
                TCGA-CZ-5465-01A-01R-1503 TCGA-BP-4355-01A-01R-1289
ENSG00000224490                      8222                       864
ENSG00000074803                     19360                       100
ENSG00000178776                      1165                        97
ENSG00000204362                       201                       519
ENSG00000104327                        24                        35
ENSG00000164434                     17120                      6341
                TCGA-CZ-5451-01A-01R-1503
ENSG00000224490                      3772
ENSG00000074803                        10
ENSG00000178776                       423
ENSG00000204362                        20
ENSG00000104327                         6
ENSG00000164434                     21755
> heat <- data.matrix(dataTop20) # heatmap.2需要使用矩阵做输入，因此先转化为矩阵
> dim(heat)
[1]  20 611
> heat_scaled_t <- scale(t(heat)) # 行列转置，并对每个基因的表达进行标准化
> heat_scaled <- t(heat_scaled_t) # 标准化后再行列转置回来
> head(heat_scaled)[,1:3]
                TCGA-CZ-5465-01A-01R-1503 TCGA-BP-4355-01A-01R-1289
ENSG00000224490                2.62485660                -0.3517755
ENSG00000074803               -0.07491003                -0.2754319
ENSG00000178776                0.19061196                -0.4436383
ENSG00000204362               -0.17801482                 0.2129373
ENSG00000104327               -0.25582838                -0.2548436
ENSG00000164434                0.21122299                -0.2379535
                TCGA-CZ-5451-01A-01R-1503
ENSG00000224490                 0.8246374
ENSG00000074803                -0.2763689
ENSG00000178776                -0.2500376
ENSG00000204362                -0.4005379
ENSG00000104327                -0.2574398
ENSG00000164434                 0.4043701
> my_palette <- colorRampPalette(c("blue","white","red")) # 从低到高
> heat_scaled_limit2 <- ifelse(heat_scaled>2,2,heat_scaled) # 这里要做一个变换，即把标准化后大于2的值都等于2，是为了使图形色彩更加鲜明突出
> max(heat_scaled_limit2)
[1] 2
> colnames(heat_scaled_limit2) <- paste(group,1:length(group),sep="_") # 修改列名
> condition_colors <- unlist(lapply(colnames(heat_scaled_limit2),function(x){
+   if(grepl("cancer",x)) "#FFC0CB"
+   else if(grepl("normal",x)) "#808080"
+ })) # 定义分组颜色
> heatmap.2(heat_scaled_limit2,
+           trace="none",
+           density="none",
+           col=bluered(20),
+           cexRow = 1,
+           ColSideColors = condition_colors,
+           hclust=function(x) hclust(x,method="average"),
+           labCol = NA,
+           dendrogram = "none", # 去除dendrogram，但保留clustering
+           margins=c(2,10))
> legend(0,0.8,legend=c("cancer","normal"),fill=c("#FFC0CB","#808080"),cex=0.8)

下面，我们使用TOP50再来绘制热图

> result_select_top50 <- result_select_FCOrder[1:50,]
> result_select_top50$log2FoldChange
 [1]  8.442435 -8.275817  7.706962  7.570398 -7.529189  7.440639
 [7]  7.330764  7.310661  7.228141  7.097977  7.050444  6.976768
[13]  6.922567  6.909435  6.888393  6.851913  6.846070  6.616500
[19]  6.581890  6.495955  6.432374  6.425309  6.424296  6.394360
[25]  6.388963  6.377904  6.356020 -6.275527  6.256749  6.229892
[31]  6.196778  6.186268 -6.185622 -6.149709  6.116301  6.095080
[37]  6.042014  6.007039 -5.927111 -5.869138  5.853670  5.793502
[43]  5.756366  5.729011  5.714809  5.699259  5.695577 -5.694465
[49]  5.681664  5.677330
> dataTop50 <- data[rownames(result_select_top50),]
> heat <- data.matrix(dataTop50)
> heat_scaled_t <- scale(t(heat))
> heat_scaled <- t(heat_scaled_t)
> my_palette <- colorRampPalette(c("blue","white","red")) # from low to high
> heat_scaled_limit2 <- ifelse(heat_scaled>2,2,heat_scaled)
> condition_colors <- unlist(lapply(colnames(heat_scaled_limit2),function(x){
+   if(grepl("cancer",x)) "#FFC0CB"
+   else if(grepl("normal",x)) "#808080"
+ }))
> heatmap.2(heat_scaled_limit2,
+           trace="none",
+           density="none",
+           col=bluered(20),
+           cexRow = 1,
+           ColSideColors = condition_colors,
+           hclust=function(x) hclust(x,method="average"),
+           labCol = NA,
+           dendrogram = "none",
+           margins=c(2,10))
> legend(0,0.8,legend=c("cancer","normal"),fill=c("#FFC0CB","#808080"),cex=0.8)

如果将标准化后的值限定在3以内，看一下图像的变化，颜色会变淡一些

heat_scaled_limit3 <- ifelse(heat_scaled>3,3,heat_scaled)
colnames(heat_scaled_limit3) <- paste(group,1:length(group),sep="_")
condition_colors <- unlist(lapply(colnames(heat_scaled_limit3),function(x){
  if(grepl("cancer",x)) "#FFC0CB"
  else if(grepl("normal",x)) "#808080"
}))
heatmap.2(heat_scaled_limit3,
          trace="none",
          density="none",
          col=bluered(20),
          cexRow = 0.4,
          ColSideColors = condition_colors,
          hclust=function(x) hclust(x,method="average"),
          labCol = NA,
          dendrogram = "none",
          margins=c(2,10))
legend(0,0.8,legend=c("cancer","normal"),fill=c("#FFC0CB","#808080"),cex=0.8)

若限定在4以内，颜色进一步变淡

heat_scaled_limit4 <- ifelse(heat_scaled>4,4,heat_scaled)
colnames(heat_scaled_limit4) <- paste(group,1:length(group),sep="_")
condition_colors <- unlist(lapply(colnames(heat_scaled_limit4),function(x){
  if(grepl("cancer",x)) "#FFC0CB"
  else if(grepl("normal",x)) "#808080"
}))
heatmap.2(heat_scaled_limit4,
          trace="none",
          density="none",
          col=bluered(20),
          cexRow = 0.4,
          ColSideColors = condition_colors,
          hclust=function(x) hclust(x,method="average"),
          labCol = NA,
          dendrogram = "none",
          margins=c(2,10))
legend(0,0.8,legend=c("cancer","normal"),fill=c("#FFC0CB","#808080"),cex=0.8)

注意：对标准化后的值进行限定，基本不会对样本的聚类产生影响，但是会影响基因间的聚类，因此，产生的图像也会不同。