GEO学习笔记1

本文是学习过程,参考教程是《来完成你的生信作业,这是最有诚意的GEO数据库教程》,本文分析GSE32575,共36个配对样本,18个处理前,18个处理后。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
> setwd("D:/GEO")
> library(GEOquery)
> gset <- getGEO('GSE32575',destdir=".",getGPL=F) # 下载数据
> gset <- gset[[1]] # 获取表达矩阵与分组信息
> pdata <- pData(gset) # 分组信息
> dim(pdata)
[1] 48 37 # 48个样品,下面要删除前12个,仅用后36个
> group_list <- c(rep('before',18),rep('after',18))
> group_list <-factor(group_list) # 设定level
> group_list
[1] before before before before before before before before before
[10] before before before before before before before before before
[19] after after after after after after after after after
[28] after after after after after after after after after
Levels: after before
> group_list <- relevel(group_list, ref='before') # 限定level
> group_list
[1] before before before before before before before before before
[10] before before before before before before before before before
[19] after after after after after after after after after
[28] after after after after after after after after after
Levels: before after
> exprSet <- exprs(gset) # 获取表达矩阵
> boxplot(exprSet[,-c(1:12)],outline=FALSE,notch=T,col=group_list,las=2)

1
2
3
library(limma) # 使用limma进行Normalization
exprSet <- normalizeBetweenArrays(exprSet)
boxplot(exprSet[,-c(1:12)],outline=FALSE,notch=T,col=group_list,las=2)

1
2
3
4
5
6
7
8
9
10
# 对数据进行LOG转换,否则数据太过分散
# LOG2 TRANSFORM ----
ex <- exprSet
qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
LogC <- (qx[5] > 100) ||
(qx[6]-qx[1] > 50 && qx[2] > 0) ||
(qx[2] > 0 && qx[2] < 1 && qx[4] > 1 && qx[4] < 2)
if (LogC) { ex[which(ex <= 0)] <- NaN
exprSet <- log2(ex)
print('log2 transform finished')}else{print('log2 transform not needed')}

转换之后,行名是探针名称,需要先转换为基因名称

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
> index <- gset@annotation
> index
[1] "GPL6102" # 探针平台名称
> platformMap <- data.table::fread("platformMap.txt") # 这个文件可在网上下载,对应着平台与R包之间的关系
> platformDB = paste0(platformMap$bioc_package[grep(index,platformMap$gpl)],'.db')
> platformDB
[1] "illuminaHumanv2.db" # 我们需要这个R包的名称,或者直接在TXT文件中查找
> BiocManager::install("illuminaHumanv2.db")
> library(illuminaHumanv2.db)
> probeset <- rownames(exprSet)
> SYMBOL <- annotate::lookUp(probeset,'illuminaHumanv2.db', 'SYMBOL')
> symbol <- as.vector(unlist(SYMBOL))
> probe2symbol <- data.frame(probeset,symbol,stringsAsFactors = F)
> dim(probe2symbol)
[1] 48701 2
> head(probe2symbol)
probeset symbol
1 ILMN_1343289 RPS9
2 ILMN_1343290 UBC
3 ILMN_1343291 EEF1A1
4 ILMN_1343292 TUBB2A
5 ILMN_1343293 TXN
6 ILMN_1343294 ACTB

其实,注释的过程中有些基因是没有名称的,会被标记为NA,另外也可能有重复的,即多个探针可能对应同一基因,因此,需要去掉重复与NA。

1
2
3
4
5
6
7
8
9
10
11
12
13
> exprSet <- exprSet %>% 
rownames_to_column(var='probeset') %>% # 把行名转化为第一列,原因是行名不可以有重复,不方便操作
inner_join(probe2symbol,by='probeset') %>% # 合并probe2symbol,即把symbol加到最后一列
select(-probeset) %>% # 去掉第一列,这是个过渡,用完就可以去掉
select(symbol,everything()) %>% # 重新排序,把symbol放在第一列
mutate(rowMean =rowMeans(.[grep('GSM', names(.))])) %>% # 计算每行除symbol外的平均值
filter(symbol != 'NA') %>% # 留下symbol不是NA的行,开始是48701行,筛选后剩下24459行
arrange(desc(rowMean)) %>% # 按行的平均值重新排列
distinct(symbol,.keep_all = T) %>% # 若symbol有重复,留下第一个,即rowMean最大的一个,筛选后剩下18954行;
select(-rowMean) %>% # 去掉rowMean列
column_to_rownames(var = 'symbol') # 把第一列转化为行名,操作结束
> dim(exprSet)
[1] 18954 36

到此,数据就准备好了,下面进行差异分析

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
> pairinfo <- factor(rep(1:18,2))
> pairinfo
[1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1 2 3 4
[23] 5 6 7 8 9 10 11 12 13 14 15 16 17 18
Levels: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
> design <- model.matrix(~ pairinfo+group_list)
> fit <- lmFit(exprSet,design)
> fit <- eBayes(fit)
> allDiff_pair <- topTable(fit,adjust='BH',coef='group_listafter',number=Inf,p.value=0.05) # group_listafter是design中的一列
> dim(allDiff_pair)
[1] 7649 6 # 得到7649个差异基因
> head(allDiff_pair)
logFC AveExpr t P.Value adj.P.Val
R3HCC1 -0.5120536 9.488934 -21.00187 2.017132e-15 3.823273e-11
CHD9 0.9151318 9.456901 20.10559 4.773134e-15 4.523499e-11
PIK3CG 0.7508626 8.864623 18.75847 1.864086e-14 9.249356e-11
SCAND1 -0.6169805 10.540154 -18.69975 1.982044e-14 9.249356e-11
ASAH1 -0.7523775 9.364233 -18.50207 2.439948e-14 9.249356e-11
FAM76B 0.8384345 8.265115 17.90582 4.623183e-14 1.413263e-10
B
R3HCC1 25.17464
CHD9 24.38000
PIK3CG 23.10821
SCAND1 23.05052
ASAH1 22.85487
FAM76B 22.25094

作图验证一下,作图之前需要对数据格式进行调整,以适应ggplot2对输入数据的要求,即行是样品,列是基因

1
2
3
4
5
6
7
8
9
10
11
12
13
data_plot <- as.data.frame(t(exprSet))
data_plot <- data.frame(pairinfo=rep(1:18,2),
group=group_list,
data_plot,stringsAsFactors = F)
library(ggplot2)
ggplot(data_plot, aes(group,R3HCC1,fill=group)) +
geom_boxplot() +
geom_point(size=2, alpha=0.5) +
geom_line(aes(group=pairinfo), colour='black', linetype='11') +
xlab('') +
ylab(paste('Expression of ','R3HCC1'))+
theme_classic()+
theme(legend.position = 'none')

批量作图,挑差异最大的10个基因

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
> library(dplyr)
> library(tibble)
> allDiff_arrange <- allDiff_pair %>%
rownames_to_column(var='genesymbol') %>% # 行名转化为第一列
arrange(desc(abs(logFC))) # 对logFC的绝对值进行降序排列
> genes <- allDiff_arrange$genesymbol[1:10] # 排名前10的基因
> genes
[1] "CH25H" "HOXA5" "ID1" "ZBTB16" "IL1R2" "CA4" "SLC7A5"
[8] "DDIT4" "KLF9" "ACKR3"
> plotlist <- lapply(genes, function(x){
data =data.frame(data_plot[,c('pairinfo','group')],gene=data_plot[,x])
ggplot(data, aes(group,gene,fill=group)) +
geom_boxplot() +
geom_point(size=2, alpha=0.5) +
geom_line(aes(group=pairinfo), colour='black', linetype='11') +
xlab('') +
ylab(paste('Expression of ',x))+
theme_classic()+
theme(legend.position = 'none')
})
> library(cowplot)
> plot_grid(plotlist=plotlist, ncol=5,labels = LETTERS[1:10]) # 这个作图方法真是高明,感叹!

画热图

1
2
3
4
5
6
7
8
9
10
11
12
13
14
library(pheatmap)
allDiff_pair2 <- topTable(fit,adjust='BH',coef='group_listafter',number=Inf,p.value=0.05,lfc = 1) # lfc=1,代表logFC绝对值大于1
heatdata <- exprSet[rownames(allDiff_pair2),]
annotation_col<- data.frame(group_list)
rownames(annotation_col) <- colnames(heatdata)
pheatmap(heatdata,
cluster_rows = TRUE,
cluster_cols = TRUE,
annotation_col =annotation_col,
annotation_legend=TRUE,
show_rownames = T,
show_colnames = T,
scale = 'row', # scale一定选行,即在基因内标准化
color =colorRampPalette(c('blue', 'white','red'))(100))

绘制火山图

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
> library(ggplot2)
> library(ggrepel)
> library(dplyr)
> data <- topTable(fit,adjust='BH',coef='group_listafter',number=Inf)
> data$significant <- as.factor(data$adj.P.Val<0.05 & abs(data$logFC) > 0.6) #以logFC=1.5为界
> data$gene <- rownames(data)
> ggplot(data=data, aes(x=logFC, y =-log10(adj.P.Val),color=significant)) +
+ geom_point(alpha=0.8, size=1.2,col='black')+
+ geom_point(data=subset(data, logFC > 0.6),alpha=0.8, size=1.2,col='red')+
+ geom_point(data=subset(data, logFC < -0.6),alpha=0.6, size=1.2,col='blue')+
+ labs(x='log2 (fold change)',y='-log10 (adj.P.Val)')+
+ theme(plot.title = element_text(hjust = 0.4))+
+ geom_hline(yintercept = -log10(0.05),lty=4,lwd=0.6,alpha=0.8)+
+ geom_vline(xintercept = c(0.6,-0.6),lty=4,lwd=0.6,alpha=0.8)+
+ theme_bw()+
+ theme(panel.border = element_blank(),
+ panel.grid.major = element_blank(),
+ panel.grid.minor = element_blank(),
+ axis.line = element_line(colour = 'black')) +
+ geom_point(data=subset(data, abs(logFC) > 1),alpha=0.8, size=3,col='green')+
+ geom_text_repel(data=subset(data, abs(logFC) > 1),
+ aes(label=gene),col='black',alpha = 0.8)

富集分析

1
2
3
4
5
6
7
> library(clusterProfiler)
> gene <- rownames(allDiff_pair) # 只需要基因名
> gene <- bitr(gene, fromType='SYMBOL', toType='ENTREZID', OrgDb='org.Hs.eg.db')
> de <- gene$ENTREZID
> go <- enrichGO(gene = de, OrgDb = 'org.Hs.eg.db', ont='all')
> library(ggplot2)
> dotplot(go, split='ONTOLOGY') +facet_grid(ONTOLOGY~., scale='free')

1
2
3
4
> EGG <- enrichKEGG(gene= gene$ENTREZID,
organism = 'hsa',
pvalueCutoff = 0.05)
> dotplot(EGG)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
> test <- data.frame(EGG)
> head(test)[,1:7]
ID Description GeneRatio BgRatio
hsa04120 hsa04120 Ubiquitin mediated proteolysis 88/3122 140/8048
hsa04218 hsa04218 Cellular senescence 94/3122 156/8048
hsa04210 hsa04210 Apoptosis 84/3122 136/8048
hsa04110 hsa04110 Cell cycle 76/3122 124/8048
hsa05220 hsa05220 Chronic myeloid leukemia 51/3122 76/8048
hsa05131 hsa05131 Shigellosis 130/3122 242/8048
pvalue p.adjust qvalue
hsa04120 5.535090e-09 1.787834e-06 1.229373e-06
hsa04218 3.626260e-08 4.287401e-06 2.948155e-06
hsa04210 3.982106e-08 4.287401e-06 2.948155e-06
hsa04110 2.770947e-07 2.237540e-05 1.538605e-05
hsa05220 4.909519e-07 3.171549e-05 2.180860e-05
hsa05131 1.268222e-06 6.827259e-05 4.694645e-05
> browseKEGG(EGG, 'hsa04218')

  • 本文作者:括囊无誉
  • 本文链接: GEO/GEO01/
  • 版权声明: 本博客所有文章均为原创作品,转载请注明出处!
------ 本文结束 ------
坚持原创文章分享,您的支持将鼓励我继续创作!