GEO学习笔记2

分析GSE100666,6个配对样本,平台是GPL16951,分析过程如下:

1
2
3
4
5
6
7
8
9
10
11
12
setwd("D:/GEO")
library(GEOquery)
gset <- getGEO('GSE100666',destdir=".",getGPL=F)
gset <- gset[[1]]
pdata <- pData(gset)
dim(pdata)
group_list <- c(rep('tumor',3),rep('normal',3))
group_list <-factor(group_list)
group_list <- relevel(group_list, ref='tumor')
group_list
exprSet <- exprs(gset)
boxplot(exprSet,outline=FALSE,notch=T,col=group_list,las=2)

1
2
3
4
# NORMALIZATION ----
library(limma)
exprSet <- normalizeBetweenArrays(exprSet)
boxplot(exprSet,outline=FALSE,notch=T,col=group_list,las=2)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# LOG2 TRANSFORM ----
ex <- exprSet
qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
LogC <- (qx[5] > 100) ||
(qx[6]-qx[1] > 50 && qx[2] > 0) ||
(qx[2] > 0 && qx[2] < 1 && qx[4] > 1 && qx[4] < 2)
if (LogC) { ex[which(ex <= 0)] <- NaN
exprSet <- log2(ex) }
dim(exprSet)
# ANNOTATION ----
gpl<- getGEO('GPL16951',destdir = './')
gpl<- getGEO(filename = './GPL16951.soft')
GPL16951<- GEOquery::Table(gpl)
probeset <- rownames(exprSet)
probe2symbol <- data.frame(probeset,symbol=GPL16951$Gene_symbol,stringsAsFactors = F)
dim(probe2symbol)
head(probe2symbol)
# REMOVE NA ----
library(dplyr)
library(tibble)
exprSet <- as.data.frame(exprSet)
exprSet <- exprSet %>%
rownames_to_column(var='probeset') %>%
inner_join(probe2symbol,by='probeset') %>%
select(-probeset) %>%
select(symbol,everything()) %>%
mutate(rowMean =rowMeans(.[grep('GSM', names(.))])) %>%
filter(symbol != 'NA') %>%
arrange(desc(rowMean)) %>%
distinct(symbol,.keep_all = T) %>%
select(-rowMean) %>%
column_to_rownames(var = 'symbol')
exprSet <- na.omit(exprSet)
dim(exprSet)
# DIFF EXPRESS ----
pairinfo <- factor(rep(1:3,2))
design <- model.matrix(~ pairinfo+group_list)
fit <- lmFit(exprSet,design)
fit <- eBayes(fit)
allDiff_pair <- topTable(fit,adjust='BH',coef='group_listnormal',number=Inf,p.value=0.05)
# PLOT ----
data_plot <- as.data.frame(t(exprSet))
data_plot <- data.frame(pairinfo=rep(1:3,2),
group=group_list,
data_plot,stringsAsFactors = F)
library(ggplot2)
ggplot(data_plot, aes(group,ATP6V1G3,fill=group)) +
geom_boxplot() +
geom_point(size=2, alpha=0.5) +
geom_line(aes(group=pairinfo), colour='black', linetype='11') +
xlab('') +
ylab(paste('Expression of ','ATP6V1G3'))+
theme_classic()+
theme(legend.position = 'none')

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# COMBINE PLOT ----
library(dplyr)
library(tibble)
allDiff_arrange <- allDiff_pair %>%
rownames_to_column(var='genesymbol') %>%
arrange(desc(abs(logFC)))
genes <- allDiff_arrange$genesymbol[1:10]
genes
plotlist <- lapply(genes, function(x){
data =data.frame(data_plot[,c('pairinfo','group')],gene=data_plot[,x])
ggplot(data, aes(group,gene,fill=group)) +
geom_boxplot() +
geom_point(size=2, alpha=0.5) +
geom_line(aes(group=pairinfo), colour='black', linetype='11') +
xlab('') +
ylab(paste('Expression of ',x))+
theme_classic()+
theme(legend.position = 'none')
})

library(cowplot)
plot_grid(plotlist=plotlist, ncol=5,labels = LETTERS[1:10])

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# HEATMAP ----
library(pheatmap)
allDiff_pair2 <- topTable(fit,adjust='BH',coef='group_listnormal',number=Inf,p.value=0.05,lfc = 1)
heatdata <- exprSet[rownames(allDiff_pair2),]
#heat <- na.omit(heatdata)
annotation_col <- data.frame(group_list)
rownames(annotation_col) <- colnames(heatdata)
pheatmap(heatdata,
cluster_rows = TRUE,
cluster_cols = TRUE,
annotation_col =annotation_col,
annotation_legend=TRUE,
show_rownames = F,
show_colnames = T,
scale = 'row',
color =colorRampPalette(c('blue', 'white','red'))(100))

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# VOLCANO PLOT ----
library(ggplot2)
library(ggrepel)
library(dplyr)

data <- topTable(fit,adjust='BH',coef='group_listnormal',number=Inf)
data$significant <- as.factor(data$adj.P.Val<0.05 & abs(data$logFC) > 1)
data$gene <- rownames(data)

ggplot(data=data, aes(x=logFC, y =-log10(adj.P.Val),color=significant)) +
geom_point(alpha=0.8, size=1.2,col='black')+
geom_point(data=subset(data, logFC > 1),alpha=0.8, size=1.2,col='red')+
geom_point(data=subset(data, logFC < -1),alpha=0.6, size=1.2,col='blue')+
labs(x='log2 (fold change)',y='-log10 (adj.P.Val)')+
theme(plot.title = element_text(hjust = 0.4))+
geom_hline(yintercept = -log10(0.05),lty=4,lwd=0.6,alpha=0.8)+
geom_vline(xintercept = c(1,-1),lty=4,lwd=0.6,alpha=0.8)+
theme_bw()+
theme(panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = 'black'))

1
2
3
4
5
6
7
8
# ENRICH 
library(clusterProfiler)
gene <- rownames(allDiff_pair)
gene <- bitr(gene, fromType='SYMBOL', toType='ENTREZID', OrgDb='org.Hs.eg.db')
de <- gene$ENTREZID
go <- enrichGO(gene = de, OrgDb = 'org.Hs.eg.db', ont='all')
library(ggplot2)
dotplot(go, split='ONTOLOGY') +facet_grid(ONTOLOGY~., scale='free')

1
2
3
4
EGG <- enrichKEGG(gene= gene$ENTREZID,
organism = 'hsa',
pvalueCutoff = 0.05)
dotplot(EGG)

1
2
3
test <- data.frame(EGG)
head(test)[,1:7]
browseKEGG(EGG, 'hsa04151')

  • 本文作者:括囊无誉
  • 本文链接: GEO/GEO02/
  • 版权声明: 本博客所有文章均为原创作品,转载请注明出处!
------ 本文结束 ------
坚持原创文章分享,您的支持将鼓励我继续创作!