GEO学习笔记2

分析GSE100666，6个配对样本，平台是GPL16951，分析过程如下：

setwd("D:/GEO")
library(GEOquery)
gset <- getGEO('GSE100666',destdir=".",getGPL=F)
gset <- gset[[1]]
pdata <- pData(gset)
dim(pdata)
group_list <- c(rep('tumor',3),rep('normal',3))
group_list <-factor(group_list)
group_list <- relevel(group_list, ref='tumor')
group_list
exprSet <- exprs(gset)
boxplot(exprSet,outline=FALSE,notch=T,col=group_list,las=2)

# NORMALIZATION ----
library(limma)
exprSet <- normalizeBetweenArrays(exprSet)
boxplot(exprSet,outline=FALSE,notch=T,col=group_list,las=2)

# LOG2 TRANSFORM ----
ex <- exprSet
qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
LogC <- (qx[5] > 100) ||
  (qx[6]-qx[1] > 50 && qx[2] > 0) ||
  (qx[2] > 0 && qx[2] < 1 && qx[4] > 1 && qx[4] < 2)
if (LogC) { ex[which(ex <= 0)] <- NaN
exprSet <- log2(ex) }
dim(exprSet)
# ANNOTATION ----
gpl<- getGEO('GPL16951',destdir = './')
gpl<- getGEO(filename = './GPL16951.soft')
GPL16951<- GEOquery::Table(gpl)
probeset <- rownames(exprSet)
probe2symbol <- data.frame(probeset,symbol=GPL16951$Gene_symbol,stringsAsFactors = F)
dim(probe2symbol)
head(probe2symbol)
# REMOVE NA ----
library(dplyr)
library(tibble)
exprSet <- as.data.frame(exprSet)
exprSet <- exprSet %>% 
  rownames_to_column(var='probeset') %>% 
  inner_join(probe2symbol,by='probeset') %>% 
  select(-probeset) %>% 
  select(symbol,everything()) %>% 
  mutate(rowMean =rowMeans(.[grep('GSM', names(.))])) %>% 
  filter(symbol != 'NA') %>% 
  arrange(desc(rowMean)) %>% 
  distinct(symbol,.keep_all = T) %>% 
  select(-rowMean) %>% 
  column_to_rownames(var = 'symbol')
exprSet <- na.omit(exprSet)
dim(exprSet)
# DIFF EXPRESS ----
pairinfo <- factor(rep(1:3,2))
design <- model.matrix(~ pairinfo+group_list)
fit <- lmFit(exprSet,design)
fit <- eBayes(fit) 
allDiff_pair <- topTable(fit,adjust='BH',coef='group_listnormal',number=Inf,p.value=0.05)
# PLOT ----
data_plot <- as.data.frame(t(exprSet))
data_plot <- data.frame(pairinfo=rep(1:3,2),
                       group=group_list,
                       data_plot,stringsAsFactors = F)
library(ggplot2)
ggplot(data_plot, aes(group,ATP6V1G3,fill=group)) +
  geom_boxplot() +
  geom_point(size=2, alpha=0.5) +
  geom_line(aes(group=pairinfo), colour='black', linetype='11') +
  xlab('') +
  ylab(paste('Expression of ','ATP6V1G3'))+
  theme_classic()+
  theme(legend.position = 'none')

# COMBINE PLOT ----
library(dplyr)
library(tibble)
allDiff_arrange <- allDiff_pair %>% 
  rownames_to_column(var='genesymbol') %>% 
  arrange(desc(abs(logFC)))
genes <- allDiff_arrange$genesymbol[1:10]
genes
plotlist <- lapply(genes, function(x){
  data =data.frame(data_plot[,c('pairinfo','group')],gene=data_plot[,x])
  ggplot(data, aes(group,gene,fill=group)) +
    geom_boxplot() +
    geom_point(size=2, alpha=0.5) +
    geom_line(aes(group=pairinfo), colour='black', linetype='11') +
    xlab('') +
    ylab(paste('Expression of ',x))+
    theme_classic()+
    theme(legend.position = 'none')
})

library(cowplot)
plot_grid(plotlist=plotlist, ncol=5,labels = LETTERS[1:10])

# HEATMAP ----
library(pheatmap)
allDiff_pair2 <- topTable(fit,adjust='BH',coef='group_listnormal',number=Inf,p.value=0.05,lfc = 1) 
heatdata <- exprSet[rownames(allDiff_pair2),]
#heat <- na.omit(heatdata)
annotation_col <- data.frame(group_list)
rownames(annotation_col) <- colnames(heatdata)
pheatmap(heatdata, 
         cluster_rows = TRUE,
         cluster_cols = TRUE,
         annotation_col =annotation_col, 
         annotation_legend=TRUE, 
         show_rownames = F,
         show_colnames = T,
         scale = 'row', 
         color =colorRampPalette(c('blue', 'white','red'))(100))

# VOLCANO PLOT ----
library(ggplot2)
library(ggrepel)
library(dplyr)

data <- topTable(fit,adjust='BH',coef='group_listnormal',number=Inf) 
data$significant <- as.factor(data$adj.P.Val<0.05 & abs(data$logFC) > 1)
data$gene <- rownames(data)
 
ggplot(data=data, aes(x=logFC, y =-log10(adj.P.Val),color=significant)) +
  geom_point(alpha=0.8, size=1.2,col='black')+
  geom_point(data=subset(data, logFC > 1),alpha=0.8, size=1.2,col='red')+
  geom_point(data=subset(data, logFC < -1),alpha=0.6, size=1.2,col='blue')+
  labs(x='log2 (fold change)',y='-log10 (adj.P.Val)')+
  theme(plot.title = element_text(hjust = 0.4))+
  geom_hline(yintercept = -log10(0.05),lty=4,lwd=0.6,alpha=0.8)+
  geom_vline(xintercept = c(1,-1),lty=4,lwd=0.6,alpha=0.8)+
  theme_bw()+
  theme(panel.border = element_blank(),
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),   
        axis.line = element_line(colour = 'black'))

# ENRICH 
library(clusterProfiler)
gene <- rownames(allDiff_pair)
gene <- bitr(gene, fromType='SYMBOL', toType='ENTREZID', OrgDb='org.Hs.eg.db')
de <- gene$ENTREZID
go <- enrichGO(gene = de, OrgDb = 'org.Hs.eg.db', ont='all')
library(ggplot2)
dotplot(go, split='ONTOLOGY') +facet_grid(ONTOLOGY~., scale='free')

EGG <- enrichKEGG(gene= gene$ENTREZID,
                  organism     = 'hsa',
                  pvalueCutoff = 0.05)
dotplot(EGG)

1
2
3

test <- data.frame(EGG)
head(test)[,1:7]
browseKEGG(EGG, 'hsa04151')