TCGA学习笔记4-样品分类

前面,我们从metadata中提取了样品的信息,并替换了表达数据中的列名。接下来还需要做两件事:一是对基因的编号进行处理,二是根据列名(样本条形码)将样品分类。

首先,处理基因的编号

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
> genename <- rownames(data) # 提取data中的60482个基因名,赋值给genename
> head(genename)
[1] "ENSG00000000005.5" "ENSG00000000419.11" "ENSG00000000457.12" "ENSG00000000460.15"
[5] "ENSG00000000938.11" "ENSG00000000971.14"
> length(genename)
[1] 60482

> gene_no_point <- gsub("\\.(\\.?\\d*)","",genename) # 去除小数点及其后面的版本号(如".5"),只保留基因编号本身,方便后面作注释
> head(gene_no_point)
[1] "ENSG00000000005" "ENSG00000000419" "ENSG00000000457" "ENSG00000000460"
[5] "ENSG00000000938" "ENSG00000000971"
> length(gene_no_point)
[1] 60482

> rownames(data) <- gene_no_point # 用去掉版本号的基因编号替换data原来的行名
> head(data)[1:4]
TCGA-CZ-5465-01A-01R-1503 TCGA-BP-4355-01A-01R-1289
ENSG00000000005 33 29
ENSG00000000419 2501 1427
ENSG00000000457 1167 1081
ENSG00000000460 348 247
ENSG00000000938 899 1211
ENSG00000000971 1460 4505
TCGA-CZ-5451-01A-01R-1503 TCGA-B0-5081-01A-01R-1334
ENSG00000000005 83 77
ENSG00000000419 967 1300
ENSG00000000457 574 796
ENSG00000000460 162 321
ENSG00000000938 1100 2065
ENSG00000000971 936 6501

下面,对样本进行分类

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
> sampleName <- colnames(data) # 提取data的列名,其实就是样本名,来源是metadata
> head(sampleName)
[1] "TCGA-CZ-5465-01A-01R-1503" "TCGA-BP-4355-01A-01R-1289" "TCGA-CZ-5451-01A-01R-1503"
[4] "TCGA-B0-5081-01A-01R-1334" "TCGA-CZ-5454-11A-01R-1503" "TCGA-B0-5697-01A-11R-1541"
> sampleGroup <- substr(sampleName,14,15) # 提取样本名的第14-15位字符,即TCGA条形码中的样本类型编码
> table(sampleGroup)
sampleGroup # 看一下这个分类,538个01(Primary Solid Tumor),72个11(Solid Tissue Normal),1个05(Additional - New Primary)
01 05 11
538 1 72
> sampleGroup
[1] "01" "01" "01" "01" "11" "01" "01" "01" "01" "11" "01" "01" "01" "01" "01" "01"
[17] "01" "01" "01" "01" "11" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01"
[33] "01" "01" "01" "01" "01" "01" "01" "01" "11" "01" "01" "11" "11" "11" "01" "01"
[49] "01" "11" "01" "01" "01" "01" "01" "01" "11" "01" "01" "01" "11" "01" "01" "01"
[65] "01" "01" "01" "01" "01" "01" "01" "11" "01" "01" "01" "01" "01" "01" "01" "01"
[81] "11" "01" "11" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "11"
[97] "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "11" "01"
[113] "01" "01" "01" "11" "01" "01" "01" "01" "01" "11" "01" "01" "01" "01" "01" "11"
[129] "01" "11" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01"
[145] "01" "01" "01" "01" "01" "11" "01" "01" "01" "01" "01" "01" "01" "01" "11" "01"
[161] "01" "01" "01" "11" "01" "01" "01" "01" "11" "01" "01" "01" "01" "01" "01" "01"
[177] "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "11" "01" "01" "01" "01"
[193] "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01"
[209] "01" "01" "01" "01" "01" "01" "01" "11" "01" "01" "01" "01" "01" "01" "01" "01"
[225] "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01"
[241] "01" "01" "01" "01" "01" "01" "11" "01" "11" "01" "01" "01" "01" "01" "01" "01"
[257] "01" "01" "01" "01" "11" "01" "01" "11" "01" "01" "01" "01" "01" "01" "01" "01"
[273] "01" "01" "01" "01" "01" "01" "01" "01" "01" "11" "01" "01" "01" "01" "01" "01"
[289] "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "11" "01" "01" "01" "01" "01"
[305] "11" "01" "01" "01" "01" "11" "01" "01" "01" "11" "01" "01" "01" "01" "01" "01"
[321] "01" "01" "01" "11" "11" "01" "01" "01" "11" "01" "11" "11" "01" "01" "01" "11"
[337] "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "11" "01" "01"
[353] "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "11" "01"
[369] "11" "11" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "11" "01" "01" "01"
[385] "11" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "05" "01"
[401] "01" "01" "01" "11" "01" "01" "01" "01" "01" "11" "11" "11" "01" "01" "01" "01"
[417] "01" "01" "01" "01" "01" "01" "01" "01" "11" "01" "01" "01" "01" "01" "01" "01"
[433] "01" "01" "01" "01" "01" "01" "01" "01" "11" "01" "11" "01" "01" "01" "01" "01"
[449] "01" "11" "01" "01" "01" "01" "01" "01" "11" "01" "01" "01" "01" "01" "01" "01"
[465] "11" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "11" "11"
[481] "01" "01" "01" "01" "01" "11" "11" "11" "01" "01" "01" "01" "01" "01" "01" "01"
[497] "01" "11" "01" "01" "01" "01" "01" "11" "01" "01" "01" "01" "01" "01" "01" "01"
[513] "01" "01" "01" "01" "01" "01" "01" "01" "11" "01" "01" "01" "01" "01" "01" "01"
[529] "01" "01" "01" "01" "01" "01" "01" "11" "01" "01" "01" "11" "01" "01" "01" "01"
[545] "01" "01" "01" "01" "01" "01" "11" "11" "01" "11" "01" "01" "01" "01" "01" "01"
[561] "11" "01" "01" "01" "01" "01" "01" "01" "11" "11" "01" "01" "01" "01" "01" "01"
[577] "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01"
[593] "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01"
[609] "01" "01" "01"

> group <- ifelse(as.numeric(sampleGroup)<10,1,0) # 根据TCGA的说明,样本类型编码01-09代表肿瘤,10-19代表正常,20-29代表对照;因此将编码小于10的样品定义为肿瘤(1),其余定义为正常(0)。注意:这里把所有不小于10的编码都归为正常,本数据中只有11,若数据含20-29的对照样本需另行处理
> table(group)
group # 0代表正常样本,72个;1代表肿瘤,539个
0 1
72 539
> group
[1] 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1
[43] 1 0 0 0 1 1 1 0 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 0 1
[85] 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1
[127] 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1
[169] 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[211] 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1
[253] 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1
[295] 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 0 0 1 1 1 0 1 0 0 1 1 1 0
[337] 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 1 1 1 1 1 1 1
[379] 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1
[421] 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1
[463] 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0
[505] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1
[547] 1 1 1 1 0 0 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[589] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

> group <- factor(group,levels=c(0,1),labels=c("normal","cancer")) # 加上标签
> table(group)
group
normal cancer
72 539
> group
[1] cancer cancer cancer cancer normal cancer cancer cancer cancer normal cancer cancer
[13] cancer cancer cancer cancer cancer cancer cancer cancer normal cancer cancer cancer
[25] cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer
[37] cancer cancer cancer cancer normal cancer cancer normal normal normal cancer cancer
[49] cancer normal cancer cancer cancer cancer cancer cancer normal cancer cancer cancer
[61] normal cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer normal
[73] cancer cancer cancer cancer cancer cancer cancer cancer normal cancer normal cancer
[85] cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer normal
[97] cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer
[109] cancer cancer normal cancer cancer cancer cancer normal cancer cancer cancer cancer
[121] cancer normal cancer cancer cancer cancer cancer normal cancer normal cancer cancer
[133] cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer
[145] cancer cancer cancer cancer cancer normal cancer cancer cancer cancer cancer cancer
[157] cancer cancer normal cancer cancer cancer cancer normal cancer cancer cancer cancer
[169] normal cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer
[181] cancer cancer cancer cancer cancer cancer cancer normal cancer cancer cancer cancer
[193] cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer
[205] cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer normal
[217] cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer
[229] cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer
[241] cancer cancer cancer cancer cancer cancer normal cancer normal cancer cancer cancer
[253] cancer cancer cancer cancer cancer cancer cancer cancer normal cancer cancer normal
[265] cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer
[277] cancer cancer cancer cancer cancer normal cancer cancer cancer cancer cancer cancer
[289] cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer normal cancer
[301] cancer cancer cancer cancer normal cancer cancer cancer cancer normal cancer cancer
[313] cancer normal cancer cancer cancer cancer cancer cancer cancer cancer cancer normal
[325] normal cancer cancer cancer normal cancer normal normal cancer cancer cancer normal
[337] cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer
[349] cancer normal cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer
[361] cancer cancer cancer cancer cancer cancer normal cancer normal normal cancer cancer
[373] cancer cancer cancer cancer cancer cancer cancer cancer normal cancer cancer cancer
[385] normal cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer
[397] cancer cancer cancer cancer cancer cancer cancer normal cancer cancer cancer cancer
[409] cancer normal normal normal cancer cancer cancer cancer cancer cancer cancer cancer
[421] cancer cancer cancer cancer normal cancer cancer cancer cancer cancer cancer cancer
[433] cancer cancer cancer cancer cancer cancer cancer cancer normal cancer normal cancer
[445] cancer cancer cancer cancer cancer normal cancer cancer cancer cancer cancer cancer
[457] normal cancer cancer cancer cancer cancer cancer cancer normal cancer cancer cancer
[469] cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer normal normal
[481] cancer cancer cancer cancer cancer normal normal normal cancer cancer cancer cancer
[493] cancer cancer cancer cancer cancer normal cancer cancer cancer cancer cancer normal
[505] cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer
[517] cancer cancer cancer cancer normal cancer cancer cancer cancer cancer cancer cancer
[529] cancer cancer cancer cancer cancer cancer cancer normal cancer cancer cancer normal
[541] cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer normal normal
[553] cancer normal cancer cancer cancer cancer cancer cancer normal cancer cancer cancer
[565] cancer cancer cancer cancer normal normal cancer cancer cancer cancer cancer cancer
[577] cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer
[589] cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer
[601] cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer cancer
Levels: normal cancer

> colData <- data.frame(group=group)
> head(colData)
group
1 cancer
2 cancer
3 cancer
4 cancer
5 normal
6 cancer
> nrow(colData)
[1] 611

> rownames(colData) <- colnames(data)
> head(colData)
group
TCGA-CZ-5465-01A-01R-1503 cancer
TCGA-BP-4355-01A-01R-1289 cancer
TCGA-CZ-5451-01A-01R-1503 cancer
TCGA-B0-5081-01A-01R-1334 cancer
TCGA-CZ-5454-11A-01R-1503 normal
TCGA-B0-5697-01A-11R-1541 cancer

> data_select <- data[rowSums(data>20)>ncol(data)/2,] # 只保留在超过一半样本中计数大于20的基因,去掉表达太低的行;阈值(20和一半样本)可自定义
> dim(data)
[1] 60482 611
> dim(data_select)
[1] 18129 611 # 从开始的60482个基因,到最后的18129个基因

至此,样本的分类及处理就完成了,下一步就可以分析基因的表达差异了。

  • 本文作者:括囊无誉
  • 本文链接: TCGA/TCGA4SampleGroup/
  • 版权声明: 本博客所有文章均为原创作品,转载请注明出处!
------ 本文结束 ------
坚持原创文章分享,您的支持将鼓励我继续创作!