肿瘤基因组分析教程:一、基础命令行操作

肿瘤基因组测序的变异检测结果通常以VCF格式存储。我们先下载一个VCF文件,对它进行一些基本的命令行操作,以获得对该格式的直观印象。参考教程:Introduction to Command Line。

# 以下为终端操作记录:以 $ 开头的行是输入的命令,其后为命令输出,# 之后为注释说明。
$ ls -lh snp.vcf
-rwxrwxrwx 1 eric eric 11M Nov 18 22:12 snp.vcf # 详细信息

$ wc -l snp.vcf
58314 snp.vcf # 行数,含注释行

$ head -5 snp.vcf
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele not already represented at this location by REF and ALT">
##FILTER=<ID=LowQual,Description="Low quality">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
# 查看前5行

# 由于VCF头部有大量以 # 开头的注释信息,在实际分析中,当我们需要查看主体内容时,要先把头部的注释行过滤掉。可以使用 grep -v(反向匹配,只保留不匹配的行)配合管道符 "|" 来实现:
$ grep -v "^#" snp.vcf | head

chr1 16495 . G C 36.65 . AC=1;AF=0.5;AN=2;BaseQRankSum=-0.967;CNN_1D=-4.292;DP=3;ExcessHet=3.0103;FS=0;MLEAC=1;MLEAF=0.5;MQ=22;MQRankSum=0;QD=12.22;ReadPosRankSum=0.967;SOR=1.179 GT:AD:DP:GQ:PL 0/1:1,2:3:18:44,0,18
chr1 17614 . G A 54.64 . AC=1;AF=0.5;AN=2;BaseQRankSum=0;CNN_1D=-0.693;DP=5;ExcessHet=3.0103;FS=0;MLEAC=1;MLEAF=0.5;MQ=47.88;MQRankSum=-1.645;QD=10.93;ReadPosRankSum=1.04;SOR=1.179 GT:AD:DP:GQ:PL 0/1:2,3:5:35:62,0,35
chr1 19322 . C T 44.64 . AC=1;AF=0.5;AN=2;BaseQRankSum=0.703;CNN_1D=-3.724;DP=8;ExcessHet=3.0103;FS=0;MLEAC=1;MLEAF=0.5;MQ=38.66;MQRankSum=-0.956;QD=5.58;ReadPosRankSum=0.414;SOR=0.818 GT:AD:DP:GQ:PGT:PID:PL:PS 0|1:5,3:8:52:0|1:19322_C_T:52,0,140:19322
chr1 19342 . G A 58.64 . AC=1;AF=0.5;AN=2;BaseQRankSum=1.65;CNN_1D=-3.127;DP=5;ExcessHet=3.0103;FS=0;MLEAC=1;MLEAF=0.5;MQ=41.16;MQRankSum=-0.524;QD=11.73;ReadPosRankSum=-1.036;SOR=1.609 GT:AD:DP:GQ:PGT:PID:PL:PS 0|1:3,2:5:66:1|0:19322_C_T:66,0,116:19322
chr1 29443 . A G 41.32 . AC=2;AF=1;AN=2;CNN_1D=-3.532;DP=3;ExcessHet=3.0103;FS=0;MLEAC=1;MLEAF=0.5;MQ=22;QD=20.66;SOR=2.303 GT:AD:DP:GQ:PL 1/1:0,2:2:6:53,6,0
chr1 63268 . T C 39.64 . AC=1;AF=0.5;AN=2;BaseQRankSum=0.309;CNN_1D=-2.694;DP=11;ExcessHet=3.0103;FS=0;MLEAC=1;MLEAF=0.5;MQ=55.09;MQRankSum=-2.744;QD=3.6;ReadPosRankSum=-0.381;SOR=1.179 GT:AD:DP:GQ:PL 0/1:8,3:11:47:47,0,169
chr1 63643 . A G 31.64 . AC=1;AF=0.5;AN=2;BaseQRankSum=0.175;CNN_1D=-4.388;DP=14;ExcessHet=3.0103;FS=0;MLEAC=1;MLEAF=0.5;MQ=39.49;MQRankSum=-0.082;QD=2.26;ReadPosRankSum=0.468;SOR=1.093 GT:AD:DP:GQ:PL 0/1:11,3:14:39:39,0,290
chr1 69511 . A G 191.97 . AC=2;AF=1;AN=2;CNN_1D=2.198;DP=6;ExcessHet=3.0103;FS=0;MLEAC=2;MLEAF=1;MQ=43.45;QD=32;SOR=0.693 GT:AD:DP:GQ:PL 1/1:0,6:6:18:206,18,0
chr1 129285 . G A 131.64 . AC=1;AF=0.5;AN=2;BaseQRankSum=-0.074;CNN_1D=-4.923;DP=28;ExcessHet=3.0103;FS=1.617;MLEAC=1;MLEAF=0.5;MQ=35.11;MQRankSum=-1.595;QD=4.7;ReadPosRankSum=0.025;SOR=0.694 GT:AD:DP:GQ:PL 0/1:19,9:28:99:139,0,390
chr1 129315 . A G 44.64 . AC=1;AF=0.5;AN=2;BaseQRankSum=0;CNN_1D=-5.233;DP=24;ExcessHet=3.0103;FS=2.114;MLEAC=1;MLEAF=0.5;MQ=37.02;MQRankSum=-1.323;QD=1.86;ReadPosRankSum=1.92;SOR=0.768 GT:AD:DP:GQ:PL 0/1:19,5:24:52:52,0,415

# 因此,我们可以轻松计算VCF文件主体内容有多少行,如,
$ grep -v "^#" snp.vcf | wc -l
# 57817

# 若只需要看其中一列,则可以结合cut命令,
$ grep -v "^#" snp.vcf | cut -f 1 | head # 第一列
chr1
chr1
chr1
chr1
chr1
chr1
chr1
chr1
chr1
chr1

$ grep -v "^#" snp.vcf | cut -f 2 | head # 第二列
16495
17614
19322
19342
29443
63268
63643
69511
129285
129315

# 首列是染色体编号。我们知道,人类有22对常染色体(chr1–chr22)和1对性染色体(女性为XX,男性为XY)。如果我们想知道VCF文件中都包含哪些染色体,可以通过sort命令来实现:

sort -u (-u, --unique, with -c, check for strict ordering; without -c, output only the first of an equal run) # sort -u 是指去掉重复,每个ID只列举一次

$ grep -v "^#" snp.vcf | cut -f 1 | sort -u
chr1
chr10
chr11
chr11_JH159137v1_alt
chr11_KI270831v1_alt
chr11_KI270832v1_alt
chr11_KI270902v1_alt
chr11_KI270903v1_alt
chr11_KI270927v1_alt
chr12
chr12_KI270835v1_alt
chr13
chr13_KI270843v1_alt
chr14
chr14_GL000009v2_random
chr14_GL000194v1_random
chr14_GL000225v1_random
chr14_KI270722v1_random
chr14_KI270723v1_random
chr14_KI270724v1_random
chr14_KI270725v1_random
chr14_KI270726v1_random
chr14_KI270845v1_alt
chr14_KI270846v1_alt
chr15
chr15_KI270848v1_alt
chr15_KI270850v1_alt
chr15_KI270851v1_alt
chr15_KI270905v1_alt
chr16
chr16_KI270728v1_random
chr16_KI270853v1_alt
chr17
chr17_GL000205v2_random
chr17_GL000258v2_alt
chr17_GL383563v3_alt
chr17_JH159146v1_alt
chr17_JH159147v1_alt
chr17_JH159148v1_alt
chr17_KI270729v1_random
chr17_KI270730v1_random
chr17_KI270857v1_alt
chr17_KI270860v1_alt
chr17_KI270908v1_alt
chr17_KI270909v1_alt
chr18
chr18_GL383567v1_alt
chr19
chr19_GL949753v2_alt
chr1_KI270706v1_random
chr1_KI270709v1_random
chr1_KI270710v1_random
chr1_KI270711v1_random
chr1_KI270712v1_random
chr1_KI270713v1_random
chr1_KI270714v1_random
chr1_KI270766v1_alt
chr2
chr20
chr21
chr22
chr22_KI270731v1_random
chr22_KI270732v1_random
chr22_KI270733v1_random
chr22_KI270734v1_random
chr22_KI270735v1_random
chr22_KI270736v1_random
chr22_KI270875v1_alt
chr22_KI270878v1_alt
chr22_KI270879v1_alt
chr22_KI270928v1_alt
chr2_KI270894v1_alt
chr3
chr3_GL000221v1_random
chr3_KI270779v1_alt
chr3_KI270924v1_alt
chr4
chr4_GL000008v2_random
chr5
chr5_GL000208v1_random
chr5_GL339449v2_alt
chr5_KI270792v1_alt
chr5_KI270898v1_alt
chr6
chr6_GL000251v2_alt
chr6_GL000252v2_alt
chr6_GL000253v2_alt
chr6_GL000254v2_alt
chr6_GL000255v2_alt
chr6_GL000256v2_alt
chr7
chr7_KI270803v1_alt
chr7_KI270809v1_alt
chr8
chr8_KI270813v1_alt
chr8_KI270821v1_alt
chr9
chr9_KI270718v1_random
chr9_KI270719v1_random
chr9_KI270720v1_random
chrM
chrUn_GL000214v1
chrUn_GL000216v2
chrUn_GL000218v1
chrUn_GL000219v1
chrUn_GL000220v1
chrUn_GL000224v1
chrUn_KI270322v1
chrUn_KI270330v1
chrUn_KI270435v1
chrUn_KI270438v1
chrUn_KI270442v1
chrUn_KI270466v1
chrUn_KI270467v1
chrUn_KI270507v1
chrUn_KI270516v1
chrUn_KI270519v1
chrUn_KI270538v1
chrUn_KI270742v1
chrUn_KI270743v1
chrUn_KI270744v1
chrUn_KI270745v1
chrUn_KI270746v1
chrUn_KI270747v1
chrUn_KI270749v1
chrUn_KI270750v1
chrUn_KI270754v1
chrX
chrX_KI270913v1_alt
chrY

# 更进一步,我们想知道每个染色体上有多少条记录,即有多少变异发生,可以结合sort与uniq来实现,

uniq -c, --count prefix lines by the number of occurrences # 计数;注意uniq只合并相邻的重复行,因此需要先用sort排序

$ grep -v "^#" snp.vcf | cut -f 1 | sort | uniq -c
5843 chr1
2630 chr10
3199 chr11
2 chr11_JH159137v1_alt
1 chr11_KI270831v1_alt
1 chr11_KI270832v1_alt
19 chr11_KI270902v1_alt
2 chr11_KI270903v1_alt
13 chr11_KI270927v1_alt
2793 chr12
8 chr12_KI270835v1_alt
1361 chr13
1 chr13_KI270843v1_alt
1605 chr14
10 chr14_GL000009v2_random
28 chr14_GL000194v1_random
103 chr14_GL000225v1_random
3 chr14_KI270722v1_random
13 chr14_KI270723v1_random
13 chr14_KI270724v1_random
12 chr14_KI270725v1_random
6 chr14_KI270726v1_random
1 chr14_KI270845v1_alt
19 chr14_KI270846v1_alt
2004 chr15
3 chr15_KI270848v1_alt
4 chr15_KI270850v1_alt
6 chr15_KI270851v1_alt
11 chr15_KI270905v1_alt
2304 chr16
48 chr16_KI270728v1_random
10 chr16_KI270853v1_alt
2736 chr17
32 chr17_GL000205v2_random
1 chr17_GL000258v2_alt
1 chr17_GL383563v3_alt
2 chr17_JH159146v1_alt
1 chr17_JH159147v1_alt
1 chr17_JH159148v1_alt
17 chr17_KI270729v1_random
8 chr17_KI270730v1_random
6 chr17_KI270857v1_alt
1 chr17_KI270860v1_alt
1 chr17_KI270908v1_alt
3 chr17_KI270909v1_alt
994 chr18
2 chr18_GL383567v1_alt
3108 chr19
1 chr19_GL949753v2_alt
20 chr1_KI270706v1_random
20 chr1_KI270709v1_random
2 chr1_KI270710v1_random
36 chr1_KI270711v1_random
3 chr1_KI270712v1_random
33 chr1_KI270713v1_random
6 chr1_KI270714v1_random
8 chr1_KI270766v1_alt
4290 chr2
1505 chr20
900 chr21
1565 chr22
12 chr22_KI270731v1_random
12 chr22_KI270732v1_random
3 chr22_KI270733v1_random
6 chr22_KI270734v1_random
20 chr22_KI270735v1_random
1 chr22_KI270736v1_random
2 chr22_KI270875v1_alt
1 chr22_KI270878v1_alt
10 chr22_KI270879v1_alt
3 chr22_KI270928v1_alt
10 chr2_KI270894v1_alt
3226 chr3
18 chr3_GL000221v1_random
11 chr3_KI270779v1_alt
1 chr3_KI270924v1_alt
2600 chr4
6 chr4_GL000008v2_random
2400 chr5
1 chr5_GL000208v1_random
4 chr5_GL339449v2_alt
2 chr5_KI270792v1_alt
1 chr5_KI270898v1_alt
2483 chr6
22 chr6_GL000251v2_alt
5 chr6_GL000252v2_alt
10 chr6_GL000253v2_alt
3 chr6_GL000254v2_alt
14 chr6_GL000255v2_alt
7 chr6_GL000256v2_alt
3276 chr7
11 chr7_KI270803v1_alt
3 chr7_KI270809v1_alt
2007 chr8
5 chr8_KI270813v1_alt
4 chr8_KI270821v1_alt
2822 chr9
1 chr9_KI270718v1_random
35 chr9_KI270719v1_random
7 chr9_KI270720v1_random
16 chrM
5 chrUn_GL000214v1
17 chrUn_GL000216v2
17 chrUn_GL000218v1
61 chrUn_GL000219v1
1 chrUn_GL000220v1
5 chrUn_GL000224v1
1 chrUn_KI270322v1
2 chrUn_KI270330v1
2 chrUn_KI270435v1
40 chrUn_KI270438v1
59 chrUn_KI270442v1
4 chrUn_KI270466v1
13 chrUn_KI270467v1
1 chrUn_KI270507v1
4 chrUn_KI270516v1
6 chrUn_KI270519v1
1 chrUn_KI270538v1
25 chrUn_KI270742v1
8 chrUn_KI270743v1
14 chrUn_KI270744v1
3 chrUn_KI270745v1
47 chrUn_KI270746v1
1 chrUn_KI270747v1
3 chrUn_KI270749v1
7 chrUn_KI270750v1
11 chrUn_KI270754v1
871 chrX
2 chrX_KI270913v1_alt
126 chrY

# 由于涉及太多染色体,上面这个列表显得杂乱。我们可以根据每条染色体上的变异数量重新排序,即按降序排列。由于sort默认按升序排列,因此需要使用sort -n -r命令:

-n, --numeric-sort compare according to string numerical value # 按数字排列,默认升序
-r, --reverse reverse the result of comparisons # 倒序排列

$ grep -v "^#" snp.vcf | cut -f 1 | sort | uniq -c | sort -n -r
5843 chr1
4290 chr2
3276 chr7
3226 chr3
3199 chr11
3108 chr19
2822 chr9
2793 chr12
2736 chr17
2630 chr10
2600 chr4
2483 chr6
2400 chr5
2304 chr16
2007 chr8
2004 chr15
1605 chr14
1565 chr22
1505 chr20
1361 chr13
994 chr18
900 chr21
871 chrX
126 chrY
103 chr14_GL000225v1_random
61 chrUn_GL000219v1
59 chrUn_KI270442v1
48 chr16_KI270728v1_random
47 chrUn_KI270746v1
40 chrUn_KI270438v1
36 chr1_KI270711v1_random
35 chr9_KI270719v1_random
33 chr1_KI270713v1_random
32 chr17_GL000205v2_random
28 chr14_GL000194v1_random
25 chrUn_KI270742v1
22 chr6_GL000251v2_alt
20 chr22_KI270735v1_random
20 chr1_KI270709v1_random
20 chr1_KI270706v1_random
19 chr14_KI270846v1_alt
19 chr11_KI270902v1_alt
18 chr3_GL000221v1_random
17 chrUn_GL000218v1
17 chrUn_GL000216v2
17 chr17_KI270729v1_random
16 chrM
14 chrUn_KI270744v1
14 chr6_GL000255v2_alt
13 chrUn_KI270467v1
13 chr14_KI270724v1_random
13 chr14_KI270723v1_random
13 chr11_KI270927v1_alt
12 chr22_KI270732v1_random
12 chr22_KI270731v1_random
12 chr14_KI270725v1_random
11 chrUn_KI270754v1
11 chr7_KI270803v1_alt
11 chr3_KI270779v1_alt
11 chr15_KI270905v1_alt
10 chr6_GL000253v2_alt
10 chr2_KI270894v1_alt
10 chr22_KI270879v1_alt
10 chr16_KI270853v1_alt
10 chr14_GL000009v2_random
8 chrUn_KI270743v1
8 chr1_KI270766v1_alt
8 chr17_KI270730v1_random
8 chr12_KI270835v1_alt
7 chrUn_KI270750v1
7 chr9_KI270720v1_random
7 chr6_GL000256v2_alt
6 chrUn_KI270519v1
6 chr4_GL000008v2_random
6 chr22_KI270734v1_random
6 chr1_KI270714v1_random
6 chr17_KI270857v1_alt
6 chr15_KI270851v1_alt
6 chr14_KI270726v1_random
5 chrUn_GL000224v1
5 chrUn_GL000214v1
5 chr8_KI270813v1_alt
5 chr6_GL000252v2_alt
4 chrUn_KI270516v1
4 chrUn_KI270466v1
4 chr8_KI270821v1_alt
4 chr5_GL339449v2_alt
4 chr15_KI270850v1_alt
3 chrUn_KI270749v1
3 chrUn_KI270745v1
3 chr7_KI270809v1_alt
3 chr6_GL000254v2_alt
3 chr22_KI270928v1_alt
3 chr22_KI270733v1_random
3 chr1_KI270712v1_random
3 chr17_KI270909v1_alt
3 chr15_KI270848v1_alt
3 chr14_KI270722v1_random
2 chrX_KI270913v1_alt
2 chrUn_KI270435v1
2 chrUn_KI270330v1
2 chr5_KI270792v1_alt
2 chr22_KI270875v1_alt
2 chr1_KI270710v1_random
2 chr18_GL383567v1_alt
2 chr17_JH159146v1_alt
2 chr11_KI270903v1_alt
2 chr11_JH159137v1_alt
1 chrUn_KI270747v1
1 chrUn_KI270538v1
1 chrUn_KI270507v1
1 chrUn_KI270322v1
1 chrUn_GL000220v1
1 chr9_KI270718v1_random
1 chr5_KI270898v1_alt
1 chr5_GL000208v1_random
1 chr3_KI270924v1_alt
1 chr22_KI270878v1_alt
1 chr22_KI270736v1_random
1 chr19_GL949753v2_alt
1 chr17_KI270908v1_alt
1 chr17_KI270860v1_alt
1 chr17_JH159148v1_alt
1 chr17_JH159147v1_alt
1 chr17_GL383563v3_alt
1 chr17_GL000258v2_alt
1 chr14_KI270845v1_alt
1 chr13_KI270843v1_alt
1 chr11_KI270832v1_alt
1 chr11_KI270831v1_alt
# 这种排序方式明显更便于阅读。我们也可以发现,常染色体上的变异数远远多于性染色体,这是符合基本常识的,也是人类得以传承不断的根本。
  • 本文作者:括囊无誉
  • 本文链接: WES/cancer_seq01/
  • 版权声明: 本博客所有文章均为原创作品,转载请注明出处!
------ 本文结束 ------
坚持原创文章分享,您的支持将鼓励我继续创作!