GO数据库学习笔记(4)

上一次学习了什么DAGs,这次我想实践一下学习的东西。
题目:对于任意给出的属于某属的多个基因,用树状图表示这些基因的功能所属关系。
实例程序如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#!/usr/bin/perl
#使用DBD-mysql联接gene ontology数据库。
use DBI;
$username = 'go_select';$password = 'amigo';$database = 'go_latest';
$hostname = 'mysql.ebi.ac.uk';$port='4085';
$dbh = DBI->connect("dbi:mysql:database=$database;host=$hostname;port=$port", $username, $password) or exit(1);
#随机给出一些基因,基因的列表越长,所需时间自然也就越长,显示结果也需要太多的位置,
#所以这里只示例只有五个基因的情况。
#这些基因都来自酿酒酵母。
@gene_list =('AGA2','AIM1','BUD31','BUD32','BUD4');
#sql语句。从gene_product表(这个表内是基因和基因产物相关的记录)出发,
#以gene_product_id为关键字连接association表(gene_product与term的关系表,表中is_not是很有用的信息,如果它的值为真,则表示这个关系没有有效的证据支持。),
#以species_id为关键字连接species表,用species表中的genus和species项来确认数据是属于酿酒酵母的,
#用gene_product表中的symbol项来匹配基因列表中的基因名,用binary来表示基因名区分大小写,
#返回的结果只提取不重复的symbol(基因名),term_id(GO term ID)和product_count(该属中属于该term ID的所有基因的个数)。
$SQL = "SELECT DISTINCT g.symbol,t.term_id,t.product_count FROM gene_product AS g INNER JOIN association AS a ON (g.id=a.gene_product_id) INNER JOIN gene_product_count AS t ON (a.term_id=t.term_id) INNER JOIN species AS s ON (t.species_id=s.id) WHERE s.genus='Saccharomyces' AND s.species='cerevisiae'  AND a.is_not=0 AND BINARY g.symbol IN ('";
$SQL .=join ("','",@gene_list);
$SQL .= "')";
$sth = $dbh->prepare($SQL);
$sth->execute();
#把返回的结果全部都存到db二维数组当中去。
@db = ();
while (@ary = $sth->fetchrow_array()){
my %gene_entry = ("symbol"=>$ary[0],"term_id"=>$ary[1],"count"=>$ary[2]);
push @db, [$gene_entry{symbol},$gene_entry{term_id},$gene_entry{count}];
}
#对db二维数组当中的数据以term_id为索引做统计。
#用哈希表count_list保存某term_id下返回的基因总数(product_count)
%count_list=();
#用哈希表count保存这次所调查的基因中属于某term_id的基因数。
%count=();
#用哈希表gene_list保存属于某term_id的基因名。
%gene_list=();
$sql = "";
for $aref(@db){
$sql.="'@$aref[1]'," if ($sql !~ /@$aref[1]/);
$count_list{@$aref[1]}=@$aref[2];
$count{@$aref[1]}+=1;
$gene_list{@$aref[1]}.=@$aref[0].",";
}
#建立SQL语句,返回所有term_id对应的term_name。
$sql_t = "SELECT term.id, term.name FROM term WHERE term.id IN (".$sql."'1')";
#建立SQL语句,返回所有term_id的所属关系和步长。
$sql = "SELECT t.term1_id,t.term2_id,t.distance FROM graph_path AS t WHERE t.term1_id IN (".$sql."'1') AND t.term2_id IN (".$sql."'1') AND t.term1_id<>t.term2_id";
$sth_p = $dbh->prepare($sql);
$sth_p->execute();
#将返回的所属关系和步长保存到graph_path二维数组当中。
@graph_path=();
while(@ary=$sth_p->fetchrow_array()){
my %term2term=("term1_id"=>$ary[0],"term2_id"=>$ary[1],"distance"=>$ary[2]);
push @graph_path, [$term2term{term1_id},$term2term{term2_id},$term2term{distance}];
#  print join ("\t",@ary),"\n";
}
#对graph_path二维数组以步长升序排序。
@path_sort = sort {@$a[2] <=> @$b[2]} @graph_path;
$sth_t = $dbh->prepare($sql_t);
$sth_t->execute();
#将返回的term_id和term_name对应列表保存到哈希表term_list当中去。
%term_list=();
while(@ary=$sth_t->fetchrow_array()){
$term_list{$ary[0]}=$ary[1];
}
#对排序后的path_sort列表以term_id为关键字增加term_name,某term_id的基因数,某term_id下返回的基因总数(product_count),某term_id下的基因名,和一个脏标记。
for($i=0;$i<@path_sort;$i++){
$path_sort[$i][3]=$term_list{$path_sort[$i][0]};
$path_sort[$i][4]=$term_list{$path_sort[$i][1]};
$path_sort[$i][5]=$count{$path_sort[$i][1]};
$path_sort[$i][6]=$count_list{$path_sort[$i][1]};
$path_sort[$i][7]=$gene_list{$path_sort[$i][1]};
$path_sort[$i][8]=0;
}
#for $aref(@path_sort){
#   print "@$aref,\n";
#}
#对path_sort二维数组中的数据依据所属关系重新保存到tree_view表当中去
print "\nroot.\n";
&draw_path(1,0,\@path_sort,\@tree_view);
#合并统计相关数据
for $hit(@tree_view){
@daughter_gene_ary = split(/,/,$gene_list{$hit->{this_id}});
@parent_gene_ary = split(/,/,$gene_list{$hit->{parent_id}}) if($hit->{parent_id}!=1);
@parent_gene_ary = (@parent_gene_ary,@daughter_gene_ary);
%screen = ();
@parent_ary = grep (!$screen{$_}++, @parent_gene_ary);
$gene_list{$hit->{parent_id}}= join (",",@parent_ary);
for $par(@tree_view){
if ($par->{this_id}==$hit->{parent_id}){
$par->{genes_name}=$gene_list{$par->{this_id}};
$count{$par->{this_id}}=@parent_ary;
$par->{this_test}=$count{$par->{this_id}};
}
}
}
#画出树状图。
for $hit(@tree_view){
print " " x $hit->{pre_distance};
$temp_l=length($hit->{parent_name})-1;
print " " x $temp_l;
print "|";
print "-" x $hit->{this_distance};
print "$hit->{term_name}(step:$hit->{this_distance})($hit->{this_test}/$hit->{total})($hit->{genes_name})\n";
}
 
$sth_p->finish();
$sth_t->finish();
$sth->finish();
$dbh->disconnect();
print "\n\nthis is the END!";
 
#层进递归生成tree_view数组,用以生成树状图
sub draw_path{
my ($parent,$distance,$path_t,$tree_v)=@_;
my ($sep) = "-";
my ($pre) = $sep x $distance;
for(my $i=0;$i<@$path_t;$i++){
if($$path_t[$i][0]==$parent){
my $flag = 1;
$flag = 0 if(&get_step($$path_t[$i][0],$$path_t[$i][1],0,$$path_t[$i][2],$path_t));
if($flag){
$$path_t[$i][8]=1;
my %tree = ("parent_id"=>$$path_t[$i][0],"this_id"=>$$path_t[$i][1],"pre_distance"=>$distance,"this_distance"=>$$path_t[$i][2],"parent_name"=>$$path_t[$i][3],"term_name"=>$$path_t[$i][4],"this_test"=>$$path_t[$i][5],"total"=>$$path_t[$i][6],"genes_name"=>$$path_t[$i][7]);
push @$tree_v,\%tree;
my $out ="$pre$$path_t[$i][3]";
$out .=  $sep x $$path_t[$i][2];
draw_path($$path_t[$i][1],length($out),$path_t,$tree_v);
}
}
}
}
 
#防止tree_view当中产生重复数据的递归函数。
sub get_step{
my($parent,$daughter,$step,$step_t,$path_t)=@_;
my $flag;
for(my $i=0; $i<@$path_t;$i++){
if($$path_t[$i][8]&&$$path_t[$i][1]==$daughter){
$step += $$path_t[$i][2];
if($$path_t[$i][0]==$parent){
if ($step==$step_t){
return $flag=1;
}#if
else{
$step=0;
next;
}#else
}#if
elsif($$path_t[$i][0]==1){
$step=0;
next;
}#elsif
else{
if(get_step($parent,$$path_t[$i][0],$step,$step_t,$path_t)){
return $flag = 1;
}
else{
$step=0;
next;
}
}#else
}#if
}#for
return $flag;
}

两个递归没把我的头想破了……递归怎么就这么绕呢?从一开始C的时候学递归就学不利索。
现在看一下结果吧:

root.
|-molecular_function(step:1)(4/6353)(AIM1,BUD32,BUD31,AGA2)
|–nucleotide binding(step:2)(2/55)(BUD32,BUD4)
|—-ATP binding(step:4)(1/22)(BUD32,)
|—-GTP binding(step:4)(1/20)(BUD4,)
|–RNA splicing factor activity, transesterification mechanism(step:2)(1/64)(BUD31,)
|–transferase activity(step:2)(1/727)(BUD32)
|–kinase activity(step:2)(1/204)(BUD32)
|-protein kinase activity(step:1)(1/133)(BUD32)
|-protein serine/threonine kinase activity(step:1)(1/93)(BUD32,)
|-protein tyrosine kinase activity(step:1)(1/1)(BUD32,)
|—cell adhesion molecule binding(step:3)(1/3)(AGA2,)
|—sugar binding(step:3)(1/8)(AIM1,)
|—-metal ion binding(step:4)(1/39)(BUD31)
|–zinc ion binding(step:2)(1/7)(BUD31,)
|-cellular_component(step:1)(5/6353)(AIM1,BUD32,BUD31,BUD4,AGA2)
|—EKC/KEOPS protein complex(step:3)(1/5)(BUD32,)
|—cell wall(step:3)(2/103)(AIM1,AGA2)
|-fungal-type cell wall(step:1)(1/93)(AGA2,)
|—cytoplasm(step:3)(3/3767)(BUD32,AIM1,BUD4)
|—peroxisome(step:3)(1/62)(AIM1,)
|—-cellular bud neck contractile ring(step:4)(1/14)(BUD4,)
|—–cellular bud neck contractile ring(step:5)(1/14)(BUD4,)
|—spliceosomal complex(step:3)(1/53)(BUD31,)
|—cellular bud neck(step:3)(1/134)(BUD4)
|-cellular bud neck contractile ring(step:1)(1/14)(BUD4,)
|—-EKC/KEOPS protein complex(step:4)(1/5)(BUD32,)
|—-chromosome, telomeric region(step:4)(1/30)(BUD32,)
|—-cell wall(step:4)(2/103)(AIM1,AGA2)
|—-nucleus(step:4)(2/2047)(BUD31,BUD32)
|–spliceosomal complex(step:2)(1/53)(BUD31,)
|—U2 snRNP(step:3)(1/16)(BUD31,)
|—-cytoplasm(step:4)(3/3767)(BUD32,AIM1,BUD4)
|—-spliceosomal complex(step:4)(1/53)(BUD31,)
|—-U2 snRNP(step:4)(1/16)(BUD31,)
|—-chromosome(step:4)(1/325)(BUD32)
|–chromosome, telomeric region(step:2)(1/30)(BUD32,)
|—-cellular bud neck(step:4)(1/134)(BUD4)
|—–cellular bud neck contractile ring(step:5)(1/14)(BUD4,)
|—–EKC/KEOPS protein complex(step:5)(1/5)(BUD32,)
|—–chromosome, telomeric region(step:5)(1/30)(BUD32,)
|—–peroxisome(step:5)(1/62)(AIM1,)
|—–nucleus(step:5)(2/2047)(BUD31,BUD32)
|—–cytoplasm(step:5)(3/3767)(BUD32,AIM1,BUD4)
|—–chromosome(step:5)(1/325)(BUD32)
|—–fungal-type cell wall(step:5)(1/93)(AGA2,)
|——cellular bud neck contractile ring(step:6)(1/14)(BUD4,)
|——nucleus(step:6)(2/2047)(BUD31,BUD32)
|——chromosome(step:6)(1/325)(BUD32)
|——-chromosome, telomeric region(step:7)(1/30)(BUD32,)
|——-peroxisome(step:7)(1/62)(AIM1,)
|——-nucleus(step:7)(2/2047)(BUD31,BUD32)
|——-chromosome(step:7)(1/325)(BUD32)
|——–chromosome, telomeric region(step:8)(1/30)(BUD32,)
|——–peroxisome(step:8)(1/62)(AIM1,)
|——–spliceosomal complex(step:8)(1/53)(BUD31,)
|——–U2 snRNP(step:8)(1/16)(BUD31,)
|———cellular bud neck contractile ring(step:9)(1/14)(BUD4,)
|———chromosome, telomeric region(step:9)(1/30)(BUD32,)
|———spliceosomal complex(step:9)(1/53)(BUD31,)
|———U2 snRNP(step:9)(1/16)(BUD31,)
|———-cellular bud neck contractile ring(step:10)(1/14)(BUD4,)
|———-U2 snRNP(step:10)(1/16)(BUD31,)
|-biological_process(step:1)(5/6353)(AIM1,AGA2,BUD31,BUD4,BUD32)
|–conjugation(step:2)(1/126)(AGA2)
|–agglutination involved in conjugation with cellular fusion(step:2)(1/4)(AGA2,)
|—agglutination involved in conjugation with cellular fusion(step:3)(1/4)(AGA2,)
|–cell adhesion(step:2)(1/8)(AGA2)
|—-agglutination involved in conjugation with cellular fusion(step:4)(1/4)(AGA2,)
|–cell cycle(step:2)(2/487)(BUD31,BUD4,)
|–sexual reproduction(step:2)(1/233)(AGA2)
|—agglutination involved in conjugation with cellular fusion(step:3)(1/4)(AGA2,)
|–cell division(step:2)(3/131)(BUD4,BUD31,BUD32)
|—-cellular bud site selection(step:4)(3/67)(BUD31,BUD32,BUD4)
|-axial cellular bud site selection(step:1)(1/21)(BUD4,)
|—response to pheromone(step:3)(1/97)(AGA2)
|–agglutination involved in conjugation with cellular fusion(step:2)(1/4)(AGA2,)
|—cellular metabolic compound salvage(step:3)(1/158)(BUD32,)
|—-cellular bud site selection(step:4)(3/67)(BUD31,BUD32,BUD4)
|—-transcription(step:4)(1/653)(BUD32)
|–regulation of transcription, DNA-dependent(step:2)(1/399)(BUD32)
|–positive regulation of transcription from RNA polymerase II promoter(step:2)(1/93)(BUD32,)
|—positive regulation of transcription from RNA polymerase II promoter(step:3)(1/93)(BUD32,)
|—-axial cellular bud site selection(step:4)(1/21)(BUD4,)
|—–telomere maintenance(step:5)(1/69)(BUD32,)
|—–regulation of transcription, DNA-dependent(step:5)(1/399)(BUD32)
|—–RNA splicing(step:5)(1/137)(BUD31)
|—nuclear mRNA splicing, via spliceosome(step:3)(1/105)(BUD31,)
|—–RNA modification(step:5)(1/296)(BUD31,)
|—–mRNA metabolic process(step:5)(1/233)(BUD31)
|–nuclear mRNA splicing, via spliceosome(step:2)(1/105)(BUD31,)
|——nuclear mRNA splicing, via spliceosome(step:6)(1/105)(BUD31,)
|——telomere maintenance(step:6)(1/69)(BUD32,)
|——transcription(step:6)(1/653)(BUD32)
|——tRNA metabolic process(step:6)(1/213)(BUD31,)
|——protein amino acid phosphorylation(step:6)(1/109)(BUD32,)
|——RNA splicing(step:6)(1/137)(BUD31)
|——RNA modification(step:6)(1/296)(BUD31,)
|——mRNA metabolic process(step:6)(1/233)(BUD31)
|——-cellular bud site selection(step:7)(3/67)(BUD31,BUD32,BUD4)
|——-regulation of transcription, DNA-dependent(step:7)(1/399)(BUD32)
|——-tRNA metabolic process(step:7)(1/213)(BUD31,)
|——-protein amino acid phosphorylation(step:7)(1/109)(BUD32,)
|——-RNA splicing(step:7)(1/137)(BUD31)
|——–cellular bud site selection(step:8)(3/67)(BUD31,BUD32,BUD4)
|——–regulation of transcription, DNA-dependent(step:8)(1/399)(BUD32)
|——–protein amino acid phosphorylation(step:8)(1/109)(BUD32,)
|———nuclear mRNA splicing, via spliceosome(step:9)(1/105)(BUD31,)
|———axial cellular bud site selection(step:9)(1/21)(BUD4,)
|———positive regulation of transcription from RNA polymerase II promoter(step:9)(1/93)(BUD32,)
|———-nuclear mRNA splicing, via spliceosome(step:10)(1/105)(BUD31,)
|———-positive regulation of transcription from RNA polymerase II promoter(step:10)(1/93)(BUD32,)

this is the END!

结果的显示当中,前面有term_name,然后就是和上一层的步长距离(表示实际上未显示的还有多少层),然后是这次的基因数统计/该类中的总基因数,最后是该term_name下的基因名称。
一切OK。 GO数据库就学到这里了。当然还有很多没有学,比如说如何利用GO数据库和别的数据库联合分析数据。这个问题以后有机会再讨论。下一步,也许是MIPS(institute for bioinformatics and systems biology)数据库。这个数据库可能得换java了。

发表评论

电子邮件地址不会被公开。 必填项已用*标注