GO数据库学习笔记(2)

这次要对数据进行处理了。原代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
#!/usr/bin/perl
#从键盘输入数据文件名并打开文件。其中<STDIN>指的就是标准输入设备。
print "please input the file name\n";
$filename = <STDIN>;
open(MICROARRAY, $filename)||die ("Could not open file");
#将文件一次性读入内存。
@whole_file = <MICROARRAY>;
#统计文件一共有多少行
$rows = scalar(@whole_file);
print "there are $rows entries\n";
#统计文件一共有多少以TAB为间隔的列。
$pattern ="\t";
@cells = split(/$pattern/, $whole_file[0]);
$cols = scalar(@cells);
print "there are $cols columns\n";
#设置计入统计的表达差异的比例。
$times = 1;
#设置脏标记用来退出程序。
$flag = 1;
while($flag==1){
#用first_col和second_col来设置需要比较的列。
$first_col=0;
$second_col=0;
print "please set the columns you want to compare\n";
while ($first_col<=0 || $first_col>=$cols){
print "the first column is:\n";
$first_col = <STDIN>;
}
$first_col--;
 
while ($second_col<=0 || $second_col>=$cols){
print "the second column is:\n";
$second_col = <STDIN>;
}
$second_col--;
#
$up_reg = 0;
$down_reg = 0;
@total_hash = ();
@go_hash = ();
$j = 0;
print "gene_name\texp_ratio\n";
#对文件进行逐行处理。
for($i=0;$i<$rows;$i++){
#对每一行的内容按TAB间隔存到数组words当中去。
@words = split(/$pattern/, $whole_file[$i]);
next if($words[$first_col]<=0 || $words[$second_col]<=0);
$ratio = log($words[$first_col])- log( $words[$second_col]);
#比较需要对比的两数据的差异,当差异大于两倍时进行处理。
if(abs($ratio)>=$times){
#对最后一列的注释进行处理,以空隔为分隔把内容存到description数组里去。
#当数据差异大于两倍时分为上调和下调分别进行统计。
@description = split(/\s+/,$words[$cols-1]);
my %reg_gene = ("gene_name"=>$description[2],"exp_ratio"=>$ratio);
push @total_hash, \%reg_gene;
printf ("%s\t%8.4f\n",$reg_gene{gene_name},$reg_gene{exp_ratio});
$j++;
$up_reg++ if($ratio>=$times);
$down_reg++ if($ratio<=-$times);
#查找GO信息,提取goid,并把每一个goid和对应的基因名存入哈希表go_hash当中去。
while($words[$cols-1]=~/\[goid\s(GO:\w+)\]/g){
$go_id = $1;
my %go = ("gene_name"=>$description[2],"exp_ratio"=>$ratio,"go_id"=>$go_id);
push @go_hash, \%go;
}
}
}
print "totally $j genes affected\n";
print $up_reg." genes up regulated. And ".$down_reg." genes down regulated.\n";
 
print "the GO info list as follows\n";
print "go_id\tgene_name\n";
#对存有go_id的哈希表go_hash进行排序,便于统计每个go_id一共有多少基因关联。
@go_sort = sort {$a->{go_id} cmp $b->{go_id}} @go_hash;
#统计一共有多少个go_id。
$sample_n = scalar(@go_sort);
#从GO数据库调取库中酿酒酵母的基因总数
$sample_N = &get_N();
#准备上调基因和下调基因哈希表。
%go_list_up = ();
%go_list_down = ();
#对每一个go_id进行统计
for($i=0;$i<$sample_n;$i++){
$tmp = $go_sort[$i];
%path_t = %$tmp;
#如果是上调,则把它压入上调的哈希统计表当中去,反之亦然。
$go_list_up{$path_t{go_id}} +=1 if ($path_t{exp_ratio}>=$times);
$go_list_down{$path_t{go_id}} +=1 if ($path_t{exp_ratio}<=-$times);
print "$path_t{go_id}\t$path_t{gene_name}\n";
}
#计算并显示所有上调和下调的GO统计。
&go_sql(1,$sample_N,$j,%go_list_up);
&go_sql(0,$sample_N,$j,%go_list_down);
 
print "\n\n0 for exit\n1 for continue\n";
$flag = <STDIN>;
}
 
close(MICROARRAY);
 
print "\nThis is the end!";
#从GO数据库当中调取属于酿酒酵母的基因总数
sub get_N{
use DBI;
$username = 'go_select';$password = 'amigo';$database = 'go_latest';
$hostname = 'mysql.ebi.ac.uk';$port='4085';
$dbh = DBI->connect("dbi:mysql:database=$database;" .
"host=$hostname;port=$port", $username, $password) or exit(1);
#count(*)统计出有多少条记录,把gene_product数据表和species数据表以species_id相连,查找到所有符合种属为酿酒酵母的记录条数。
$SQL = "SELECT count(*) AS n".
" FROM gene_product AS p".
" INNER JOIN species AS s ON (p.species_id=s.id)".
" WHERE s.genus='Saccharomyces'".
" AND s.species='cerevisiae'";
$sth = $dbh->prepare($SQL);
$sth->execute();
$sth->bind_columns(\$sample_no);
while ($sth->fetch()){
$sample_No=$sample_no;
}
$sth->finish();
$dbh->disconnect();
return $sample_No;
}
#查询,计算,并显示统计信息
sub go_sql{
use DBI;
my ($up_or_down,$N,$n,%list_sql)=@_;
$username = 'go_select';$password = 'amigo';$database = 'go_latest';
$hostname = 'mysql.ebi.ac.uk';$port='4085';
$dbh = DBI->connect("dbi:mysql:database=$database;" .
"host=$hostname;port=$port", $username, $password) or exit(1);
if($up_or_down){
print "\n\nup regulated genes\n";
}
else {
print "\n\ndown regulated genes\n";
}
print "\n\ngo_id\tm\tk\tN\tn\tp_value\tterm_type\n";
#对于记录中每条数据,都可以提取到两个数据,一是go_id,二是该go_id下有多少表达差异的基因。
while (($go_id_holder,$record) = each(%list_sql)){
#调取该go_id下属于酿酒酵母的基因总数,同时调取该go_id的注释。
$SQL= "SELECT gene_product_count.product_count, term.name".
" FROM term ".
" INNER JOIN gene_product_count ON (term.id=gene_product_count.term_id)".
" INNER JOIN species ON (gene_product_count.species_id=species.id)".
" WHERE term.acc='$go_id_holder'".
" AND species.genus='Saccharomyces'".
" AND species.species='cerevisiae'";
 
$sth = $dbh->prepare($SQL);
$sth->execute();
$sth->bind_columns(\$product_count,\$term_type);
while($sth->fetch()){
#计算p_value。这个值的计算方法还不确认是否正确,但我们实验室的人以前都是用的这个公式,不过与文献方法还是有差异。实验室的人用的matlab内的库函数实现的,而且调用的是本地数据库,这两方面我都必须换成自己的方式。
$p_value=&phyper($N,$product_count, $n, $record);
print "$go_id_holder\t$product_count\t$record\t$N\t$n\t$p_value\t$term_type\n";
}
$sth->finish();
}
$dbh->disconnect();
}
#超几何统计。超几何统计的原理可以百度google一下。
sub phyper{
($N,$m,$n,$k) = @_;
@x = (0, $m+1, $k+1, $N-$m+1, $n-$k+1, $N+1, $n+1);
if($x[2]>$x[1]){
$e=0;
}
else{
$e = &gammln($x[1]) - &gammln($x[2]) - &gammln($x[1]-$x[2]+1)
+ &gammln($x[3]) - &gammln($x[4]) - &gammln($x[3]-$x[4]+1)
- &gammln($x[5]) + &gammln($x[6]) + &gammln($x[5]-$x[6]+1);
$e = exp($e);
}
return $e;
}
#用于计算大数阶乘的近似算法,原程序来自numerical recipes in C第六单第一节。
sub gammln
{
my ($xx) = @_;
my ($x,$y,$tmp,$ser,$eye)=(0,0,0,0,0);
my @cof = (76.18009172947146,-86.50532032941677,
24.01409824083091,-1.231739572450155,
0.1208650973866179e-1,-0.5395239384953e-5);
$y=$x=$xx;
$tmp = $x+5.5;
$tmp -= ($x+0.5)*log($tmp);
$ser=1.000000000190015;
for ($eye=0;$eye<=5;$eye++){
$ser += $cof[$eye]/++$y;
}
return -$tmp+log(2.506628746310005*$ser/$x);
}

运行以后,得到结果:

please input the file name
c:\\testfile.txt
there are 5716 entries
there are 7 columns
please set the columns you want to compare
the first column is:
1
the second column is:
3
gene_name    exp_ratio
YAR066W      1.4417
YBR001C      3.9417
YCR099C     -2.1969
YDL220C     -1.1541
YDR001C      1.3240
YDR169C-A     -1.2401
YDR194W-A      2.8262
YDR536W     -2.4489
YEL069C      1.0892
YFL067W      1.4406
YFL056C     -2.1925
YGR068C      1.5688
YGR126W      2.7582
YHR022C-A     -3.3468
YHR189W     -1.3869
YHR213W      2.3182
YHR213W-A     -1.3413
YJR115W      2.3918
YKL106C-A     -1.1337
YLR234W      1.1069
YLR385C     -1.1090
YML100W-A     -3.0568
YMR063W     -3.0092
YMR084W      1.9071
YMR242W-A      1.5251
YMR244W      1.2473
YMR317W     -1.3489
YNL034W      1.0815
YNL018C     -1.2909
YNR075W      1.1509
YOL164W-A     -1.4283
YOL141W     -1.5695
YOR202W      1.0338
YOR214C     -1.0922
YPL200W     -1.4306
YPL021W      3.3656
totally 36 genes affected
18 genes up regulated. And 18 genes down regulated.
the GO info list as follows
go_id    gene_name
GO:0000004    YAR066W
GO:0000004    YCR099C
GO:0000004    YDR169C-A
GO:0000004    YDR194W-A
GO:0000004    YFL067W
GO:0000004    YGR068C
GO:0000004    YGR126W
GO:0000004    YHR022C-A
GO:0000004    YHR213W
GO:0000004    YHR213W-A
GO:0000004    YJR115W
GO:0000004    YKL106C-A
GO:0000004    YML100W-A
GO:0000004    YMR084W
GO:0000004    YMR242W-A
GO:0000004    YMR244W
GO:0000004    YMR317W
GO:0000004    YNL034W
GO:0000004    YNL018C
GO:0000004    YOL164W-A
GO:0000004    YOL141W
GO:0000004    YOR214C
GO:0000018    YLR234W
GO:0000105    YOR202W
GO:0000723    YDL220C
GO:0000783    YDL220C
GO:0000812    YLR385C
GO:0003697    YDL220C
GO:0003880    YOL141W
GO:0003917    YLR234W
GO:0004045    YHR189W
GO:0004045    YHR189W
GO:0004424    YOR202W
GO:0004555    YBR001C
GO:0004555    YDR001C
GO:0005215    YDR536W
GO:0005353    YEL069C
GO:0005355    YEL069C
GO:0005554    YAR066W
GO:0005554    YCR099C
GO:0005554    YDR169C-A
GO:0005554    YDR194W-A
GO:0005554    YFL067W
GO:0005554    YGR068C
GO:0005554    YGR126W
GO:0005554    YHR022C-A
GO:0005554    YHR213W
GO:0005554    YHR213W-A
GO:0005554    YJR115W
GO:0005554    YKL106C-A
GO:0005554    YLR385C
GO:0005554    YML100W-A
GO:0005554    YMR063W
GO:0005554    YMR084W
GO:0005554    YMR242W-A
GO:0005554    YMR244W
GO:0005554    YMR317W
GO:0005554    YNL034W
GO:0005554    YNL018C
GO:0005554    YNR075W
GO:0005554    YOL164W-A
GO:0005554    YOR214C
GO:0005554    YPL200W
GO:0005554    YPL021W
GO:0005623    YOR202W
GO:0005634    YGR126W
GO:0005634    YLR234W
GO:0005634    YLR385C
GO:0005635    YNR075W
GO:0005737    YBR001C
GO:0005737    YDR001C
GO:0005737    YGR126W
GO:0005737    YNR075W
GO:0005737    YOL141W
GO:0005739    YBR001C
GO:0005739    YHR189W
GO:0005783    YNR075W
GO:0005789    YPL200W
GO:0005829    YDR001C
GO:0005886    YEL069C
GO:0005993    YBR001C
GO:0005993    YDR001C
GO:0006081    YFL056C
GO:0006338    YLR385C
GO:0006412    YHR189W
GO:0006412    YHR189W
GO:0006810    YDR536W
GO:0006897    YNR075W
GO:0006950    YBR001C
GO:0006950    YDR001C
GO:0007004    YLR234W
GO:0007124    YPL021W
GO:0007131    YLR234W
GO:0007151    YMR063W
GO:0008372    YAR066W
GO:0008372    YCR099C
GO:0008372    YDR169C-A
GO:0008372    YDR194W-A
GO:0008372    YFL067W
GO:0008372    YFL056C
GO:0008372    YGR068C
GO:0008372    YHR022C-A
GO:0008372    YHR213W
GO:0008372    YHR213W-A
GO:0008372    YJR115W
GO:0008372    YKL106C-A
GO:0008372    YML100W-A
GO:0008372    YMR063W
GO:0008372    YMR084W
GO:0008372    YMR242W-A
GO:0008372    YMR244W
GO:0008372    YMR317W
GO:0008372    YNL034W
GO:0008372    YNL018C
GO:0008372    YOL164W-A
GO:0008372    YPL021W
GO:0008645    YEL069C
GO:0009277    YOR214C
GO:0015578    YEL069C
GO:0016020    YDR536W
GO:0016233    YDL220C
GO:0018456    YFL056C
GO:0045132    YPL200W

up regulated genes

go_id    m    k    N    n    p_value    term_type
GO:0007124    66    1    6353    36    0.260630529090585    pseudohyphal growth
GO:0005783    362    1    6353    36    0.263091750103096    endoplasmic reticulum
GO:0003917    2    1    6353    36    0.0112693357798244    DNA topoisomerase type I activity
GO:0005355    16    1    6353    36    0.0834322925649666    glucose transmembrane transporter activity
GO:0005635    119    1    6353    36    0.349136126864685    nuclear envelope
GO:0006950    725    2    6353    36    0.133013426246829    response to stress
GO:0005737    3767    4    6353    36    2.13652063162925e-009    cytoplasm
GO:0005353    16    1    6353    36    0.0834322925649666    fructose transmembrane transporter activity
GO:0007131    47    1    6353    36    0.20633401909837    reciprocal meiotic recombination
GO:0000105    9    1    6353    36    0.0487849073175423    histidine biosynthetic process
GO:0005623    5503    1    6353    36    4.51018920013834e-030    cell
GO:0005829    669    1    6353    36    0.0767188678272437    cytosol
GO:0005993    3    2    6353    36    9.31557003089633e-005    trehalose catabolic process
GO:0004424    1    1    6353    36    0.00566595370720149    imidazoleglycerol-phosphate dehydratase activity
GO:0006897    86    1    6353    36    0.303680261410895    endocytosis
GO:0007004    24    1    6353    36    0.119721549380159    telomere maintenance via telomerase
GO:0005739    1101    1    6353    36    0.00787011016149571    mitochondrion
GO:0005886    281    1    6353    36    0.327203589275347    plasma membrane
GO:0005634    2047    2    6353    36    0.000114542934231066    nucleus
GO:0004555    3    2    6353    36    9.31557003089633e-005    alpha,alpha-trehalase activity
GO:0015578    16    1    6353    36    0.0834322925649666    mannose transmembrane transporter activity
GO:0008645    25    1    6353    36    0.124020300386809    hexose transport
GO:0000018    18    1    6353    36    0.092827278325293    regulation of DNA recombination

down regulated genes

go_id    m    k    N    n    p_value    term_type
GO:0000812    13    1    6353    36    0.0689241803092536    Swr1 complex
GO:0016020    1132    1    6353    36    0.00657295481861142    membrane
GO:0006338    82    1    6353    36    0.296112705758374    chromatin remodeling
GO:0018456    8    1    6353    36    0.0436049867229687    aryl-alcohol dehydrogenase activity
GO:0005737    3767    1    6353    36    4.07154801409779e-013    cytoplasm
GO:0016233    6    1    6353    36    0.0330676805344116    telomere capping
GO:0003697    35    1    6353    36    0.164254269864872    single-stranded DNA binding
GO:0006081    40    1    6353    36    0.182575358417541    cellular aldehyde metabolic process
GO:0005739    1101    1    6353    36    0.00787011016149571    mitochondrion
GO:0003880    3    1    6353    36    0.0168106661694704    C-terminal protein carboxyl methyltransferase activity
GO:0006810    1041    1    6353    36    0.0110884183808627    transport
GO:0005634    2047    1    6353    36    1.36670595958824e-005    nucleus
GO:0004045    2    2    6353    36    3.12197315893134e-005    aminoacyl-tRNA hydrolase activity
GO:0009277    93    1    6353    36    0.315767321253254    fungal-type cell wall
GO:0006412    1084    2    6353    36    0.0314220600943508    translation
GO:0000783    10    1    6353    36    0.0539063140510967    nuclear telomere cap complex
GO:0005789    142    1    6353    36    0.365929789993477    endoplasmic reticulum membrane
GO:0000723    69    1    6353    36    0.267951287295916    telomere maintenance
GO:0005215    387    1    6353    36    0.242865439805322    transporter activity
GO:0045132    27    1    6353    36    0.132464273446732    meiotic chromosome segregation

0 for exit
1 for continue

This is the end!

从结果上来看,似乎是都显示出来了,该算的也算出来了。嗯,大约可以了。
GO数据库是一个DAG(directed acyclic graph)结构数据库,下一步的学习就是把如何把各个go_id用树状图形象化显示。

Leave a Reply

  

  

  

%d 博主赞过: