这次要对数据进行处理了。原代码如下:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 | #!/usr/bin/perl #从键盘输入数据文件名并打开文件。其中<STDIN>指的就是标准输入设备。 print "please input the file name\n"; $filename = <STDIN>; open(MICROARRAY, $filename)||die ("Could not open file"); #将文件一次性读入内存。 @whole_file = <MICROARRAY>; #统计文件一共有多少行 $rows = scalar(@whole_file); print "there are $rows entries\n"; #统计文件一共有多少以TAB为间隔的列。 $pattern ="\t"; @cells = split(/$pattern/, $whole_file[0]); $cols = scalar(@cells); print "there are $cols columns\n"; #设置计入统计的表达差异的比例。 $times = 1; #设置脏标记用来退出程序。 $flag = 1; while($flag==1){ #用first_col和second_col来设置需要比较的列。 $first_col=0; $second_col=0; print "please set the columns you want to compare\n"; while ($first_col<=0 || $first_col>=$cols){ print "the first column is:\n"; $first_col = <STDIN>; } $first_col--; while ($second_col<=0 || $second_col>=$cols){ print "the second column is:\n"; $second_col = <STDIN>; } $second_col--; # $up_reg = 0; $down_reg = 0; @total_hash = (); @go_hash = (); $j = 0; print "gene_name\texp_ratio\n"; #对文件进行逐行处理。 for($i=0;$i<$rows;$i++){ #对每一行的内容按TAB间隔存到数组words当中去。 @words = split(/$pattern/, $whole_file[$i]); next if($words[$first_col]<=0 || $words[$second_col]<=0); $ratio = log($words[$first_col])- log( $words[$second_col]); #比较需要对比的两数据的差异,当差异大于两倍时进行处理。 if(abs($ratio)>=$times){ #对最后一列的注释进行处理,以空隔为分隔把内容存到description数组里去。 #当数据差异大于两倍时分为上调和下调分别进行统计。 @description = split(/\s+/,$words[$cols-1]); my %reg_gene = ("gene_name"=>$description[2],"exp_ratio"=>$ratio); push @total_hash, \%reg_gene; printf ("%s\t%8.4f\n",$reg_gene{gene_name},$reg_gene{exp_ratio}); $j++; $up_reg++ if($ratio>=$times); $down_reg++ if($ratio<=-$times); #查找GO信息,提取goid,并把每一个goid和对应的基因名存入哈希表go_hash当中去。 while($words[$cols-1]=~/\[goid\s(GO:\w+)\]/g){ $go_id = $1; my %go = ("gene_name"=>$description[2],"exp_ratio"=>$ratio,"go_id"=>$go_id); push @go_hash, \%go; } } } print "totally $j genes affected\n"; print $up_reg." genes up regulated. And ".$down_reg." genes down regulated.\n"; print "the GO info list as follows\n"; print "go_id\tgene_name\n"; #对存有go_id的哈希表go_hash进行排序,便于统计每个go_id一共有多少基因关联。 @go_sort = sort {$a->{go_id} cmp $b->{go_id}} @go_hash; #统计一共有多少个go_id。 $sample_n = scalar(@go_sort); #从GO数据库调取库中酿酒酵母的基因总数 $sample_N = &get_N(); #准备上调基因和下调基因哈希表。 %go_list_up = (); %go_list_down = (); #对每一个go_id进行统计 for($i=0;$i<$sample_n;$i++){ $tmp = $go_sort[$i]; %path_t = %$tmp; #如果是上调,则把它压入上调的哈希统计表当中去,反之亦然。 $go_list_up{$path_t{go_id}} +=1 if ($path_t{exp_ratio}>=$times); $go_list_down{$path_t{go_id}} +=1 if ($path_t{exp_ratio}<=-$times); print "$path_t{go_id}\t$path_t{gene_name}\n"; } #计算并显示所有上调和下调的GO统计。 &go_sql(1,$sample_N,$j,%go_list_up); &go_sql(0,$sample_N,$j,%go_list_down); print "\n\n0 for exit\n1 for continue\n"; $flag = <STDIN>; } close(MICROARRAY); print "\nThis is the end!"; #从GO数据库当中调取属于酿酒酵母的基因总数 sub get_N{ use DBI; $username = 'go_select';$password = 'amigo';$database = 'go_latest'; $hostname = 'mysql.ebi.ac.uk';$port='4085'; $dbh = DBI->connect("dbi:mysql:database=$database;" . "host=$hostname;port=$port", $username, $password) or exit(1); #count(*)统计出有多少条记录,把gene_product数据表和species数据表以species_id相连,查找到所有符合种属为酿酒酵母的记录条数。 $SQL = "SELECT count(*) AS n". " FROM gene_product AS p". " INNER JOIN species AS s ON (p.species_id=s.id)". " WHERE s.genus='Saccharomyces'". " AND s.species='cerevisiae'"; $sth = $dbh->prepare($SQL); $sth->execute(); $sth->bind_columns(\$sample_no); while ($sth->fetch()){ $sample_No=$sample_no; } $sth->finish(); $dbh->disconnect(); return $sample_No; } #查询,计算,并显示统计信息 sub go_sql{ use DBI; my ($up_or_down,$N,$n,%list_sql)=@_; $username = 'go_select';$password = 'amigo';$database = 'go_latest'; $hostname = 'mysql.ebi.ac.uk';$port='4085'; $dbh = DBI->connect("dbi:mysql:database=$database;" . "host=$hostname;port=$port", $username, $password) or exit(1); if($up_or_down){ print "\n\nup regulated genes\n"; } else { print "\n\ndown regulated genes\n"; } print "\n\ngo_id\tm\tk\tN\tn\tp_value\tterm_type\n"; #对于记录中每条数据,都可以提取到两个数据,一是go_id,二是该go_id下有多少表达差异的基因。 while (($go_id_holder,$record) = each(%list_sql)){ #调取该go_id下属于酿酒酵母的基因总数,同时调取该go_id的注释。 $SQL= "SELECT gene_product_count.product_count, term.name". " FROM term ". " INNER JOIN gene_product_count ON (term.id=gene_product_count.term_id)". " INNER JOIN species ON (gene_product_count.species_id=species.id)". " WHERE term.acc='$go_id_holder'". " AND species.genus='Saccharomyces'". " AND species.species='cerevisiae'"; $sth = $dbh->prepare($SQL); $sth->execute(); $sth->bind_columns(\$product_count,\$term_type); while($sth->fetch()){ #计算p_value。这个值的计算方法还不确认是否正确,但我们实验室的人以前都是用的这个公式,不过与文献方法还是有差异。实验室的人用的matlab内的库函数实现的,而且调用的是本地数据库,这两方面我都必须换成自己的方式。 $p_value=&phyper($N,$product_count, $n, $record); print "$go_id_holder\t$product_count\t$record\t$N\t$n\t$p_value\t$term_type\n"; } $sth->finish(); } $dbh->disconnect(); } #超几何统计。超几何统计的原理可以百度google一下。 sub phyper{ ($N,$m,$n,$k) = @_; @x = (0, $m+1, $k+1, $N-$m+1, $n-$k+1, $N+1, $n+1); if($x[2]>$x[1]){ $e=0; } else{ $e = &gammln($x[1]) - &gammln($x[2]) - &gammln($x[1]-$x[2]+1) + &gammln($x[3]) - &gammln($x[4]) - &gammln($x[3]-$x[4]+1) - &gammln($x[5]) + &gammln($x[6]) + &gammln($x[5]-$x[6]+1); $e = exp($e); } return $e; } #用于计算大数阶乘的近似算法,原程序来自numerical recipes in C第六单第一节。 sub gammln { my ($xx) = @_; my ($x,$y,$tmp,$ser,$eye)=(0,0,0,0,0); my @cof = (76.18009172947146,-86.50532032941677, 24.01409824083091,-1.231739572450155, 0.1208650973866179e-1,-0.5395239384953e-5); $y=$x=$xx; $tmp = $x+5.5; $tmp -= ($x+0.5)*log($tmp); $ser=1.000000000190015; for ($eye=0;$eye<=5;$eye++){ $ser += $cof[$eye]/++$y; } return -$tmp+log(2.506628746310005*$ser/$x); } |
运行以后,得到结果:
c:\\testfile.txt
there are 5716 entries
there are 7 columns
please set the columns you want to compare
the first column is:
1
the second column is:
3
gene_name exp_ratio
YAR066W 1.4417
YBR001C 3.9417
YCR099C -2.1969
YDL220C -1.1541
YDR001C 1.3240
YDR169C-A -1.2401
YDR194W-A 2.8262
YDR536W -2.4489
YEL069C 1.0892
YFL067W 1.4406
YFL056C -2.1925
YGR068C 1.5688
YGR126W 2.7582
YHR022C-A -3.3468
YHR189W -1.3869
YHR213W 2.3182
YHR213W-A -1.3413
YJR115W 2.3918
YKL106C-A -1.1337
YLR234W 1.1069
YLR385C -1.1090
YML100W-A -3.0568
YMR063W -3.0092
YMR084W 1.9071
YMR242W-A 1.5251
YMR244W 1.2473
YMR317W -1.3489
YNL034W 1.0815
YNL018C -1.2909
YNR075W 1.1509
YOL164W-A -1.4283
YOL141W -1.5695
YOR202W 1.0338
YOR214C -1.0922
YPL200W -1.4306
YPL021W 3.3656
totally 36 genes affected
18 genes up regulated. And 18 genes down regulated.
the GO info list as follows
go_id gene_name
GO:0000004 YAR066W
GO:0000004 YCR099C
GO:0000004 YDR169C-A
GO:0000004 YDR194W-A
GO:0000004 YFL067W
GO:0000004 YGR068C
GO:0000004 YGR126W
GO:0000004 YHR022C-A
GO:0000004 YHR213W
GO:0000004 YHR213W-A
GO:0000004 YJR115W
GO:0000004 YKL106C-A
GO:0000004 YML100W-A
GO:0000004 YMR084W
GO:0000004 YMR242W-A
GO:0000004 YMR244W
GO:0000004 YMR317W
GO:0000004 YNL034W
GO:0000004 YNL018C
GO:0000004 YOL164W-A
GO:0000004 YOL141W
GO:0000004 YOR214C
GO:0000018 YLR234W
GO:0000105 YOR202W
GO:0000723 YDL220C
GO:0000783 YDL220C
GO:0000812 YLR385C
GO:0003697 YDL220C
GO:0003880 YOL141W
GO:0003917 YLR234W
GO:0004045 YHR189W
GO:0004045 YHR189W
GO:0004424 YOR202W
GO:0004555 YBR001C
GO:0004555 YDR001C
GO:0005215 YDR536W
GO:0005353 YEL069C
GO:0005355 YEL069C
GO:0005554 YAR066W
GO:0005554 YCR099C
GO:0005554 YDR169C-A
GO:0005554 YDR194W-A
GO:0005554 YFL067W
GO:0005554 YGR068C
GO:0005554 YGR126W
GO:0005554 YHR022C-A
GO:0005554 YHR213W
GO:0005554 YHR213W-A
GO:0005554 YJR115W
GO:0005554 YKL106C-A
GO:0005554 YLR385C
GO:0005554 YML100W-A
GO:0005554 YMR063W
GO:0005554 YMR084W
GO:0005554 YMR242W-A
GO:0005554 YMR244W
GO:0005554 YMR317W
GO:0005554 YNL034W
GO:0005554 YNL018C
GO:0005554 YNR075W
GO:0005554 YOL164W-A
GO:0005554 YOR214C
GO:0005554 YPL200W
GO:0005554 YPL021W
GO:0005623 YOR202W
GO:0005634 YGR126W
GO:0005634 YLR234W
GO:0005634 YLR385C
GO:0005635 YNR075W
GO:0005737 YBR001C
GO:0005737 YDR001C
GO:0005737 YGR126W
GO:0005737 YNR075W
GO:0005737 YOL141W
GO:0005739 YBR001C
GO:0005739 YHR189W
GO:0005783 YNR075W
GO:0005789 YPL200W
GO:0005829 YDR001C
GO:0005886 YEL069C
GO:0005993 YBR001C
GO:0005993 YDR001C
GO:0006081 YFL056C
GO:0006338 YLR385C
GO:0006412 YHR189W
GO:0006412 YHR189W
GO:0006810 YDR536W
GO:0006897 YNR075W
GO:0006950 YBR001C
GO:0006950 YDR001C
GO:0007004 YLR234W
GO:0007124 YPL021W
GO:0007131 YLR234W
GO:0007151 YMR063W
GO:0008372 YAR066W
GO:0008372 YCR099C
GO:0008372 YDR169C-A
GO:0008372 YDR194W-A
GO:0008372 YFL067W
GO:0008372 YFL056C
GO:0008372 YGR068C
GO:0008372 YHR022C-A
GO:0008372 YHR213W
GO:0008372 YHR213W-A
GO:0008372 YJR115W
GO:0008372 YKL106C-A
GO:0008372 YML100W-A
GO:0008372 YMR063W
GO:0008372 YMR084W
GO:0008372 YMR242W-A
GO:0008372 YMR244W
GO:0008372 YMR317W
GO:0008372 YNL034W
GO:0008372 YNL018C
GO:0008372 YOL164W-A
GO:0008372 YPL021W
GO:0008645 YEL069C
GO:0009277 YOR214C
GO:0015578 YEL069C
GO:0016020 YDR536W
GO:0016233 YDL220C
GO:0018456 YFL056C
GO:0045132 YPL200W
up regulated genes
go_id m k N n p_value term_type
GO:0007124 66 1 6353 36 0.260630529090585 pseudohyphal growth
GO:0005783 362 1 6353 36 0.263091750103096 endoplasmic reticulum
GO:0003917 2 1 6353 36 0.0112693357798244 DNA topoisomerase type I activity
GO:0005355 16 1 6353 36 0.0834322925649666 glucose transmembrane transporter activity
GO:0005635 119 1 6353 36 0.349136126864685 nuclear envelope
GO:0006950 725 2 6353 36 0.133013426246829 response to stress
GO:0005737 3767 4 6353 36 2.13652063162925e-009 cytoplasm
GO:0005353 16 1 6353 36 0.0834322925649666 fructose transmembrane transporter activity
GO:0007131 47 1 6353 36 0.20633401909837 reciprocal meiotic recombination
GO:0000105 9 1 6353 36 0.0487849073175423 histidine biosynthetic process
GO:0005623 5503 1 6353 36 4.51018920013834e-030 cell
GO:0005829 669 1 6353 36 0.0767188678272437 cytosol
GO:0005993 3 2 6353 36 9.31557003089633e-005 trehalose catabolic process
GO:0004424 1 1 6353 36 0.00566595370720149 imidazoleglycerol-phosphate dehydratase activity
GO:0006897 86 1 6353 36 0.303680261410895 endocytosis
GO:0007004 24 1 6353 36 0.119721549380159 telomere maintenance via telomerase
GO:0005739 1101 1 6353 36 0.00787011016149571 mitochondrion
GO:0005886 281 1 6353 36 0.327203589275347 plasma membrane
GO:0005634 2047 2 6353 36 0.000114542934231066 nucleus
GO:0004555 3 2 6353 36 9.31557003089633e-005 alpha,alpha-trehalase activity
GO:0015578 16 1 6353 36 0.0834322925649666 mannose transmembrane transporter activity
GO:0008645 25 1 6353 36 0.124020300386809 hexose transport
GO:0000018 18 1 6353 36 0.092827278325293 regulation of DNA recombination
down regulated genes
go_id m k N n p_value term_type
GO:0000812 13 1 6353 36 0.0689241803092536 Swr1 complex
GO:0016020 1132 1 6353 36 0.00657295481861142 membrane
GO:0006338 82 1 6353 36 0.296112705758374 chromatin remodeling
GO:0018456 8 1 6353 36 0.0436049867229687 aryl-alcohol dehydrogenase activity
GO:0005737 3767 1 6353 36 4.07154801409779e-013 cytoplasm
GO:0016233 6 1 6353 36 0.0330676805344116 telomere capping
GO:0003697 35 1 6353 36 0.164254269864872 single-stranded DNA binding
GO:0006081 40 1 6353 36 0.182575358417541 cellular aldehyde metabolic process
GO:0005739 1101 1 6353 36 0.00787011016149571 mitochondrion
GO:0003880 3 1 6353 36 0.0168106661694704 C-terminal protein carboxyl methyltransferase activity
GO:0006810 1041 1 6353 36 0.0110884183808627 transport
GO:0005634 2047 1 6353 36 1.36670595958824e-005 nucleus
GO:0004045 2 2 6353 36 3.12197315893134e-005 aminoacyl-tRNA hydrolase activity
GO:0009277 93 1 6353 36 0.315767321253254 fungal-type cell wall
GO:0006412 1084 2 6353 36 0.0314220600943508 translation
GO:0000783 10 1 6353 36 0.0539063140510967 nuclear telomere cap complex
GO:0005789 142 1 6353 36 0.365929789993477 endoplasmic reticulum membrane
GO:0000723 69 1 6353 36 0.267951287295916 telomere maintenance
GO:0005215 387 1 6353 36 0.242865439805322 transporter activity
GO:0045132 27 1 6353 36 0.132464273446732 meiotic chromosome segregation
0 for exit
1 for continue
This is the end!
从结果上来看,似乎是都显示出来了,该算的也算出来了。嗯,大约可以了。
GO数据库是一个DAG(directed acyclic graph)结构数据库,下一步的学习就是把如何把各个go_id用树状图形象化显示。