|||
我们知道在mRNA的genebank格式的序列文件中包括了非常多的信息,怎样在里面提取想要的信息呢,现在有一个sequence.gb文件,里面包括了二千条拟南芥mRNA的序列信息,现要提取每条mRNA的accession、mRNA长度、description、gene_name、Gene_id、locus_tag、CDS长度、CDS序列、protein_id、product、氨基酸序列、mRNA序列,花了我整整一天的时间终于完成了程序(记得刚进实验室时我花了一个月的时间,实验室好多同学帮忙,终于提取了一千个序列的信息,悄悄的告诉你——那时是笨笨的一个个复制),现来看一下运行结果:
#说明:那些碱基没有对齐是复制过来格式有点变化,在我的输出文件下是对齐的,这个空间太烂了,贴图太不方便,段落格式也不能设,更不要说加文本框和背景了,没办法,在自己主页没有建好之前,勉强用着吧。
Accession:NM_126190.1
mRNA_length:1464
Description:Arabidopsis thaliana ribose 5-phosphate isomerase A (RPI2) mRNA, complete cds.
Gene_id:814657
Gene:RPI2
Locus_tag: AT2G01290
CDS_length:798
CDS_seq:
atggcgcttgcgtatgatcctctcttcattacatcggacaaatctttgtc
cgcctttgatgttgcctcttcaccgcctcagcccatgaatttaacacaag
acgagctcaaacgtatcgccgcttacaaagccgtggaattcgtcgagtcc
ggcatggttctcggtctcggaaccggctccaccgccaaacacgccgtcga
ccgaatcggcgagcttctccgtcaaggcaaactcgagaacatcgttggta
taccaacgtctaagaagacgcaggagcaggctctgtctctcggcatccct
ctctctgatctcgacgcacaccctgtcatcgatctctccatcgacggcgc
agacgaggtcgacccttttctcaacctggtaaaaggcagaggagggtctc
tgcttcgagagaagatgatcgaaggagcttctaagaagtttgtggtgatt
gttgacgactccaagatggtgaagcatattggtggaagcaagcttgctct
tccggtggagatagtgcccttttgctggaagttcacggcggagaagctcc
ggagtctgctggaaggatacgggtgtgaagcgaatcttcggttgggagag
aaagggaaggcctttgtgacagacaatgggaattacatagtggatatgca
tgtggaagaggatatgggagatttgggagcagtgagtgatgcgattctga
ggctaccgggagtagtggaacatgggatgtttctggatatggcttctact
gttatcattgcaggtgagcttggtgttaagatcaagaacaaacactga
Protein_id: NP_178238.1
AA_seq:
MALAYDPLFITSDKSLSAFDVASSPPQPMNLTQDELKRIAAYKAVEFVES
GMVLGLGTGSTAKHAVDRIGELLRQGKLENIVGIPTSKKTQEQALSLGIP
LSDLDAHPVIDLSIDGADEVDPFLNLVKGRGGSLLREKMIEGASKKFVVI
VDDSKMVKHIGGSKLALPVEIVPFCWKFTAEKLRSLLEGYGCEANLRLGE
KGKAFVTDNGNYIVDMHVEEDMGDLGAVSDAILRLPGVVEHGMFLDMAST
VIIAGELGVKIKNKH
mRNA_seq:
aaaacacaaaccctatcttttcttcttcttcttcttcgttcagtcttcac
tttcttttcttttctccgccaccacacaaatacattcttcattctttgta
actttcacgctattcttcctcaatatattcattcgacctcctctcttcat
tcttaatttctcaaaaatcttatctttctccctcatttttcccgtcgagg
gttccttaatctccttcctaggaacgtttcatcgatttcatatttgcttt
ccttaatccacaacgtttcttccttcgtttttttctgcaattttgttttc
ctttacaccaaattcatctcttagtcttgtccaattttcgattttttttc
cgacaatcctgattgtttgtttttttatctgtcgggatttcgaatggcgc
ttgcgtatgatcctctcttcattacatcggacaaatctttgtccgccttt
gatgttgcctcttcaccgcctcagcccatgaatttaacacaagacgagct
caaacgtatcgccgcttacaaagccgtggaattcgtcgagtccggcatgg
ttctcggtctcggaaccggctccaccgccaaacacgccgtcgaccgaatc
ggcgagcttctccgtcaaggcaaactcgagaacatcgttggtataccaac
gtctaagaagacgcaggagcaggctctgtctctcggcatccctctctctg
atctcgacgcacaccctgtcatcgatctctccatcgacggcgcagacgag
gtcgacccttttctcaacctggtaaaaggcagaggagggtctctgcttcg
agagaagatgatcgaaggagcttctaagaagtttgtggtgattgttgacg
actccaagatggtgaagcatattggtggaagcaagcttgctcttccggtg
gagatagtgcccttttgctggaagttcacggcggagaagctccggagtct
gctggaaggatacgggtgtgaagcgaatcttcggttgggagagaaaggga
aggcctttgtgacagacaatgggaattacatagtggatatgcatgtggaa
gaggatatgggagatttgggagcagtgagtgatgcgattctgaggctacc
gggagtagtggaacatgggatgtttctggatatggcttctactgttatca
ttgcaggtgagcttggtgttaagatcaagaacaaacactgaaacacacac
cacacctcttctcgttgattttggcctttatgtattaggagacagaggat
tatggtttgctctatggtttagatgatatgagattaggttttggatgatg
tcactgtgaggaggatgaatgactttcttttgacagagttttggatgcat
tgacatcgtcgtcatcatcatcaatcaatcttgtaaacagcattttgctt
cttcattgcctttgttgtgtttgaatgctcttcttttactactgggcctg
tgttgggctttgct
. . .
. . .
. . .
程序代码如下:
#!/usr/bin/perl
use strict;
use warnings;
open F,"E:/sequence.gb";
my @a=<F>;
my $i=0;
my @accession;
my @mRNA_length;
my @description;
my $temp;
my $temp1;
my $temp2;
my $temp3;
my $genename;
my $locus_tag_name;
my @b;
my @CDS_length;
my @product;
my @protein_id;
my @AA_seq;
my @CDS_seq;
my @mRNA_seq;
my @gene_name;
my @tag_name;
my $start;
my $gene_id;
my @Gene_id;
while($i<=$#a)
{
if($a[$i]=~/^LOCUS(\s){7}/)
{
$a[$i]=~s/.*NM_(\d)+//;
$a[$i]=~s/(\s)+//g;
$a[$i]=~s/bp.*//s;
push @mRNA_length,$a[$i];$i++;
}elsif($a[$i]=~/DEFINITION/)
{
$a[$i]=~s/DEFINITION //;
$a[$i]=~s/\n//;
$temp1=$a[$i];$i++;
unless($a[$i]=~/ACCESSION/)
{
$a[$i]=~s/(\s){11}//;
$a[$i]=~s/(\s)+$//;
$a[$i]=~s/\n//;
$temp2=$a[$i];
$temp1=$temp1.$temp2;
push @description,$temp1;$i++;
}else
{
push @description,$temp1;$i++;
}
$temp1='';$temp2='';
}
elsif($a[$i]=~/VERSION/)
{
$a[$i]=~s/VERSION//;
$a[$i]=~s/GI.*//s;
$a[$i]=~s/(\s)+//g;
push @accession,$a[$i];$i++;
}
elsif($a[$i]=~/\/gene=/)
{
$a[$i]=~s/.*=//;
$a[$i]=~s/\"//g;
$a[$i]=~s/(\s)+//g;
$genename=$a[$i];$i++;
}elsif($a[$i]=~/\/locus_tag=/)
{
$a[$i]=~s/.*=//;
$a[$i]=~s/\"//g;
$a[$i]=~s/(\s)+//g;
$locus_tag_name=$a[$i];$i++;
}elsif($a[$i]=~/\/db_xref=\"GeneID:/)
{
$a[$i]=~s/\/db_xref=\"GeneID://;
$a[$i]=~s/\"//g;
$a[$i]=~s/(\s)+//g;
$gene_id=$a[$i];$i++;
}
elsif($a[$i]=~/^(\s)+CDS(\s)+(\d)+/)
{
$a[$i]=~s/CDS//;
$a[$i]=~s/(\s)+//g;
$a[$i]=~s/\n//;
@b=split/\.\./,$a[$i];
$temp=int($b[1])-int($b[0])+1;
$start=int($b[0]);
push @CDS_length,$temp;
@b=();$i++;
}
elsif($a[$i]=~/\/product=\"/)
{
$a[$i]=~s/\/product=//;
$a[$i]=~s/\"//g;
$a[$i]=~s/^(\s)+//;
$a[$i]=~s/(\s)+$//;
$a[$i]=~s/\n//;
$temp1=$a[$i];$i++;
while(!($a[$i]=~/\/protein_id=/))
{
$a[$i]=~s/^(\s){21}//;
$a[$i]=~s/(\s)+$//;
$a[$i]=~s/\n//;
$temp2=$a[$i];
$temp1=$temp1.$temp2;
$i++;
}
push @product,$temp1;
$a[$i]=~s/\/protein_id=//;
$a[$i]=~s/\"//g;
$a[$i]=~s/(\s)+//g;
$a[$i]=~s/\n//;
push @protein_id,$a[$i];$i++;
$temp1='';$temp2='';
}
elsif($a[$i]=~/\/translation=/)
{
$a[$i]=~s/\/translation=//;
$a[$i]=~s/\"//g;
$a[$i]=~s/^(\s)+//;
$a[$i]=~s/(\s)+$//;
$a[$i]=~s/\n//;
$temp1=$a[$i];$i++;
while(!($a[$i]=~/ORIGIN/))
{
$a[$i]=~s/(\s)+//g;
$a[$i]=~s/\n//;
$a[$i]=~s/\"//;
$temp2=$a[$i];
$temp1=$temp1.$temp2;
$i++;
}
push @AA_seq,$temp1;
$temp1='';$temp2='';
}
elsif($a[$i]=~/^ORIGIN(\s){6}/)
{
$i++;
while(!($a[$i]=~/\/\//))
{
$a[$i]=~s/(\s)+//g;
$a[$i]=~s/(\d)+//g;
$a[$i]=~s/\n//;
$temp2=$a[$i];
$temp1=$temp1.$temp2;
$i++;
}
$temp3=substr($temp1,$start-1,$temp);
push @CDS_seq,$temp3;
push @mRNA_seq,$temp1;
$temp1='';$temp2='';
}
elsif($a[$i]=~/\/\//)
{
push @gene_name,$genename;
push @tag_name,$locus_tag_name;
push @Gene_id,$gene_id;
$genename='';$locus_tag_name='';$gene_id='';
$i++;
}
else
{
$a[$i]=~s/.*//sg;$i++;
}
}
open IN,">E:/seq_";
my $m;
my $str;
for($i=0;$i<=$#accession;$i++)
{
print IN "Accession:\t$accession[$i]\nmRNA_length:\t$mRNA_length[$i]\n";
print IN "Description:\t$description[$i]\n";
print IN "Gene_id:\t$Gene_id[$i]\nGene:\t$gene_name[$i]\n";
print IN "Locus_tag:\t $tag_name[$i]\nCDS_length:\t$CDS_length[$i]\n";
print IN "CDS_seq:\n";
$m=0;
$str='';
while($m<=length($CDS_seq[$i]))
{
$str=substr($CDS_seq[$i],$m,50);
$m=$m+50;
print IN "$str\n";
}
print IN "Protein_id:\t $protein_id[$i]\n";
$m=0;
$str='';
print IN "AA_seq:\n";
while($m<=length($AA_seq[$i]))
{
$str=substr($AA_seq[$i],$m,50);
$m=$m+50;
print IN "$str\n";
}
$m=0;
$str='';
print IN "mRNA_seq:\n";
while($m<=length($mRNA_seq[$i]))
{
$str=substr($mRNA_seq[$i],$m,50);
$m=$m+50;
print IN "$strn";
}
print IN "n";
}
是不是觉得好长,其实很多都是差不多的,大致思路是将序列文件赋予一个数组,然后遍历这个数组,用正则表达式根据genebank那些特例的格式筛选想要的信息。下一部的计划是将这些结果输入到Excel表中,更进一步的计划是建一个数据库,哈哈,期待中……
Archiver|手机版|科学网 ( 京ICP备07017567号-12 )
GMT+8, 2024-5-22 09:22
Powered by ScienceNet.cn
Copyright © 2007- 中国科学报社