|
PDB 提供的API 返回的是XML。
我习惯使用scrapy的selector 进行解析,里面是非标准的DOM元素及其属性。
而selector的默认参数是解析HTML:HTML解析器会把元素名和属性名统一转成小写(例如 chemicalID 会变成 chemicalid),因此按原始大小写查询时会出现属性丢失的情况。
非常重要:解析的时候必须指定 type 参数(type='xml'),否则虽然不会报错,但会得到莫名其妙的结果。
**
sel = Selector(text=xmlstr,type='xml')
**
<?xml version='1.0' standalone='no' ?> <describeHet> <ligandInfo> <ligand chemicalID="2KQ" type="non-polymer" molecularWeight="869.623"> <chemicalName>(3S)-3-[2-[3-[[(2R)-4-[[[(2R,3S,4R,5R)-5-(6-aminopurin-9-yl)-4-oxidanyl-3-phosphonooxy-oxolan-2-yl]methoxy-oxidanyl-phosphoryl]oxy-oxidanyl-phosphoryl]oxy-3,3-dimethyl-2-oxidanyl-butanoyl]amino]propanoylamino]ethylsulfanyl]-3-oxidanyl-butanoic acid</chemicalName> <formula>C25 H42 N7 O19 P3 S</formula> <InChI>InChI=1S/C25H42N7O19P3S/c1-24(2,19(37)22(38)28-5-4-14(33)27-6-7-55-25(3,39)8-15(34)35)10-48-54(45,46)51-53(43,44)47-9-13-18(50-52(40,41)42)17(36)23(49-13)32-12-31-16-20(26)29-11-30-21(16)32/h11-13,17-19,23,36-37,39H,4-10H2,1-3H3,(H,27,33)(H,28,38)(H,34,35)(H,43,44)(H,45,46)(H2,26,29,30)(H2,40,41,42)/t13-,17-,18-,19+,23-,25+/m1/s1</InChI> <InChIKey>FIZPFHAKAOWUQH-RCVSXOLZSA-N</InChIKey> <smiles>C[C@](CC(=O)O)(O)SCCNC(=O)CCNC(=O)[C@@H](C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)n2cnc3c2ncnc3N)O)OP(=O)(O)O)O</smiles> </ligand> </ligandInfo> </describeHet>
--------------------------------
使用脚本如下
```python
#!python2
#coding: utf-8
"""Query the RCSB PDB ``describeHet`` REST service for a list of ligand IDs.

The ligand IDs are taken from the first whitespace-separated column of
``result.txt``.  For every distinct ID the XML answer is downloaded, dumped to
``test.xml`` (overwritten on each request, handy for debugging) and a few
values are extracted with a scrapy ``Selector``.

The selector MUST be created with ``type='xml'``: the service returns
non-standard XML elements and attributes, and the default HTML mode does not
preserve them as written.

Prints use the single-argument ``print(...)`` form so the script produces the
same output on Python 2 and Python 3.
"""
import io
import sys
from xml.etree import ElementTree as ET  # kept from the original script (currently unused)

# Example request: https://www.rcsb.org/pdb/rest/describeHet?chemicalID=NAG
DESCRIBE_HET_URL = 'https://www.rcsb.org/pdb/rest/describeHet'


def read_ligand_ids(path):
    """Return the distinct ligand IDs found in the first column of *path*.

    Blank or whitespace-only lines are skipped.  The result is sorted so the
    order of the requests is reproducible between runs.
    """
    ids = set()
    with io.open(path, encoding='utf-8') as fh:
        for line in fh:
            fields = line.split()
            if fields:
                ids.add(fields[0])
    return sorted(ids)


def fetch_ligand_xml(ligid, timeout=30):
    """Download the ``describeHet`` XML document for *ligid* as text.

    Raises ``requests.RequestException`` on network failure, timeout or a
    non-2xx HTTP status.
    """
    # Imported lazily so read_ligand_ids() stays usable without requests.
    import requests

    response = requests.get(
        DESCRIBE_HET_URL, params={'chemicalID': ligid}, timeout=timeout)
    response.raise_for_status()
    return response.text


def extract_ligand_fields(xmlstr, ligid):
    """Parse *xmlstr* as XML and return the values this script reports.

    The returned dict has the keys ``nodes``, ``formula``, ``mw``,
    ``id_lower``, ``type`` and ``id``; a value is ``None`` when the
    corresponding query matches nothing.
    """
    # Imported lazily so read_ligand_ids() stays usable without scrapy.
    from scrapy.selector import Selector

    # type='xml' is essential here; see the module docstring.
    sel = Selector(text=xmlstr, type='xml')
    return {
        'nodes': sel.xpath('//ligand'),
        'formula': sel.xpath('//ligand/formula/text()').extract_first(),
        'mw': sel.xpath(
            '//ligand[@chemicalID="%s"]/@molecularWeight' % ligid
        ).extract_first(),
        # Deliberately lower-case (non-standard spelling): XML attribute
        # names are case-sensitive, so this lookup shows what a query with
        # the wrong case returns.
        'id_lower': sel.xpath('//ligand/@chemicalid').extract_first(),
        'type': sel.css('ligand::attr(type)').extract_first(),
        'id': sel.css('ligand::attr(chemicalID)').extract_first(),
    }


def main(id_file="result.txt", dump_file="test.xml"):
    """Fetch and report every ligand listed in *id_file*."""
    import requests

    for ligid in read_ligand_ids(id_file):
        try:
            xmlstr = fetch_ligand_xml(ligid)
        except requests.RequestException as exc:
            # One failing ligand should not abort the whole batch.
            sys.stderr.write("skipping %s: %s\n" % (ligid, exc))
            continue

        # Explicit UTF-8 so non-ASCII text in the response cannot raise
        # UnicodeEncodeError when written on Python 2.
        with io.open(dump_file, 'w', encoding='utf-8') as fw:
            fw.write(xmlstr)

        fields = extract_ligand_fields(xmlstr, ligid)
        print(fields['nodes'])
        print(fields['mw'])
        print("ID: %s" % (fields['id_lower'],))
        print("type: %s" % (fields['type'],))
        print("id: %s" % (fields['id'],))
        # print("%s %s %s" % (ligid, fields['formula'], fields['mw']))


if __name__ == "__main__":
    main()
```
#!python2 #coding: utf-8 fh = open("result.txt") import requests from xml.etree import ElementTree as ET from scrapy.selector import Selector cscid=[] for line in fh: ''' ''' cscid.append(line.split()[0]) cscid=list(set(cscid)) # print '\n'.join(cscid) # https://www.rcsb.org/pdb/rest/describeHet?chemicalID=NAG # 非标准 for ligid in cscid: url = 'https://www.rcsb.org/pdb/rest/describeHet?chemicalID='+ligid result=requests.get(url) xmlstr=result.text # print xmlstr fw=open("test.xml",'w') fw.write(xmlstr) fw.close() sel = Selector(text=xmlstr,type='xml') print sel.xpath('//ligand') formula = sel.xpath('//ligand/formula/text()').extract_first() mw = sel.xpath('//ligand[@chemicalID="%s"]/@molecularWeight'%ligid).extract_first() print mw mw = sel.xpath('//ligand/@chemicalid').extract_first() print "ID:",mw mw = sel.css('ligand::attr(type)').extract_first() print "type:",mw mw = sel.css('ligand::attr(chemicalID)').extract_first() print "id:",mw # print ligid,formula,mw
```
Archiver|手机版|科学网 ( 京ICP备07017567号-12 )
GMT+8, 2024-10-20 01:01
Powered by ScienceNet.cn
Copyright © 2007- 中国科学报社